Chapter 2 Data

#save.image("./_dat/mal_ineq_dat5.RData")
load("./_dat/mal_ineq_dat5.RData")

2.1 DHS Data

rm(list = ls())
library(purrr)
library(tidyverse)
library(haven)
library(janitor)
library(labelled)
library(magrittr)

files <- dir(path = "./_dat/DHS/", pattern = "*.DTA" , recursive = T, full.names = T)

dat <- data_frame(filename = files) %>% 
  mutate(country = str_sub(filename, -12,-11),
         type = str_sub(filename, -10,-9),
         file_contents = map(.x = filename, .f = ~read_stata(.))) %>%
  filter(type=="PR")
  
# Function ========
info <- function(dat) {
  a <- tibble(var_names = colnames(dat),
                var_lab1 = var_label(dat, unlist = T),
                var_lab = as.character(var_lab1),
                var_lab_make = make_clean_names(var_lab)) %>%
    dplyr::select(-var_lab1)
  return(a)
}
# Function ========

# Variables summary/matches
d0 <- dat %>%  
  mutate(codebook = map(.x = file_contents, .f = ~info(.x))) %>%
  dplyr::select(country, codebook) %>%
  unnest(cols = c(codebook)) 

d1 <- d0 %>%
  group_by(var_lab_make) %>%
  count()

vars <- c("region","province","result_of_malaria_measurement", "wealth_index_factor_score_5_decimals", "wealth_index_factor_score_combined_5_decimals",
          "mothers_highest_educational_level", "primary_sampling_unit", "ultimate_area_unit", "household_sample_weight_6_decimals",
          "sample_weight", "result_of_malaria_rapid_test", "result_of_malaria_rapid_test_2", "result_of_rapid_test_of_malaria",
          "year_of_interview", "cluster_number",
          "age_of_household_members") # update re-analysis [Kim]

# Selected variables

dat_s <- dat %>%
  mutate(subset0 = map(.x = file_contents, .f = ~set_colnames(.x, var_label(.) %>% data_frame() %>% unlist()) %>%
                        clean_names() %>%
                        dplyr::select(any_of(vars)) %>%
                        set_colnames(var_label(.) %>% data_frame() %>% unlist()) %>%
                        clean_names()),
         subset = map(.x = subset0, .f = ~mutate_all(.x, as_factor) %>%
                        mutate_all(as.character))
         )

dat_s_flat <- dat_s %>%
  #mutate(export = map(.x = subset, .f = ~mutate_all(.x, as.character))) %>%
  #dplyr::select(subset) %>%
  dplyr::select(country, subset) %>%
  unnest()

# epiDisplay::tab1(dat_s_flat$result_of_malaria_measurement)
# epiDisplay::tab1(dat_s_flat$result_of_malaria_rapid_test)
# hist(dat_s_flat$wealth_index_factor_score_5_decimals)

# "primary_sampling_unit", "ultimate_area_unit" iguales a "cluster_number"

write.csv(dat_s_flat, "./_dat/dat_s_flat.csv", na="", row.names = F)

2.2 GPS Data

#save.image("~/mal_ineq/_dat/mal_ineq_dat3_gps.RData")
load("./_dat/mal_ineq_dat3_gps.RData")
library(sf)

files_gps <- dir(path = "./_dat/GPS/", pattern = "*.shp" , recursive = T, full.names = T) %>% data.frame() %>% set_colnames("filename") %>%
  anti_join(dir(path = "./_dat/GPS/", pattern = "*.xml" , recursive = T, full.names = T) %>% data.frame() %>% set_colnames("filename"),
            by = "filename")

dat_gps <- files_gps %>% 
  mutate(country = str_sub(filename, -12,-11),
         type = str_sub(filename, -10,-9),
         file_contents = map(.x = filename, .f = ~sf::st_read(.) %>% st_set_geometry(NULL)))

dat_gps_flat <- dat_gps %>%
  unnest() %>%
  arrange(-DHSYEAR) %>%
  ungroup() %>%
  distinct(country, DHSID, .keep_all = T)

2.3 ADM Data

AO <- st_read("./_dat/SHP/AO/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  st_cast("MULTIPOLYGON")
BU <- st_read("./_dat/SHP/BU/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  mutate(ISO = "BU") %>%
  st_cast("MULTIPOLYGON")
LB <- st_read("./_dat/SHP/LB/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  mutate(ISO = "LB") %>%
  st_cast("MULTIPOLYGON")
MD <- st_read("./_dat/SHP/MD/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  mutate(ISO = "MD") %>%
  st_cast("MULTIPOLYGON")
TG <- st_read("./_dat/SHP/TG/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  st_cast("MULTIPOLYGON")
TZ <- st_read("./_dat/SHP/TZ/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  st_cast("MULTIPOLYGON")
UG <- st_read("./_dat/SHP/UG2/shps/sdr_subnational_boundaries.shp") %>%
  select(ISO, REGNAME) %>%
  st_cast("MULTIPOLYGON")
subnat <- st_read("./_dat/SHP/sdr_subnational_data_2020-09-27/shps/sdr_subnational_data.shp") %>%
  filter(ISO %in% c("BF","KE","MW","ML","MZ","SL")) %>%
  select(ISO, REGNAME) %>%
  st_cast("MULTIPOLYGON")

spat <- rbind(subnat,AO) %>%
  rbind(BU) %>%
  rbind(LB) %>%
  rbind(MD) %>%
  rbind(TG) %>%
  rbind(TZ) %>%
  rbind(UG) %>%
  rename(country = ISO) %>%
  mutate(REGNAME = toupper(REGNAME),
         REGNAME = str_replace_all(REGNAME, pattern = "-", replacement = " "),
         REGNAME = str_replace_all(REGNAME, pattern = "REGION", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = "PROVINCE", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = "AREA", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = "NORTHEASTERN", 
                                   replacement = "NORTH EASTERN"),
         REGNAME = str_replace_all(REGNAME, pattern = "NORTHERN", replacement = "NORTH"),
         REGNAME = str_replace_all(REGNAME, pattern = "SOUTHERN", replacement = "SOUTH"),
         REGNAME = str_replace_all(REGNAME, pattern = "GREATER", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = " DE ", replacement = " DU "),
         REGNAME = str_replace_all(REGNAME, pattern = "KARUZI", replacement = "KARUSI"),
         REGNAME = str_trim(REGNAME, side = "both"),
         poly = 1)

st_write(spat, "./_dat/SHP/union/DHS_adm.shp")

#rm(AO,BU,LB,MD,TG,TZ,UG,subnat,spat,p,p2)

2.4 Assemble

dhs_export <- dat_s_flat %>%
  dplyr::select(-c(primary_sampling_unit,ultimate_area_unit)) %>%
  mutate(w = ifelse(is.na(household_sample_weight_6_decimals), sample_weight,
                    household_sample_weight_6_decimals),
         REGNAME = ifelse(country=="BU" | country=="MZ", province,
                    region),
         wealth = ifelse(is.na(wealth_index_factor_score_5_decimals), 
                         wealth_index_factor_score_combined_5_decimals,
                         wealth_index_factor_score_5_decimals),
         rapid_test0 = ifelse(is.na(result_of_rapid_test_of_malaria), result_of_malaria_rapid_test_2,
                             result_of_rapid_test_of_malaria),
         rapid_test0 = ifelse(is.na(rapid_test0), result_of_malaria_rapid_test,
                             rapid_test0),
         # rapid_test0 = ifelse(result_of_rapid_test_of_malaria=="", result_of_malaria_rapid_test_2,
         #                     result_of_rapid_test_of_malaria),
         # rapid_test0 = ifelse(rapid_test0=="", result_of_malaria_rapid_test,
         #                     rapid_test0),
         rapid_test = ifelse(str_detect(rapid_test0, "positive"), 1, 0),
         rapid_test = ifelse(rapid_test0=="", NA, rapid_test),
         education = recode(mothers_highest_educational_level, "9" = "0_dontknow",
                            "don't know" = "0_dontknow",
                            "no education" = "1_noeducation",
                            "no education / pre-primary" = "1_noeducation",
                            "elementary (1-6)" = "2_elementary/primary",
                            "primary" = "2_elementary/primary",
                            "junior high (7-9)" = "3_junorsenhigh/secondary",
                            "secondary" = "3_junorsenhigh/secondary",
                            "senior high (10-12)" = "3_junorsenhigh/secondary",
                            "higher" = "4_higher"),
         edu = as.numeric(as.factor(education)),
         edu = ifelse(edu==1,NA,edu-1))

gps_export <- dat_gps_flat %>%
  ungroup() %>%
  mutate(cluster_number = as.character(DHSCLUST)) %>%
  dplyr::select(country, DHSID, DHSYEAR, cluster_number, URBAN_RURA, LATNUM, LONGNUM, ALT_DEM, DATUM) %>%
  arrange(-DHSYEAR) %>%
  distinct(country, cluster_number, .keep_all = T)

write.csv(gps_export, "./_dat/dat_gps_flat.csv", na="", row.names = F)

dat_export <- dhs_export %>%
  inner_join(gps_export, by = c("country", "cluster_number")) %>%
  mutate(REGNAME = toupper(REGNAME),
         REGNAME = str_replace_all(REGNAME, pattern = "-", replacement = " "),
         REGNAME = str_replace_all(REGNAME, pattern = "REGION", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = "PROVINCE", 
                                   replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = "AREA", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = "NORTHEASTERN", 
                                   replacement = "NORTH EASTERN"),
         REGNAME = str_replace_all(REGNAME, pattern = "NORTHERN", 
                                   replacement = "NORTH"),
         REGNAME = str_replace_all(REGNAME, pattern = "SOUTHERN", 
                                   replacement = "SOUTH"),
         REGNAME = str_replace_all(REGNAME, pattern = "GREATER", replacement = ""),
         REGNAME = str_replace_all(REGNAME, pattern = " DE ", replacement = " DU "),
         REGNAME = str_replace_all(REGNAME, pattern = "KARUZI", 
                                   replacement = "KARUSI"),
         REGNAME = str_trim(REGNAME, side = "both"))

write.csv(dat_export, "./_dat/dat_mal_ineq_v3.csv", na="", row.names = F) # update re-analysis [Kim]
# 
# epiDisplay::tab1(dat_s_flat$result_of_rapid_test_of_malaria)
# epiDisplay::tab1(dat_s_flat$result_of_rapid_test_of_malaria)
# epiDisplay::tab1(dhs_export$result_of_rapid_test_of_malaria)
# epiDisplay::tab1(dhs_export$rapid_test0)
# epiDisplay::tab1(dhs_export$rapid_test)

2.5 Summary

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.2
## ✓ tibble  3.0.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ readr   1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:purrr':
## 
##     set_names

2.5.1 Text data

dat.n.w <- dat_export %>%
  group_by(country, REGNAME, cluster_number) %>%
  nest() %>%
  mutate(sample.size.N = map(.x = data, .f = ~dplyr::select(.x, rapid_test, wealth, w) %>%
                               count(name = "N") %>% unlist()),
         sample.size.m = map(.x = data, .f = ~dplyr::select(.x, rapid_test, wealth, w) %>%
                               filter(!is.na(rapid_test)) %>%
                               count(name = "N_m") %>%
                               unlist()),
         dat_s = map(.x = data, .f = ~dplyr::select(.x, rapid_test, wealth, w) %>%
                       filter(complete.cases(.))),
         sample.size = map(.x = dat_s, .f = ~count(.x) %>% unlist()),
         prev = map(.x = dat_s, .f = ~mean(.x$rapid_test, na.rm=T)),
         wealth_cats = map(.x = dat_s, .f = ~distinct(.x, wealth) %>% nrow())) %>%
  # FILTERS =======================
  filter(prev > 0 & prev < 1) %>%
  filter(sample.size>=10) %>%
  filter(wealth_cats>1)

dat.n.e <- dat_export %>%
  group_by(country, REGNAME, cluster_number) %>%
  nest() %>%
  mutate(sample.size.N = map(.x = data, .f = ~dplyr::select(.x, rapid_test, edu, w) %>%
                               count(name = "N") %>% unlist()),
         sample.size.m = map(.x = data, .f = ~dplyr::select(.x, rapid_test, edu, w) %>%
                               filter(!is.na(rapid_test)) %>%
                               count(name = "N_m") %>%
                               unlist()),
         dat_s = map(.x = data, .f = ~dplyr::select(.x, rapid_test, edu, w) %>%
                       filter(complete.cases(.))),
         sample.size = map(.x = dat_s, .f = ~count(.x) %>% unlist()),
         prev = map(.x = dat_s, .f = ~mean(.x$rapid_test, na.rm=T)),
         edu_cats = map(.x = dat_s, .f = ~distinct(.x, edu) %>% nrow())) %>%
  # FILTERS =======================
  filter(prev > 0 & prev < 1) %>%
  filter(sample.size>=10) %>%
  filter(edu_cats>1)

dat.n.w2 <- dat.n.w %>%
  mutate(w = map(.x = data, .f = ~max(.x$w))) %>%
  select(sample.size.N, sample.size.m, sample.size,w) %>%
  unnest()

dat.n.e2 <- dat.n.e %>%
  mutate(w = map(.x = data, .f = ~max(.x$w))) %>%
  select(sample.size.N, sample.size.m, sample.size, w) %>%
  unnest()

# Total participants
sum(dat.n.w2$sample.size)
## [1] 47404
sum(dat.n.e2$sample.size)
## [1] 33187
# Total PSU
nrow(dat.n.w2)
## [1] 1874
nrow(dat.n.e2)
## [1] 1603
# The sample size in the PSUs
range(dat.n.w2$sample.size)
## [1]  10 114
range(dat.n.e2$sample.size)
## [1]  10 107
# Prevalence
# Total
library(weights)

dat.prev <- dat.n.w %>%
  select(sample.size.N, sample.size.m, sample.size,dat_s) %>%
  unnest()

wpct(dat.prev$rapid_test, dat.prev$w)[2]
##         1 
## 0.3644252
#country

dat.prev.n <- dat.prev %>%
  #mutate(country = str_trim(country)) %>%
  group_by(country#, 
           #REGNAME, 
           #cluster_number
           ) %>%
  nest() %>%
  mutate(w.p = map(.x = data, .f = ~wpct(.x$rapid_test, .x$w)[2])) %>%
  select(-data) %>%
  unnest()