Chapter 2 Data
2.1 DHS Data
rm(list = ls())
library(purrr)
library(tidyverse)
library(haven)
library(janitor)
library(labelled)
library(magrittr)
files <- dir(path = "./_dat/DHS/", pattern = "*.DTA" , recursive = T, full.names = T)
dat <- data_frame(filename = files) %>%
mutate(country = str_sub(filename, -12,-11),
type = str_sub(filename, -10,-9),
file_contents = map(.x = filename, .f = ~read_stata(.))) %>%
filter(type=="PR")
# Function ========
info <- function(dat) {
a <- tibble(var_names = colnames(dat),
var_lab1 = var_label(dat, unlist = T),
var_lab = as.character(var_lab1),
var_lab_make = make_clean_names(var_lab)) %>%
dplyr::select(-var_lab1)
return(a)
}
# Function ========
# Variables summary/matches
d0 <- dat %>%
mutate(codebook = map(.x = file_contents, .f = ~info(.x))) %>%
dplyr::select(country, codebook) %>%
unnest(cols = c(codebook))
d1 <- d0 %>%
group_by(var_lab_make) %>%
count()
vars <- c("region","province","result_of_malaria_measurement", "wealth_index_factor_score_5_decimals", "wealth_index_factor_score_combined_5_decimals",
"mothers_highest_educational_level", "primary_sampling_unit", "ultimate_area_unit", "household_sample_weight_6_decimals",
"sample_weight", "result_of_malaria_rapid_test", "result_of_malaria_rapid_test_2", "result_of_rapid_test_of_malaria",
"year_of_interview", "cluster_number",
"age_of_household_members") # update re-analysis [Kim]
# Selected variables
dat_s <- dat %>%
mutate(subset0 = map(.x = file_contents, .f = ~set_colnames(.x, var_label(.) %>% data_frame() %>% unlist()) %>%
clean_names() %>%
dplyr::select(any_of(vars)) %>%
set_colnames(var_label(.) %>% data_frame() %>% unlist()) %>%
clean_names()),
subset = map(.x = subset0, .f = ~mutate_all(.x, as_factor) %>%
mutate_all(as.character))
)
dat_s_flat <- dat_s %>%
#mutate(export = map(.x = subset, .f = ~mutate_all(.x, as.character))) %>%
#dplyr::select(subset) %>%
dplyr::select(country, subset) %>%
unnest()
# epiDisplay::tab1(dat_s_flat$result_of_malaria_measurement)
# epiDisplay::tab1(dat_s_flat$result_of_malaria_rapid_test)
# hist(dat_s_flat$wealth_index_factor_score_5_decimals)
# "primary_sampling_unit", "ultimate_area_unit" iguales a "cluster_number"
write.csv(dat_s_flat, "./_dat/dat_s_flat.csv", na="", row.names = F)
2.2 GPS Data
library(sf)
files_gps <- dir(path = "./_dat/GPS/", pattern = "*.shp" , recursive = T, full.names = T) %>% data.frame() %>% set_colnames("filename") %>%
anti_join(dir(path = "./_dat/GPS/", pattern = "*.xml" , recursive = T, full.names = T) %>% data.frame() %>% set_colnames("filename"),
by = "filename")
dat_gps <- files_gps %>%
mutate(country = str_sub(filename, -12,-11),
type = str_sub(filename, -10,-9),
file_contents = map(.x = filename, .f = ~sf::st_read(.) %>% st_set_geometry(NULL)))
dat_gps_flat <- dat_gps %>%
unnest() %>%
arrange(-DHSYEAR) %>%
ungroup() %>%
distinct(country, DHSID, .keep_all = T)
2.3 ADM Data
AO <- st_read("./_dat/SHP/AO/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
st_cast("MULTIPOLYGON")
BU <- st_read("./_dat/SHP/BU/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
mutate(ISO = "BU") %>%
st_cast("MULTIPOLYGON")
LB <- st_read("./_dat/SHP/LB/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
mutate(ISO = "LB") %>%
st_cast("MULTIPOLYGON")
MD <- st_read("./_dat/SHP/MD/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
mutate(ISO = "MD") %>%
st_cast("MULTIPOLYGON")
TG <- st_read("./_dat/SHP/TG/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
st_cast("MULTIPOLYGON")
TZ <- st_read("./_dat/SHP/TZ/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
st_cast("MULTIPOLYGON")
UG <- st_read("./_dat/SHP/UG2/shps/sdr_subnational_boundaries.shp") %>%
select(ISO, REGNAME) %>%
st_cast("MULTIPOLYGON")
subnat <- st_read("./_dat/SHP/sdr_subnational_data_2020-09-27/shps/sdr_subnational_data.shp") %>%
filter(ISO %in% c("BF","KE","MW","ML","MZ","SL")) %>%
select(ISO, REGNAME) %>%
st_cast("MULTIPOLYGON")
spat <- rbind(subnat,AO) %>%
rbind(BU) %>%
rbind(LB) %>%
rbind(MD) %>%
rbind(TG) %>%
rbind(TZ) %>%
rbind(UG) %>%
rename(country = ISO) %>%
mutate(REGNAME = toupper(REGNAME),
REGNAME = str_replace_all(REGNAME, pattern = "-", replacement = " "),
REGNAME = str_replace_all(REGNAME, pattern = "REGION", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = "PROVINCE", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = "AREA", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = "NORTHEASTERN",
replacement = "NORTH EASTERN"),
REGNAME = str_replace_all(REGNAME, pattern = "NORTHERN", replacement = "NORTH"),
REGNAME = str_replace_all(REGNAME, pattern = "SOUTHERN", replacement = "SOUTH"),
REGNAME = str_replace_all(REGNAME, pattern = "GREATER", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = " DE ", replacement = " DU "),
REGNAME = str_replace_all(REGNAME, pattern = "KARUZI", replacement = "KARUSI"),
REGNAME = str_trim(REGNAME, side = "both"),
poly = 1)
st_write(spat, "./_dat/SHP/union/DHS_adm.shp")
#rm(AO,BU,LB,MD,TG,TZ,UG,subnat,spat,p,p2)
2.4 Assemble
dhs_export <- dat_s_flat %>%
dplyr::select(-c(primary_sampling_unit,ultimate_area_unit)) %>%
mutate(w = ifelse(is.na(household_sample_weight_6_decimals), sample_weight,
household_sample_weight_6_decimals),
REGNAME = ifelse(country=="BU" | country=="MZ", province,
region),
wealth = ifelse(is.na(wealth_index_factor_score_5_decimals),
wealth_index_factor_score_combined_5_decimals,
wealth_index_factor_score_5_decimals),
rapid_test0 = ifelse(is.na(result_of_rapid_test_of_malaria), result_of_malaria_rapid_test_2,
result_of_rapid_test_of_malaria),
rapid_test0 = ifelse(is.na(rapid_test0), result_of_malaria_rapid_test,
rapid_test0),
# rapid_test0 = ifelse(result_of_rapid_test_of_malaria=="", result_of_malaria_rapid_test_2,
# result_of_rapid_test_of_malaria),
# rapid_test0 = ifelse(rapid_test0=="", result_of_malaria_rapid_test,
# rapid_test0),
rapid_test = ifelse(str_detect(rapid_test0, "positive"), 1, 0),
rapid_test = ifelse(rapid_test0=="", NA, rapid_test),
education = recode(mothers_highest_educational_level, "9" = "0_dontknow",
"don't know" = "0_dontknow",
"no education" = "1_noeducation",
"no education / pre-primary" = "1_noeducation",
"elementary (1-6)" = "2_elementary/primary",
"primary" = "2_elementary/primary",
"junior high (7-9)" = "3_junorsenhigh/secondary",
"secondary" = "3_junorsenhigh/secondary",
"senior high (10-12)" = "3_junorsenhigh/secondary",
"higher" = "4_higher"),
edu = as.numeric(as.factor(education)),
edu = ifelse(edu==1,NA,edu-1))
gps_export <- dat_gps_flat %>%
ungroup() %>%
mutate(cluster_number = as.character(DHSCLUST)) %>%
dplyr::select(country, DHSID, DHSYEAR, cluster_number, URBAN_RURA, LATNUM, LONGNUM, ALT_DEM, DATUM) %>%
arrange(-DHSYEAR) %>%
distinct(country, cluster_number, .keep_all = T)
write.csv(gps_export, "./_dat/dat_gps_flat.csv", na="", row.names = F)
dat_export <- dhs_export %>%
inner_join(gps_export, by = c("country", "cluster_number")) %>%
mutate(REGNAME = toupper(REGNAME),
REGNAME = str_replace_all(REGNAME, pattern = "-", replacement = " "),
REGNAME = str_replace_all(REGNAME, pattern = "REGION", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = "PROVINCE",
replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = "AREA", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = "NORTHEASTERN",
replacement = "NORTH EASTERN"),
REGNAME = str_replace_all(REGNAME, pattern = "NORTHERN",
replacement = "NORTH"),
REGNAME = str_replace_all(REGNAME, pattern = "SOUTHERN",
replacement = "SOUTH"),
REGNAME = str_replace_all(REGNAME, pattern = "GREATER", replacement = ""),
REGNAME = str_replace_all(REGNAME, pattern = " DE ", replacement = " DU "),
REGNAME = str_replace_all(REGNAME, pattern = "KARUZI",
replacement = "KARUSI"),
REGNAME = str_trim(REGNAME, side = "both"))
write.csv(dat_export, "./_dat/dat_mal_ineq_v3.csv", na="", row.names = F) # update re-analysis [Kim]
#
# epiDisplay::tab1(dat_s_flat$result_of_rapid_test_of_malaria)
# epiDisplay::tab1(dat_s_flat$result_of_rapid_test_of_malaria)
# epiDisplay::tab1(dhs_export$result_of_rapid_test_of_malaria)
# epiDisplay::tab1(dhs_export$rapid_test0)
# epiDisplay::tab1(dhs_export$rapid_test)
2.5 Summary
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.6
## ✓ tibble 3.1.2 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:purrr':
##
## set_names
2.5.1 Text data
dat.n.w <- dat_export %>%
group_by(country, REGNAME, cluster_number) %>%
nest() %>%
mutate(sample.size.N = map(.x = data, .f = ~dplyr::select(.x, rapid_test, wealth, w) %>%
count(name = "N") %>% unlist()),
sample.size.m = map(.x = data, .f = ~dplyr::select(.x, rapid_test, wealth, w) %>%
filter(!is.na(rapid_test)) %>%
count(name = "N_m") %>%
unlist()),
dat_s = map(.x = data, .f = ~dplyr::select(.x, rapid_test, wealth, w) %>%
filter(complete.cases(.))),
sample.size = map(.x = dat_s, .f = ~count(.x) %>% unlist()),
prev = map(.x = dat_s, .f = ~mean(.x$rapid_test, na.rm=T)),
wealth_cats = map(.x = dat_s, .f = ~distinct(.x, wealth) %>% nrow())) %>%
# FILTERS =======================
filter(prev > 0 & prev < 1) %>%
filter(sample.size>=10) %>%
filter(wealth_cats>1)
dat.n.e <- dat_export %>%
group_by(country, REGNAME, cluster_number) %>%
nest() %>%
mutate(sample.size.N = map(.x = data, .f = ~dplyr::select(.x, rapid_test, edu, w) %>%
count(name = "N") %>% unlist()),
sample.size.m = map(.x = data, .f = ~dplyr::select(.x, rapid_test, edu, w) %>%
filter(!is.na(rapid_test)) %>%
count(name = "N_m") %>%
unlist()),
dat_s = map(.x = data, .f = ~dplyr::select(.x, rapid_test, edu, w) %>%
filter(complete.cases(.))),
sample.size = map(.x = dat_s, .f = ~count(.x) %>% unlist()),
prev = map(.x = dat_s, .f = ~mean(.x$rapid_test, na.rm=T)),
edu_cats = map(.x = dat_s, .f = ~distinct(.x, edu) %>% nrow())) %>%
# FILTERS =======================
filter(prev > 0 & prev < 1) %>%
filter(sample.size>=10) %>%
filter(edu_cats>1)
dat.n.w2 <- dat.n.w %>%
mutate(w = map(.x = data, .f = ~max(.x$w))) %>%
select(sample.size.N, sample.size.m, sample.size,w) %>%
unnest()
dat.n.e2 <- dat.n.e %>%
mutate(w = map(.x = data, .f = ~max(.x$w))) %>%
select(sample.size.N, sample.size.m, sample.size, w) %>%
unnest()
# Total participants
sum(dat.n.w2$sample.size)
## [1] 47404
## [1] 33187
## [1] 1874
## [1] 1603
## [1] 10 114
## [1] 10 107
# Prevalence
# Total
library(weights)
dat.prev <- dat.n.w %>%
select(sample.size.N, sample.size.m, sample.size,dat_s) %>%
unnest()
wpct(dat.prev$rapid_test, dat.prev$w)[2]
## 1
## 0.3644252