Chapter 1 Data Wrangling

1.1 Socio-demographic data

# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# only removes objects from the global environment (it does not detach
# packages or reset options) and it wipes the workspace of anyone who sources
# this script. Restart R to get a clean session instead.
library(tidyverse)
library(skimr)
library(sf)

# Read the ICEMR CSV, drop rows without a sample id (`id_muestra`) and keep
# only the 14 columns used downstream.
icemr <- read.csv(
  "./_data/JASON/ICEMR2.0_P1_longJul_v1_20190406.csv",
  stringsAsFactors = FALSE
) %>%
  filter(!is.na(id_muestra)) %>%
  dplyr::select(
    id_muestra, id_house, id_study, edad, nm_sex,
    nm_level_study, viaje_ult_mes, lat, long, resultado_micro,
    especie_micro, temp_axilar, hist_fever, main_act_ec
  ) %>% # TODO: date_fever is still missing
  # Force both columns to numeric; values that cannot be parsed become NA
  # (with a warning).
  mutate(
    id_study = as.numeric(id_study),
    long = as.numeric(long)
  )

# Read the "Master" CSV and keep the 14 columns that correspond to the ones
# selected for `icemr`, renaming them (new_name = old_name) to the `icemr`
# names so that both tables can be stacked.
# The renames are spelled out instead of copying `colnames(icemr)` by
# position, so the mapping cannot silently shift if either column list is
# edited.
# NOTE(review): `ce_temp` is mapped onto `hist_fever`, exactly as the previous
# positional renaming did -- confirm these two variables are equivalent.
cam <- read.csv("./_data/JASON/Master 20180905_ON.csv", stringsAsFactors = FALSE) %>%
  dplyr::select(
    id_muestra, id_house, id_study,
    edad = nm_age_int,
    nm_sex, nm_level_study,
    viaje_ult_mes = ce_travel,
    lat = latitud,
    long = longitud,
    resultado_micro, especie_micro,
    temp_axilar = ce_temp_ax,
    hist_fever = ce_temp,
    main_act_ec = ce_economic_act
  ) %>%
  # Force latitude to numeric; values that cannot be parsed become NA
  # (with a warning).
  mutate(lat = as.numeric(lat))

1.2 Lab data

# Read the serology (antibody) results and rename column `X` to `id_muestra`,
# the key used to join with the socio-demographic tables.
# NOTE(review): `X` is presumably the name read.csv() assigned to an unnamed
# first column -- confirm against the CSV header.
sero <- read.csv(
  "./_data/JASON/PvSeroTAT_Peru_Ab_data_2020-03-05.csv",
  stringsAsFactors = FALSE
) %>%
  rename(id_muestra = X)

1.3 Assembling

# Stack the two socio-demographic tables (`cam` rows first, then `icemr`) and
# keep only the rows whose `id_muestra` also appears in the serology table.
d1 <- inner_join(
  bind_rows(cam, icemr),
  sero,
  by = "id_muestra"
)

# Summary overview of the assembled table.
skimr::skim(d1)

# Load the shapefile as an sf object; it is joined to `d1` by `id_house` below.
coord <- sf::st_read(dsn = "./_data/ser_data.shp")

# Attach the spatial records to the assembled table and derive the analysis
# variables. Fully duplicated rows of `coord` are dropped first, then only the
# rows with a matching `id_house` in `d1` are kept.
d2 <- coord %>%
  distinct(.keep_all = TRUE) %>%
  inner_join(d1, by = "id_house") %>%
  mutate(
    # 1 when SEROPOSITIVE is "Positive", 0 otherwise (NA stays NA).
    sero = ifelse(SEROPOSITIVE == "Positive", 1, 0),
    # Axillary temperature of 37.5 or more is coded 1, below 37.5 is coded 0;
    # stored as a factor.
    fever = as.factor(ifelse(temp_axilar < 37.5, 0, 1)),
    # `comm` codes below 600 are labelled "periurban", the rest "rural".
    area = as.factor(
      ifelse(as.numeric(as.character(comm)) < 600, "periurban", "rural")
    ),
    # Right-closed age bands: (-Inf,5], (5,15], (15,30], (30,50], (50,Inf].
    age_cat = cut(edad, breaks = c(-Inf, 5, 15, 30, 50, Inf)),
    # Codes below 1 are labelled "Female", everything else "Male".
    nm_sex = as.factor(
      ifelse(as.numeric(as.character(nm_sex)) < 1, "Female", "Male")
    ),
    # Remaining categorical variables are converted to factors as they are.
    nm_level_study = as.factor(nm_level_study),
    viaje_ult_mes = as.factor(viaje_ult_mes),
    resultado_micro = as.factor(resultado_micro),
    especie_micro = as.factor(especie_micro)
  )

# Summary overview of the final analysis table.
skim(d2)