Chapter 1 Data

1.1 Socio-demographic data

# Clear the objects in the current environment, then attach the packages this chapter uses.
# NOTE(review): rm(list = ls()) removes objects only; attached packages and options() are
# left as they were, so it is not equivalent to starting a fresh R session.
rm(list=ls())
library(tidyverse)  # dplyr verbs and the %>% pipe
library(skimr)      # skim() summaries
library(sf)         # st_read(), st_set_geometry()

# ICEMR records: drop rows without a sample id, keep the analysis columns, and coerce
# id_study and long to numeric (values that cannot be parsed become NA with a warning).
icemr <- read.csv("./_data/JASON/ICEMR2.0_P1_longJul_v1_20190406.csv",
                  stringsAsFactors = FALSE) %>%
  filter(!is.na(id_muestra)) %>%
  dplyr::select(id_muestra, id_house, id_study, edad, nm_sex,
                nm_level_study, viaje_ult_mes, lat, long, resultado_micro,
                especie_micro, temp_axilar, hist_fever, main_act_ec, tipo_casa,
                animales_casa, fumigacion, hour_sleep, result_pcr) %>% # TODO: date_fever is still missing
  mutate(id_study = as.numeric(id_study),
         long = as.numeric(long))

# CAM master file: keep the analysis columns and rename them (new = old) to the names
# used for the ICEMR table, so both tables can be stacked with bind_rows().
# The mapping is spelled out explicitly rather than copied by position from another
# object, so reordering or adding a column cannot silently mislabel data.
# NOTE(review): ce_temp -> hist_fever reproduces the previous positional mapping;
# confirm that ce_temp really is the fever-history variable.
cam <- read.csv("./_data/JASON/Master 20180905_ON.csv", stringsAsFactors = FALSE) %>%
  dplyr::select(id_muestra, id_house, id_study,
                edad = nm_age_int,
                nm_sex, nm_level_study,
                viaje_ult_mes = ce_travel,
                lat = latitud,
                long = longitud,
                resultado_micro, especie_micro,
                temp_axilar = ce_temp_ax,
                hist_fever = ce_temp,
                main_act_ec = ce_economic_act,
                tipo_casa = ce_house_type,
                animales_casa = ce_in_animals,
                fumigacion = ce_fumig,
                hour_sleep = ce_sleep_hour,
                result_pcr = result_pcr.mangold) %>%
  # Coerce latitude to numeric (unparseable values become NA with a warning).
  mutate(lat = as.numeric(lat))

1.2 Lab data

# sero <- read.csv("./_data/JASON/PvSeroTAT_Peru_Ab_data_2020-03-05.csv", stringsAsFactors = F) %>%
#   rename(id_muestra = X)

# Updated 2021-03-29
# Serology classification results, keyed by sample id. The 63SE/90SP call is exposed as
# SEROPOSITIVE, and the TREATMENT column is attached from the antibody data file.
# inner_join() keeps only samples present in both files.
sero <- read.csv("./_data/JASON/PVSEROTAT_RF_CLASS_RESULTS_0303221.csv",
                 stringsAsFactors = FALSE) %>%
  rename(id_muestra = Bleedcode,
         SEROPOSITIVE = SEROPOSITIVE_63SE_90SP) %>%
  inner_join(
    read.csv("./_data/JASON/PvSeroTAT_Peru_Ab_data_2020-03-05.csv",
             stringsAsFactors = FALSE) %>%
      # select() renames X to id_muestra while keeping only the two needed columns.
      dplyr::select(id_muestra = X, TREATMENT),
    by = "id_muestra"
  )

1.2.1 Assemble

# Stack the CAM rows on top of the ICEMR rows, then keep only the samples that also
# have a serology record (inner join on the sample id).
# NOTE(review): in the summary printed below, `long` has a median near -3.5 and a maximum
# of 73.23, and `lat` has a minimum of -73.23 -- this looks like swapped or sign-flipped
# coordinates in part of the input; confirm before using lat/long.
d1 <- bind_rows(cam, icemr) %>%
  inner_join(sero, by = "id_muestra")

skim(d1)
Table 1.1: Data summary
Name d1
Number of rows 1904
Number of columns 32
_______________________
Column type frequency:
character 5
numeric 27
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hour_sleep 0 1 7 8 0 41 0
SEROPOSITIVE_79SE_79SP 0 1 8 8 0 2 0
SEROPOSITIVE 0 1 8 8 0 2 0
SEROPOSITIVE_90SE_59SP 0 1 8 8 0 2 0
TREATMENT 0 1 9 12 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id_muestra 0 1.00 5096818.03 4260507.20 501001.00 502190.75 9020246.00 9042933.50 9072251.00 ▇▁▁▁▇
id_house 0 1.00 718499.70 200661.45 501001.00 502182.00 902008.00 904107.25 907083.00 ▇▁▁▁▇
id_study 13 0.99 714191804.13 199690114.30 500100101.00 500217901.50 900200704.00 900410603.50 900708302.00 ▇▁▁▁▇
edad 0 1.00 28.85 21.83 0.00 10.00 24.00 44.00 117.00 ▇▅▃▁▁
nm_sex 0 1.00 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
nm_level_study 0 1.00 607.53 2381.72 1.00 3.00 4.00 5.00 9999.00 ▇▁▁▁▁
viaje_ult_mes 0 1.00 26.45 511.85 0.00 0.00 0.00 0.00 9999.00 ▇▁▁▁▁
lat 102 0.95 -3.95 4.91 -73.23 -3.80 -3.51 -3.44 -3.35 ▁▁▁▁▇
long 102 0.95 -33.50 35.41 -73.34 -73.33 -3.51 -3.44 73.23 ▆▁▇▁▁
resultado_micro 32 0.98 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
especie_micro 1006 0.47 0.07 0.36 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
temp_axilar 0 1.00 36.19 0.51 35.50 35.90 36.10 36.50 40.20 ▇▃▁▁▁
hist_fever 0 1.00 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
main_act_ec 0 1.00 7.40 9.11 0.00 5.00 7.00 8.00 88.00 ▇▁▁▁▁
tipo_casa 0 1.00 3.28 0.79 1.00 3.00 3.00 4.00 4.00 ▁▂▁▇▇
animales_casa 0 1.00 0.29 0.46 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
fumigacion 0 1.00 0.48 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
result_pcr 0 1.00 0.09 0.40 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
W16_RAMA 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W02_L02 0 1.00 0.00 0.01 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W58_EBPII 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W50_RBP2b 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W01_MSP119 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W30_MSP8 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W08_L12 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W39_MSP3a 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
RFOREST_MODEL_VOTES 0 1.00 0.74 0.23 0.11 0.56 0.82 0.95 1.00 ▁▂▂▃▇
# Point layer read from the shapefile; it is joined to d1 by id_house further down.
coord <- sf::st_read(dsn = "./_data/ser_data.shp")
## Reading layer `ser_data' from data source `/Users/gabrielcarrasco/Dropbox/Work/Colabs UPCH/Serology [CAM:Jason]/Analysis/RSCD_JR/_data/ser_data.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1233 features and 2 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -73.34285 ymin: -3.835081 xmax: -72.97889 ymax: -3.354823
## geographic CRS: WGS 84
# Attach the individual-level records to the point layer and derive analysis variables:
#   sero    1 when SEROPOSITIVE is "Positive", otherwise 0
#   fever   1 when axillary temperature is 37.5 or higher, otherwise 0
#   nm_sex  relabelled "1_male" / "0_female"
#   area    "0_periurban" for community codes below 600, otherwise "1_rural"
#   age_cat age bands (-Inf,5], (5,15], (15,30], (30,50], (50,Inf]
# NOTE(review): the summaries below show 9999 as a level of nm_level_study and
# viaje_ult_mes -- presumably a missing-value code; confirm whether it should be
# recoded to NA before these columns are turned into factors.
d2 <- coord %>%
  distinct(.keep_all = TRUE) %>%  # drop exact duplicate rows of the point layer
  inner_join(d1, by = "id_house") %>%
  mutate(sero = ifelse(SEROPOSITIVE == "Positive", 1, 0),
         fever = ifelse(temp_axilar < 37.5, 0, 1),
         nm_sex = ifelse(nm_sex == 1, "1_male", "0_female"),
         area = ifelse(as.numeric(as.character(comm)) < 600, "0_periurban", "1_rural"),
         age_cat = cut(edad, breaks = c(-Inf, 5, 15, 30, 50, Inf))) %>%
  # across() replaces the superseded mutate_at(); area is converted here together
  # with the other categorical columns.
  mutate(across(all_of(c("nm_sex", "nm_level_study", "viaje_ult_mes", "resultado_micro",
                         "especie_micro", "fever", "area")),
                as.factor))

skim(d2)
Table 1.1: Data summary
Name d2
Number of rows 1790
Number of columns 38
_______________________
Column type frequency:
character 7
factor 8
numeric 23
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
comm 0 1 3 3 0 10 0
hour_sleep 0 1 7 8 0 41 0
SEROPOSITIVE_79SE_79SP 0 1 8 8 0 2 0
SEROPOSITIVE 0 1 8 8 0 2 0
SEROPOSITIVE_90SE_59SP 0 1 8 8 0 2 0
TREATMENT 0 1 9 12 0 2 0
geometry 0 1 21 39 0 572 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
nm_sex 0 1.00 FALSE 2 0_f: 973, 1_m: 817
nm_level_study 0 1.00 FALSE 12 3: 627, 5: 316, 4: 295, 6: 137
viaje_ult_mes 0 1.00 FALSE 3 0: 1436, 1: 349, 999: 5
resultado_micro 23 0.99 FALSE 2 0: 1729, 1: 38
especie_micro 987 0.45 FALSE 3 0: 768, 2: 27, 1: 8
fever 0 1.00 FALSE 2 0: 1764, 1: 26
area 0 1.00 FALSE 2 1_r: 1005, 0_p: 785
age_cat 0 1.00 FALSE 5 (5,: 568, (30: 388, (50: 357, (15: 299

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id_house 0 1.00 728025.71 199707.77 501001.00 502212.75 902021.00 905002.00 907079.00 ▆▁▁▁▇
id_muestra 0 1.00 5298789.57 4240476.79 501001.00 502200.25 9020646.00 9050038.50 9072251.00 ▆▁▁▁▇
id_study 12 0.99 723740754.26 198798954.37 500100101.00 500220802.00 900201904.50 900411103.75 900707901.00 ▆▁▁▁▇
edad 0 1.00 28.99 22.04 0.00 10.00 24.00 45.00 117.00 ▇▅▃▁▁
lat 3 1.00 -3.95 4.93 -73.23 -3.80 -3.51 -3.44 -3.35 ▁▁▁▁▇
long 3 1.00 -33.75 35.45 -73.34 -73.33 -3.51 -3.44 73.23 ▆▁▇▁▁
temp_axilar 0 1.00 36.18 0.48 35.50 35.90 36.10 36.50 39.30 ▇▆▁▁▁
hist_fever 0 1.00 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
main_act_ec 0 1.00 7.35 9.18 0.00 5.00 7.00 8.00 88.00 ▇▁▁▁▁
tipo_casa 0 1.00 3.29 0.77 1.00 3.00 3.00 4.00 4.00 ▁▂▁▆▇
animales_casa 0 1.00 0.29 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
fumigacion 0 1.00 0.47 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
result_pcr 0 1.00 0.08 0.38 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
W16_RAMA 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W02_L02 0 1.00 0.00 0.01 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W58_EBPII 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W50_RBP2b 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W01_MSP119 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W30_MSP8 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W08_L12 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W39_MSP3a 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
RFOREST_MODEL_VOTES 0 1.00 0.74 0.23 0.11 0.56 0.82 0.95 1.00 ▁▂▂▃▇
sero 0 1.00 0.33 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃

1.3 Descriptive [Table 1]

# Plain data frame for Table 1: geometry removed from d2 and only the descriptive
# variables (plus SEROPOSITIVE and TREATMENT) retained.
t1 <- st_set_geometry(d2, NULL) %>%
  dplyr::select(
    area, comm, edad, age_cat, nm_sex, nm_level_study, viaje_ult_mes,
    resultado_micro, especie_micro, fever, temp_axilar, hist_fever,
    SEROPOSITIVE, TREATMENT
  )

library(table1)
# Summarise every column of t1, split by SEROPOSITIVE.
# NOTE(review): hist_fever is still numeric at this point, so it is reported as
# mean (SD) rather than as counts; confirm whether it should be a factor.
table1(~ . | SEROPOSITIVE, data = t1)
Negative
(N=1200)
Positive
(N=590)
Overall
(N=1790)
area
0_periurban 605 (50.4%) 180 (30.5%) 785 (43.9%)
1_rural 595 (49.6%) 410 (69.5%) 1005 (56.1%)
comm
501 197 (16.4%) 53 (9.0%) 250 (14.0%)
502 187 (15.6%) 86 (14.6%) 273 (15.3%)
503 221 (18.4%) 41 (6.9%) 262 (14.6%)
901 13 (1.1%) 34 (5.8%) 47 (2.6%)
902 77 (6.4%) 102 (17.3%) 179 (10.0%)
903 22 (1.8%) 36 (6.1%) 58 (3.2%)
904 186 (15.5%) 84 (14.2%) 270 (15.1%)
905 82 (6.8%) 15 (2.5%) 97 (5.4%)
906 109 (9.1%) 57 (9.7%) 166 (9.3%)
907 106 (8.8%) 82 (13.9%) 188 (10.5%)
edad
Mean (SD) 24.1 (20.4) 39.0 (21.9) 29.0 (22.0)
Median [Min, Max] 15.0 [0, 117] 38.0 [1.00, 92.0] 24.0 [0, 117]
age_cat
(-Inf,5] 160 (13.3%) 18 (3.1%) 178 (9.9%)
(5,15] 467 (38.9%) 101 (17.1%) 568 (31.7%)
(15,30] 201 (16.8%) 98 (16.6%) 299 (16.7%)
(30,50] 208 (17.3%) 180 (30.5%) 388 (21.7%)
(50, Inf] 164 (13.7%) 193 (32.7%) 357 (19.9%)
nm_sex
0_female 695 (57.9%) 278 (47.1%) 973 (54.4%)
1_male 505 (42.1%) 312 (52.9%) 817 (45.6%)
nm_level_study
1 82 (6.8%) 54 (9.2%) 136 (7.6%)
2 97 (8.1%) 7 (1.2%) 104 (5.8%)
3 425 (35.4%) 202 (34.2%) 627 (35.0%)
4 146 (12.2%) 149 (25.3%) 295 (16.5%)
5 218 (18.2%) 98 (16.6%) 316 (17.7%)
6 100 (8.3%) 37 (6.3%) 137 (7.7%)
7 13 (1.1%) 3 (0.5%) 16 (0.9%)
8 18 (1.5%) 3 (0.5%) 21 (1.2%)
9 7 (0.6%) 1 (0.2%) 8 (0.4%)
10 11 (0.9%) 4 (0.7%) 15 (0.8%)
11 0 (0%) 1 (0.2%) 1 (0.1%)
9999 83 (6.9%) 31 (5.3%) 114 (6.4%)
viaje_ult_mes
0 1016 (84.7%) 420 (71.2%) 1436 (80.2%)
1 181 (15.1%) 168 (28.5%) 349 (19.5%)
9999 3 (0.2%) 2 (0.3%) 5 (0.3%)
resultado_micro
0 1167 (97.2%) 562 (95.3%) 1729 (96.6%)
1 15 (1.2%) 23 (3.9%) 38 (2.1%)
Missing 18 (1.5%) 5 (0.8%) 23 (1.3%)
especie_micro
0 596 (49.7%) 172 (29.2%) 768 (42.9%)
1 5 (0.4%) 3 (0.5%) 8 (0.4%)
2 8 (0.7%) 19 (3.2%) 27 (1.5%)
Missing 591 (49.2%) 396 (67.1%) 987 (55.1%)
fever
0 1182 (98.5%) 582 (98.6%) 1764 (98.5%)
1 18 (1.5%) 8 (1.4%) 26 (1.5%)
temp_axilar
Mean (SD) 36.2 (0.480) 36.2 (0.482) 36.2 (0.481)
Median [Min, Max] 36.1 [35.5, 39.3] 36.1 [35.5, 39.3] 36.1 [35.5, 39.3]
hist_fever
Mean (SD) 0.104 (0.306) 0.158 (0.365) 0.122 (0.327)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
SEROPOSITIVE
Negative 1200 (100%) 0 (0%) 1200 (67.0%)
Positive 0 (0%) 590 (100%) 590 (33.0%)
TREATMENT
No treatment 917 (76.4%) 0 (0%) 917 (51.2%)
Treatment 283 (23.6%) 590 (100%) 873 (48.8%)
library(tableone)
# Stratified comparison table with p-values. Variables are chosen by name instead of by
# column position, so the call keeps working if the columns of t1 are reordered:
#   vars       every column except the stratum (SEROPOSITIVE) and TREATMENT
#   factorVars the same set minus the two continuous variables (edad, temp_axilar)
CreateTableOne(
  vars = setdiff(names(t1), c("SEROPOSITIVE", "TREATMENT")),
  strata = "SEROPOSITIVE",
  data = t1,
  factorVars = setdiff(names(t1),
                       c("edad", "temp_axilar", "SEROPOSITIVE", "TREATMENT"))
)
##                          Stratified by SEROPOSITIVE
##                           Negative      Positive      p      test
##   n                        1200           590                    
##   area = 1_rural (%)        595 (49.6)    410 (69.5)  <0.001     
##   comm (%)                                            <0.001     
##      501                    197 (16.4)     53 ( 9.0)             
##      502                    187 (15.6)     86 (14.6)             
##      503                    221 (18.4)     41 ( 6.9)             
##      901                     13 ( 1.1)     34 ( 5.8)             
##      902                     77 ( 6.4)    102 (17.3)             
##      903                     22 ( 1.8)     36 ( 6.1)             
##      904                    186 (15.5)     84 (14.2)             
##      905                     82 ( 6.8)     15 ( 2.5)             
##      906                    109 ( 9.1)     57 ( 9.7)             
##      907                    106 ( 8.8)     82 (13.9)             
##   edad (mean (SD))        24.08 (20.42) 38.97 (21.85) <0.001     
##   age_cat (%)                                         <0.001     
##      (-Inf,5]               160 (13.3)     18 ( 3.1)             
##      (5,15]                 467 (38.9)    101 (17.1)             
##      (15,30]                201 (16.8)     98 (16.6)             
##      (30,50]                208 (17.3)    180 (30.5)             
##      (50, Inf]              164 (13.7)    193 (32.7)             
##   nm_sex = 1_male (%)       505 (42.1)    312 (52.9)  <0.001     
##   nm_level_study (%)                                  <0.001     
##      1                       82 ( 6.8)     54 ( 9.2)             
##      2                       97 ( 8.1)      7 ( 1.2)             
##      3                      425 (35.4)    202 (34.2)             
##      4                      146 (12.2)    149 (25.3)             
##      5                      218 (18.2)     98 (16.6)             
##      6                      100 ( 8.3)     37 ( 6.3)             
##      7                       13 ( 1.1)      3 ( 0.5)             
##      8                       18 ( 1.5)      3 ( 0.5)             
##      9                        7 ( 0.6)      1 ( 0.2)             
##      10                      11 ( 0.9)      4 ( 0.7)             
##      11                       0 ( 0.0)      1 ( 0.2)             
##      9999                    83 ( 6.9)     31 ( 5.3)             
##   viaje_ult_mes (%)                                   <0.001     
##      0                     1016 (84.7)    420 (71.2)             
##      1                      181 (15.1)    168 (28.5)             
##      9999                     3 ( 0.2)      2 ( 0.3)             
##   resultado_micro = 1 (%)    15 ( 1.3)     23 ( 3.9)   0.001     
##   especie_micro (%)                                   <0.001     
##      0                      596 (97.9)    172 (88.7)             
##      1                        5 ( 0.8)      3 ( 1.5)             
##      2                        8 ( 1.3)     19 ( 9.8)             
##   fever = 1 (%)              18 ( 1.5)      8 ( 1.4)   0.977     
##   temp_axilar (mean (SD)) 36.18 (0.48)  36.19 (0.48)   0.686     
##   hist_fever = 1 (%)        125 (10.4)     93 (15.8)   0.002