Chapter 1 Data

1.1 Socio-demographic data

# Packages used throughout this chapter.
# NOTE: the former `rm(list = ls())` call was removed. It only deletes user
# objects (attached packages and options are left untouched), so it does not
# give a clean session; run/knit this chapter in a fresh R session instead.
library(tidyverse) # dplyr verbs and the %>% pipe
library(skimr)     # skim() data summaries
library(sf)        # st_read() / st_set_geometry() for the household shapefile

# ICEMR survey: read the raw CSV, drop records without a sample id
# (`id_muestra`) and keep the variables used in the rest of the chapter.
icemr <- read.csv("./_data/JASON/ICEMR2.0_P1_longJul_v1_20190406.csv",
                  stringsAsFactors = FALSE) %>%
  filter(!is.na(id_muestra)) %>%
  dplyr::select(id_muestra, id_house, id_study, edad, nm_sex,
                nm_level_study, viaje_ult_mes, lat, long, resultado_micro,
                especie_micro, temp_axilar, hist_fever, main_act_ec, tipo_casa,
                animales_casa, fumigacion, hour_sleep, result_pcr) %>% # TODO: date_fever is still missing
  # Force both columns to numeric; entries that cannot be parsed become NA
  # (with a coercion warning).
  mutate(id_study = as.numeric(id_study),
         long = as.numeric(long))

# CAM survey: read the master CSV and keep the counterparts of the ICEMR
# variables. The column ORDER matters: the columns are renamed by position
# right after this step, so it must mirror the ICEMR selection.
cam <- read.csv("./_data/JASON/Master 20180905_ON.csv",
                stringsAsFactors = FALSE) %>%
  dplyr::select(id_muestra, id_house, id_study, nm_age_int, nm_sex,
                nm_level_study, ce_travel, latitud, longitud, resultado_micro,
                especie_micro, ce_temp_ax, ce_temp, ce_economic_act, ce_house_type,
                ce_in_animals, ce_fumig, ce_sleep_hour, result_pcr.mangold) %>%
  # Force latitude to numeric; entries that cannot be parsed become NA.
  mutate(latitud = as.numeric(latitud))

# Harmonise the CAM column names with the ICEMR ones so both tables can be
# stacked. Names are assigned BY POSITION, i.e.
#   nm_age_int -> edad, ce_travel -> viaje_ult_mes, latitud -> lat,
#   longitud -> long, ce_temp_ax -> temp_axilar, ce_temp -> hist_fever,
#   ce_economic_act -> main_act_ec, ce_house_type -> tipo_casa,
#   ce_in_animals -> animales_casa, ce_fumig -> fumigacion,
#   ce_sleep_hour -> hour_sleep, result_pcr.mangold -> result_pcr.
# Guard against a column-count mismatch, which would otherwise leave
# NA-named columns without any error.
stopifnot(ncol(cam) == ncol(icemr))
colnames(cam) <- colnames(icemr)

1.2 Lab data

# Serology results: the sample id arrives in the column `X`, so rename it to
# `id_muestra`, the key used to join with the survey data.
sero <- read.csv("./_data/JASON/PvSeroTAT_Peru_Ab_data_2020-03-05.csv",
                 stringsAsFactors = FALSE) %>%
  rename(id_muestra = X)

1.2.1 Assemble

# Stack both surveys (CAM rows first, then ICEMR) and keep only the samples
# that also have a serology record.
d1 <- inner_join(
  bind_rows(cam, icemr),
  sero,
  by = "id_muestra"
)

# Quick overview of the assembled table.
skim(d1)
Table 1.1: Data summary
Name d1
Number of rows 1904
Number of columns 30
_______________________
Column type frequency:
character 3
numeric 27
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hour_sleep 0 1 7 8 0 41 0
SEROPOSITIVE 0 1 8 8 0 2 0
TREATMENT 0 1 9 12 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id_muestra 0 1.00 5096818.03 4260507.20 501001.00 502190.75 9020246.00 9042933.50 9072251.00 ▇▁▁▁▇
id_house 0 1.00 718499.70 200661.45 501001.00 502182.00 902008.00 904107.25 907083.00 ▇▁▁▁▇
id_study 13 0.99 714191804.13 199690114.30 500100101.00 500217901.50 900200704.00 900410603.50 900708302.00 ▇▁▁▁▇
edad 0 1.00 28.85 21.83 0.00 10.00 24.00 44.00 117.00 ▇▅▃▁▁
nm_sex 0 1.00 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
nm_level_study 0 1.00 607.53 2381.72 1.00 3.00 4.00 5.00 9999.00 ▇▁▁▁▁
viaje_ult_mes 0 1.00 26.45 511.85 0.00 0.00 0.00 0.00 9999.00 ▇▁▁▁▁
lat 102 0.95 -3.95 4.91 -73.23 -3.80 -3.51 -3.44 -3.35 ▁▁▁▁▇
long 102 0.95 -33.50 35.41 -73.34 -73.33 -3.51 -3.44 73.23 ▆▁▇▁▁
resultado_micro 32 0.98 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
especie_micro 1006 0.47 0.07 0.36 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
temp_axilar 0 1.00 36.19 0.51 35.50 35.90 36.10 36.50 40.20 ▇▃▁▁▁
hist_fever 0 1.00 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
main_act_ec 0 1.00 7.40 9.11 0.00 5.00 7.00 8.00 88.00 ▇▁▁▁▁
tipo_casa 0 1.00 3.28 0.79 1.00 3.00 3.00 4.00 4.00 ▁▂▁▇▇
animales_casa 0 1.00 0.29 0.46 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
fumigacion 0 1.00 0.48 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
result_pcr 0 1.00 0.09 0.40 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
W16_RAMA 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W02_L02 0 1.00 0.00 0.01 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W58_EBPII 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W50_RBP2b 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W01_MSP119 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W30_MSP8 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W08_L12 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W39_MSP3a 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
RFOREST_MODEL_VOTES 0 1.00 0.74 0.23 0.11 0.56 0.82 0.95 1.00 ▁▂▂▃▇
# Household point locations read from the shapefile.
coord <- st_read(dsn = "./_data/ser_data.shp")
## Reading layer `ser_data' from data source `/Users/gcarrasco/Dropbox/Work/Colabs UPCH/Serology [CAM:Jason]/Analysis/RSCD_JR/_data/ser_data.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1233 features and 2 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -73.34285 ymin: -3.835081 xmax: -72.97889 ymax: -3.354823
## CRS:            4326
# Attach the household geometry/community to every individual in `d1` and
# derive the analysis variables:
#   sero    - 1 when SEROPOSITIVE is "Positive", 0 otherwise
#   fever   - 1 when axillary temperature is >= 37.5, 0 otherwise
#   nm_sex  - relabelled "1_male" / "0_female"
#   area    - community codes below 600 are periurban, the rest rural
#   age_cat - age bands (-Inf,5], (5,15], (15,30], (30,50], (50,Inf]
# NOTE(review): nm_level_study and viaje_ult_mes contain the value 9999 (see
# the skim summary of d1); it looks like a missing-value code and becomes its
# own factor level here - confirm whether it should be recoded to NA.
# NOTE(review): hist_fever is left numeric (0/1) and is therefore summarised
# as a continuous variable by default - confirm this is intended.
d2 <- coord %>%
  distinct(.keep_all = TRUE) %>% # remove duplicated rows of the shapefile table
  inner_join(d1, by = "id_house") %>%
  mutate(sero = ifelse(SEROPOSITIVE == "Positive", 1, 0),
         fever = ifelse(temp_axilar < 37.5, 0, 1),
         nm_sex = ifelse(nm_sex == 1, "1_male", "0_female"),
         # comm is converted via character to recover its numeric code
         area = factor(ifelse(as.numeric(as.character(comm)) < 600,
                              "0_periurban", "1_rural")),
         age_cat = cut(edad, breaks = c(-Inf, 5, 15, 30, 50, Inf))) %>%
  # Treat the coded variables as categorical.
  mutate_at(c("nm_sex", "nm_level_study", "viaje_ult_mes", "resultado_micro",
              "especie_micro", "fever", "area"),
            as.factor)

# Quick overview of the georeferenced analysis table.
skim(d2)
Table 1.1: Data summary
Name d2
Number of rows 1790
Number of columns 36
_______________________
Column type frequency:
character 4
factor 9
numeric 23
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hour_sleep 0 1 7 8 0 41 0
SEROPOSITIVE 0 1 8 8 0 2 0
TREATMENT 0 1 9 12 0 2 0
geometry 0 1 21 39 0 572 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
comm 0 1.00 FALSE 10 502: 273, 904: 270, 503: 262, 501: 250
nm_sex 0 1.00 FALSE 2 0_f: 973, 1_m: 817
nm_level_study 0 1.00 FALSE 12 3: 627, 5: 316, 4: 295, 6: 137
viaje_ult_mes 0 1.00 FALSE 3 0: 1436, 1: 349, 999: 5
resultado_micro 23 0.99 FALSE 2 0: 1729, 1: 38
especie_micro 987 0.45 FALSE 3 0: 768, 2: 27, 1: 8
fever 0 1.00 FALSE 2 0: 1764, 1: 26
area 0 1.00 FALSE 2 1_r: 1005, 0_p: 785
age_cat 0 1.00 FALSE 5 (5,: 568, (30: 388, (50: 357, (15: 299

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id_house 0 1.00 728025.71 199707.77 501001.00 502212.75 902021.00 905002.00 907079.00 ▆▁▁▁▇
id_muestra 0 1.00 5298789.57 4240476.79 501001.00 502200.25 9020646.00 9050038.50 9072251.00 ▆▁▁▁▇
id_study 12 0.99 723740754.26 198798954.37 500100101.00 500220802.00 900201904.50 900411103.75 900707901.00 ▆▁▁▁▇
edad 0 1.00 28.99 22.04 0.00 10.00 24.00 45.00 117.00 ▇▅▃▁▁
lat 3 1.00 -3.95 4.93 -73.23 -3.80 -3.51 -3.44 -3.35 ▁▁▁▁▇
long 3 1.00 -33.75 35.45 -73.34 -73.33 -3.51 -3.44 73.23 ▆▁▇▁▁
temp_axilar 0 1.00 36.18 0.48 35.50 35.90 36.10 36.50 39.30 ▇▆▁▁▁
hist_fever 0 1.00 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
main_act_ec 0 1.00 7.35 9.18 0.00 5.00 7.00 8.00 88.00 ▇▁▁▁▁
tipo_casa 0 1.00 3.29 0.77 1.00 3.00 3.00 4.00 4.00 ▁▂▁▆▇
animales_casa 0 1.00 0.29 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
fumigacion 0 1.00 0.47 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
result_pcr 0 1.00 0.08 0.38 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
W16_RAMA 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W02_L02 0 1.00 0.00 0.01 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W58_EBPII 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W50_RBP2b 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W01_MSP119 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W30_MSP8 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W08_L12 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
W39_MSP3a 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 ▇▁▁▁▁
RFOREST_MODEL_VOTES 0 1.00 0.74 0.23 0.11 0.56 0.82 0.95 1.00 ▁▂▂▃▇
sero 0 1.00 0.49 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇

1.3 Descriptive [Table 1]

# Plain (non-spatial) data frame holding only the variables reported in
# Table 1; the geometry column is removed before selecting.
t1 <- dplyr::select(
  st_set_geometry(d2, NULL),
  area, comm,
  edad, age_cat,
  nm_sex, nm_level_study, viaje_ult_mes,
  resultado_micro, especie_micro,
  fever, temp_axilar, hist_fever,
  SEROPOSITIVE, TREATMENT
)

# Descriptive table of every column of `t1`, split by serostatus (plus an
# overall column). Numeric columns are shown as mean (SD) / median [min, max],
# factors and character columns as counts (%).
library(table1)
table1(~ . | SEROPOSITIVE, data = t1)
Negative
(N=917)
Positive
(N=873)
Overall
(N=1790)
area
0_periurban 481 (52.5%) 304 (34.8%) 785 (43.9%)
1_rural 436 (47.5%) 569 (65.2%) 1005 (56.1%)
comm
501 165 (18.0%) 85 (9.7%) 250 (14.0%)
502 132 (14.4%) 141 (16.2%) 273 (15.3%)
503 184 (20.1%) 78 (8.9%) 262 (14.6%)
901 6 (0.7%) 41 (4.7%) 47 (2.6%)
902 45 (4.9%) 134 (15.3%) 179 (10.0%)
903 18 (2.0%) 40 (4.6%) 58 (3.2%)
904 144 (15.7%) 126 (14.4%) 270 (15.1%)
905 67 (7.3%) 30 (3.4%) 97 (5.4%)
906 84 (9.2%) 82 (9.4%) 166 (9.3%)
907 72 (7.9%) 116 (13.3%) 188 (10.5%)
edad
Mean (SD) 20.6 (18.8) 37.8 (21.8) 29.0 (22.0)
Median [Min, Max] 13.0 [0, 117] 37.0 [1.00, 92.0] 24.0 [0, 117]
age_cat
(-Inf,5] 147 (16.0%) 31 (3.6%) 178 (9.9%)
(5,15] 405 (44.2%) 163 (18.7%) 568 (31.7%)
(15,30] 148 (16.1%) 151 (17.3%) 299 (16.7%)
(30,50] 126 (13.7%) 262 (30.0%) 388 (21.7%)
(50, Inf] 91 (9.9%) 266 (30.5%) 357 (19.9%)
nm_sex
0_female 536 (58.5%) 437 (50.1%) 973 (54.4%)
1_male 381 (41.5%) 436 (49.9%) 817 (45.6%)
nm_level_study
1 67 (7.3%) 69 (7.9%) 136 (7.6%)
2 88 (9.6%) 16 (1.8%) 104 (5.8%)
3 323 (35.2%) 304 (34.8%) 627 (35.0%)
4 90 (9.8%) 205 (23.5%) 295 (16.5%)
5 167 (18.2%) 149 (17.1%) 316 (17.7%)
6 77 (8.4%) 60 (6.9%) 137 (7.7%)
7 12 (1.3%) 4 (0.5%) 16 (0.9%)
8 12 (1.3%) 9 (1.0%) 21 (1.2%)
9 7 (0.8%) 1 (0.1%) 8 (0.4%)
10 9 (1.0%) 6 (0.7%) 15 (0.8%)
11 0 (0%) 1 (0.1%) 1 (0.1%)
9999 65 (7.1%) 49 (5.6%) 114 (6.4%)
viaje_ult_mes
0 794 (86.6%) 642 (73.5%) 1436 (80.2%)
1 120 (13.1%) 229 (26.2%) 349 (19.5%)
9999 3 (0.3%) 2 (0.2%) 5 (0.3%)
resultado_micro
0 894 (97.5%) 835 (95.6%) 1729 (96.6%)
1 12 (1.3%) 26 (3.0%) 38 (2.1%)
Missing 11 (1.2%) 12 (1.4%) 23 (1.3%)
especie_micro
0 472 (51.5%) 296 (33.9%) 768 (42.9%)
1 4 (0.4%) 4 (0.5%) 8 (0.4%)
2 6 (0.7%) 21 (2.4%) 27 (1.5%)
Missing 435 (47.4%) 552 (63.2%) 987 (55.1%)
fever
0 900 (98.1%) 864 (99.0%) 1764 (98.5%)
1 17 (1.9%) 9 (1.0%) 26 (1.5%)
temp_axilar
Mean (SD) 36.2 (0.490) 36.2 (0.471) 36.2 (0.481)
Median [Min, Max] 36.1 [35.5, 39.3] 36.1 [35.5, 39.3] 36.1 [35.5, 39.3]
hist_fever
Mean (SD) 0.105 (0.306) 0.140 (0.347) 0.122 (0.327)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
SEROPOSITIVE
Negative 917 (100%) 0 (0%) 917 (51.2%)
Positive 0 (0%) 873 (100%) 873 (48.8%)
TREATMENT
No treatment 917 (100%) 0 (0%) 917 (51.2%)
Treatment 0 (0%) 873 (100%) 873 (48.8%)
# Same comparison with tableone, which adds p-values across serostatus.
# Variables are listed by name rather than by position in `t1`, so reordering
# or adding columns upstream cannot silently change what is tabulated.
# Everything except `edad` and `temp_axilar` is treated as categorical.
library(tableone)
CreateTableOne(
  vars = c("area", "comm", "edad", "age_cat", "nm_sex", "nm_level_study",
           "viaje_ult_mes", "resultado_micro", "especie_micro", "fever",
           "temp_axilar", "hist_fever"),
  strata = "SEROPOSITIVE",
  data = t1,
  factorVars = c("area", "comm", "age_cat", "nm_sex", "nm_level_study",
                 "viaje_ult_mes", "resultado_micro", "especie_micro", "fever",
                 "hist_fever")
)
##                          Stratified by SEROPOSITIVE
##                           Negative      Positive      p      test
##   n                         917           873                    
##   area = 1_rural (%)        436 (47.5)    569 (65.2)  <0.001     
##   comm (%)                                            <0.001     
##      501                    165 (18.0)     85 ( 9.7)             
##      502                    132 (14.4)    141 (16.2)             
##      503                    184 (20.1)     78 ( 8.9)             
##      901                      6 ( 0.7)     41 ( 4.7)             
##      902                     45 ( 4.9)    134 (15.3)             
##      903                     18 ( 2.0)     40 ( 4.6)             
##      904                    144 (15.7)    126 (14.4)             
##      905                     67 ( 7.3)     30 ( 3.4)             
##      906                     84 ( 9.2)     82 ( 9.4)             
##      907                     72 ( 7.9)    116 (13.3)             
##   edad (mean (SD))        20.64 (18.82) 37.76 (21.78) <0.001     
##   age_cat (%)                                         <0.001     
##      (-Inf,5]               147 (16.0)     31 ( 3.6)             
##      (5,15]                 405 (44.2)    163 (18.7)             
##      (15,30]                148 (16.1)    151 (17.3)             
##      (30,50]                126 (13.7)    262 (30.0)             
##      (50, Inf]               91 ( 9.9)    266 (30.5)             
##   nm_sex = 1_male (%)       381 (41.5)    436 (49.9)  <0.001     
##   nm_level_study (%)                                  <0.001     
##      1                       67 ( 7.3)     69 ( 7.9)             
##      2                       88 ( 9.6)     16 ( 1.8)             
##      3                      323 (35.2)    304 (34.8)             
##      4                       90 ( 9.8)    205 (23.5)             
##      5                      167 (18.2)    149 (17.1)             
##      6                       77 ( 8.4)     60 ( 6.9)             
##      7                       12 ( 1.3)      4 ( 0.5)             
##      8                       12 ( 1.3)      9 ( 1.0)             
##      9                        7 ( 0.8)      1 ( 0.1)             
##      10                       9 ( 1.0)      6 ( 0.7)             
##      11                       0 ( 0.0)      1 ( 0.1)             
##      9999                    65 ( 7.1)     49 ( 5.6)             
##   viaje_ult_mes (%)                                   <0.001     
##      0                      794 (86.6)    642 (73.5)             
##      1                      120 (13.1)    229 (26.2)             
##      9999                     3 ( 0.3)      2 ( 0.2)             
##   resultado_micro = 1 (%)    12 ( 1.3)     26 ( 3.0)   0.022     
##   especie_micro (%)                                   <0.001     
##      0                      472 (97.9)    296 (92.2)             
##      1                        4 ( 0.8)      4 ( 1.2)             
##      2                        6 ( 1.2)     21 ( 6.5)             
##   fever = 1 (%)              17 ( 1.9)      9 ( 1.0)   0.209     
##   temp_axilar (mean (SD)) 36.19 (0.49)  36.17 (0.47)   0.358     
##   hist_fever = 1 (%)         96 (10.5)    122 (14.0)   0.028