A g-computation [Validation Overall]

A.1 Parametric g-formula

Based on Hernan and Robins, 2020 Chapter 13

Zou’s modified Poisson regression code

A.2 NEW FUNCTIONS

# Helpers g-computation

# standardization =============================================================

standardization <- function(data, indices, treatment, outcome, formula1) {
  
  treatment1 <- enquo(treatment)
  outcome1 <- enquo(outcome)
  
  # create a dataset with 3 copies of each subject
  # 1st copy: equal to original one`
  d <- data[indices, ] 
  d <- d %>% mutate(interv = -1,
                    seqno = 1:n()) # for gee estimation
  # 2nd copy: treatment set to 0, outcome to missing
  d0 <- d %>% mutate(interv = 0,
                      !!treatment1 := first(levels(droplevels(!!treatment1))),
                      !!outcome1 := NA)
  # 3rd copy: treatment set to 1, outcome to missing
  d1 <- d %>% mutate(interv = 1,
                      !!treatment1 := last(levels(droplevels(!!treatment1))),
                      !!outcome1 := NA)
  d.onesample <- rbind(d, d0, d1) # combining datasets
  
  if (class(unlist(select(data, !!outcome1)))=="factor") {
    d.onesample <- d.onesample %>%
      mutate(!!outcome1 := as.numeric(!!outcome1)-1)
  }
  
  # linear model to estimate mean outcome conditional on treatment and confounders
  # parameters are estimated using original observations only (interv= -1)
  # parameter estimates are used to predict mean outcome for observations with set
  # treatment (interv=0 and interv=1)
  # fit <- glm(formula = formula1 ,data = d.onesample, family = "binomial")
  # d.onesample$predicted_meanY <- predict(fit, d.onesample, type="response")
  
  # Zou, 2004
  require(geepack)
  fit <- geeglm(formula = formula1,
                data    = d.onesample,
                family  = poisson(link = "log"),
                id      = seqno,
                corstr  = "exchangeable")

  d.onesample$predicted_meanY <- predict(fit, d.onesample, type="response")
  
  # estimate mean outcome in each of the groups interv=-1, interv=0, and interv=1
  M.ob <- mean(d.onesample$predicted_meanY[d.onesample$interv == -1], na.rm=T)
  M.t0 <- mean(d.onesample$predicted_meanY[d.onesample$interv == 0], na.rm=T)
  M.t1 <- mean(d.onesample$predicted_meanY[d.onesample$interv == 1], na.rm=T)
  M.t10 <- M.t1 - M.t0
  
  return(c(M.ob, M.t0, M.t1, M.t10))
}


# plot_eff =============================================================

plot_eff <- function(dat1, dat2, dat3, coefs, legend.title = "Total Effect", model.names = c("Overall", "Rural", "Periurban"), size = 1, base_size = 13, xlims = c(-.3,1.3)) {
  library(ggsci)
  
  dat1$coeftable %>% as_tibble(rownames = "term") %>% mutate(model = model.names[1]) %>%
    rbind(dat2$coeftable %>% as_tibble(rownames = "term") %>% mutate(model = model.names[2])) %>%
    rbind(dat3$coeftable %>% as_tibble(rownames = "term") %>% mutate(model = model.names[3])) %>%
    clean_names() %>%
    filter(term %in% coefs) %>%
    ggplot(aes(y = model, x = est, col = model)) +
    geom_pointrange(aes(xmin = x2_5_percent, xmax = x97_5_percent), size=size) +
    geom_vline(xintercept = 0, linetype = "dashed") +
    scale_color_npg(limits = model.names) +
    scale_y_discrete(limits = rev(model.names)) +
    scale_x_continuous(limits = xlims) +
    theme_bw(base_size = base_size) +
    guides(color = guide_legend(title.position = "top", 
                                title.hjust = 0)) +
    labs(y = "", x = "Estimate", col = legend.title) +
    theme(legend.position = "top")
}

# standardization_sim =============================================================

standardization_sim <- function(data, indices, treatment, outcome, formula1, p_out) {
  
  treatment1 <- enquo(treatment)
  outcome1 <- enquo(outcome)
  
  # create a dataset with 3 copies of each subject
  # 1st copy: equal to original one`
  d <- data[indices, ] 
  d <- d %>% mutate(interv = -1,
                    seqno = 1:n()) # for gee estimation
  # 2nd copy: treatment set to 0, outcome to missing
  d0 <- d %>% mutate(interv = 0,
                     !!treatment1 := first(levels(droplevels(!!treatment1))),
                     !!outcome1 := NA)
  # 3rd copy: treatment set to 1, outcome to missing
  d1 <- d %>% mutate(interv = 1,
                     !!treatment1 := last(levels(droplevels(!!treatment1))),
                     !!outcome1 := NA)
  # 4th copy: treatment set to simulated prop, outcome to missing
  d2 <- d %>% mutate(interv = 2,
                     !!treatment1 := rbinom(n=nrow(d), size=1, prob=p_out),
                     !!treatment1 := ifelse(!!treatment1 == 1, 
                                            "Outside", "Inside"),
                     !!outcome1 := NA)
  d.onesample <- rbind(d, d0, d1, d2) # combining datasets
  
  if (class(unlist(select(data, !!outcome1)))=="factor") {
    d.onesample <- d.onesample %>%
      mutate(!!outcome1 := as.numeric(!!outcome1)-1)
  }
  
  # linear model to estimate mean outcome conditional on treatment and confounders
  # parameters are estimated using original observations only (interv= -1)
  # parameter estimates are used to predict mean outcome for observations with set
  # treatment (interv=0 and interv=1)
  # fit <- glm(formula = formula1 ,data = d.onesample, family = "binomial")
  # d.onesample$predicted_meanY <- predict(fit, d.onesample, type="response")
  
  # Zou, 2004
  require(geepack)
  fit <- geeglm(formula = formula1,
                data    = d.onesample,
                family  = poisson(link = "log"),
                id      = seqno,
                corstr  = "exchangeable")
  
  d.onesample$predicted_meanY <- predict(fit, d.onesample, type="response")
  
  # estimate mean outcome in each of the groups interv=-1, interv=0, and interv=1
  M.ob <- mean(d.onesample$predicted_meanY[d.onesample$interv == -1], na.rm=T)
  M.t0 <- mean(d.onesample$predicted_meanY[d.onesample$interv == 0], na.rm=T)
  M.t1 <- mean(d.onesample$predicted_meanY[d.onesample$interv == 1], na.rm=T)
  M.ts <- mean(d.onesample$predicted_meanY[d.onesample$interv == 2], na.rm=T)
  M.t10 <- M.t1 - M.t0
  M.t1s <- M.t1 - M.ts
  M.ts0 <- M.ts - M.t0
  M.tsob <- M.ts - M.ob
  
  return(c(M.ob, M.t0, M.t1, M.t10, M.ts, M.t1s, M.ts0, M.tsob))
}

# tab_sim =============================================================

tab_sim <- function(results) {
  se <- c(sd(results$t[, 1]),
          sd(results$t[, 2]),
          sd(results$t[, 3]),
          sd(results$t[, 4]),
          sd(results$t[, 5]),
          sd(results$t[, 6]),
          sd(results$t[, 7]),
          sd(results$t[, 8]))
  mean <- results$t0
  ll <- mean - qnorm(0.975) * se
  ul <- mean + qnorm(0.975) * se
  
  # OBS = observed, NE = No Exposed, FE = Exposed, SE = Simulated Exposure
  
  bootstrap <-data.frame(cbind(type=c("OBS", 
                                      "NE", 
                                      "FE", 
                                      "FE - NE",
                                      "SE", 
                                      "FE - SE",
                                      "SE - NE",
                                      "SE - OBS"),
                               mean, se, ll, ul)) %>%
    mutate_at(.vars = c(2:5), ~as.numeric(as.character(.)))
  
  return(bootstrap)
  
}

# standardization_bin =============================================================

standardization_bin <- function(data, indices, treatment, outcome, formula1, var_sim, cat_sim) {
  
  treatment1 <- enquo(treatment)
  outcome1 <- enquo(outcome)
  var_sim1 <- enquo(var_sim)
  
  # create a dataset with 3 copies of each subject
  # 1st copy: equal to original one`
  d <- data[indices, ] 
  d <- d %>% mutate(interv = -1,
                    seqno = 1:n(), # for gee estimation
                    cat = !!var_sim1)
  # # 2nd copy: treatment set to 0, outcome to missing
  # d0 <- d %>% mutate(interv = 0,
  #                    !!treatment1 := first(levels(droplevels(!!treatment1))),
  #                    !!outcome1 := NA)
  # # 3rd copy: treatment set to 1, outcome to missing
  # d1 <- d %>% mutate(interv = 1,
  #                    !!treatment1 := last(levels(droplevels(!!treatment1))),
  #                    !!outcome1 := NA)
  # 4th copy: treatment set to 0 for cat_sim, outcome to missing
  d2 <- d %>% mutate(interv = 2,
                     !!treatment1 := as.character(!!treatment1),
                     !!treatment1 := ifelse(cat == cat_sim, 
                                            "Inside", !!treatment1),
                     !!treatment1 := as.factor(!!treatment1),
                     !!outcome1 := NA)
  # 5th copy: treatment set to 1 for cat_sim, outcome to missing
  d3 <- d %>% mutate(interv = 3,
                     !!treatment1 := as.character(!!treatment1),
                     !!treatment1 := ifelse(cat != cat_sim, 
                                            "Outside", !!treatment1),
                     !!treatment1 := as.factor(!!treatment1),
                     !!outcome1 := NA)
  d.onesample <- rbind(d, d2, d3) # combining datasets
  
  if (class(unlist(select(data, !!outcome1)))=="factor") {
    d.onesample <- d.onesample %>%
      mutate(!!outcome1 := as.numeric(!!outcome1)-1)
  }
  
  # linear model to estimate mean outcome conditional on treatment and confounders
  # parameters are estimated using original observations only (interv= -1)
  # parameter estimates are used to predict mean outcome for observations with set
  # treatment (interv=0 and interv=1)
  # fit <- glm(formula = formula1 ,data = d.onesample, family = "binomial")
  # d.onesample$predicted_meanY <- predict(fit, d.onesample, type="response")
  
  # Zou, 2004
  require(geepack)
  fit <- geeglm(formula = formula1,
                data    = d.onesample,
                family  = poisson(link = "log"),
                id      = seqno,
                corstr  = "exchangeable")
  
  d.onesample$predicted_meanY <- predict(fit, d.onesample, type="response")
  
  # estimate mean outcome in each of the groups interv=-1, interv=0, and interv=1
  M.ob <- mean(d.onesample$predicted_meanY[d.onesample$interv == -1], na.rm=T)
  M.tc0 <- mean(d.onesample$predicted_meanY[d.onesample$interv == 2], na.rm=T)
  M.tc1 <- mean(d.onesample$predicted_meanY[d.onesample$interv == 3], na.rm=T)
  M.tc10 <- M.tc1 - M.tc0
  M.tc1ob <- M.tc1 - M.ob
  M.tc0ob <- M.tc0 - M.ob
  
  return(c(M.ob, M.tc0, M.tc1, M.tc10, M.tc1ob, M.tc0ob))
}

# tab_bin =============================================================

tab_bin <- function(results, cat_sim) {
  se <- c(sd(results$t[, 1]),
          sd(results$t[, 2]),
          sd(results$t[, 3]),
          sd(results$t[, 4]),
          sd(results$t[, 5]),
          sd(results$t[, 6]))
  mean <- results$t0
  ll <- mean - qnorm(0.975) * se
  ul <- mean + qnorm(0.975) * se
  
  # OBS = observed, NE = No Exposed, FE = Exposed
  
  bootstrap <-data.frame(cbind(type=c("OBS", 
                                      "NE", 
                                      "FE", 
                                      "FE - NE",
                                      "FE - OBS",
                                      "NE - OBS"),
                               mean, se, ll, ul)) %>%
    mutate_at(.vars = c(2:5), ~as.numeric(as.character(.)))
  
  return(bootstrap)
  
}

A.2.1 ESTIMATION

f <- SEROPOSITIVE ~ work_out + edad + nm_sex

library(boot)

# bootstrap
results <- boot(data = dat,
                statistic = standardization, 
                treatment = work_out, 
                outcome = SEROPOSITIVE, 
                formula1 = f, #parameters from function
                R = 5)

# generating confidence intervals
se <- c(sd(results$t[, 1]),
        sd(results$t[, 2]),
        sd(results$t[, 3]),
        sd(results$t[, 4]))
mean <- results$t0
ll <- mean - qnorm(0.975) * se
ul <- mean + qnorm(0.975) * se

bootstrap <-data.frame(cbind(type=c("Observed", 
                                    "No Treatment", 
                                    "Treatment", 
                                    "Treatment - No Treatment"),
                             mean, se, ll, ul)) %>%
  mutate_at(.vars = c(2:5), ~as.numeric(as.character(.)))

kable(bootstrap, digits = 5)
type mean se ll ul
Observed 0.48771 0.01295 0.46233 0.51309
No Treatment 0.41185 0.01626 0.37998 0.44371
Treatment 0.62928 0.01428 0.60130 0.65727
Treatment - No Treatment 0.21744 0.01519 0.18766 0.24722

A.3 Benchmark [RISCA package]

To compute Marginal effects of the treatment (ATE) based on RISCA package

A.3.1 Using logistic regression as the Q-model

library(RISCA)

dat_gf <- dat %>%
  dplyr::select(travel_1m, work_out, edad, SEROPOSITIVE, nm_sex) %>%
  mutate(id = seq(1:n()),
         time = 0,
         travel_1m = as.integer(as.numeric(travel_1m)-1),
         work_out = as.numeric(work_out)-1, 
         SEROPOSITIVE = as.numeric(SEROPOSITIVE)-1,
         edad = as.numeric(edad)) %>%
  filter(complete.cases(.)) %>%
  as.data.frame()

glm.multi <- glm(formula = f, data=dat_gf, 
                 family = binomial(link=logit))

gc.ate1 <- gc.logistic(glm.obj=glm.multi, data=dat_gf, 
                       group = "work_out", effect="ATE",
                       var.method="bootstrap", iterations=1000,
                       n.cluster=1)

gc.ate1.dat <-data.frame(cbind(type=c("No Treatment", 
                                      "Treatment", 
                                      "Treatment - No Treatment"),
                             bind_rows(gc.ate1$p0, gc.ate1$p1,
                                       gc.ate1$delta)))

kable(gc.ate1.dat, digits = 5)
type estimate ci.lower ci.upper std.error
2.5%…1 No Treatment 0.42545 0.39665 0.45410 NA
2.5%…2 Treatment 0.65798 0.60958 0.70779 NA
2.5%…3 Treatment - No Treatment 0.23252 0.17215 0.28979 0.03071