5.4 Lab: How randomization induces independence

We discussed before that randomization induces that treated and control groups will be identical in all respects, observable and unobservable on average (long run). Remember that we had a population of 2000 units and our sample included 200 units. Let’s sample again…

sample <- sample_n(population, 200) # create new object containing sample
sample <- sample %>% arrange(age) # sort according to age
sample$treatment <- simple_ra(N = nrow(sample)) # random assignemnt
head(sample)

id	firstname	lastname	age	email	treatment
35005	Christian	Uyeda	18	Christian.Uyeda@example8.org	0
16756	Jaspreet	Roberts	18	Jaspreet.Roberts@example2.com	0
63706	Samual	Munoz	18	Samual.Munoz@example9.es	1
5770	Brandon	Quintana-Garcia	19	Brandon.Quintana-Garcia@example1.de	1
40546	Desmond	Smith	20	Desmond.Smith@example6.com	1
805	Suthdara	Thorpe	20	Suthdara.Thorpe@example7.es	1

The average age in the population is 51.58 and 50.6 in the sample. Q: Why is there a difference?

Subsequently, we’ll test whether we can randomly assign units in our sample to two groups, and whether the average age in the two groups is equal. We learned that this should be the case.

sample %>% group_by(treatment) %>% dplyr::summarise(mean_age = mean(age))

treatment	mean_age
0	47.96386
1	52.47009

That looks quite good.. so basically, if we are able to randomize, we can actually induce independence between our treatment and any other (observed and unobserved) variables that may be related to the treatment (and the outcome) in the real world. Accordingly, they hardly correlate at 0.11 (R code: round(cor(sample$age, sample$treatment),2)).

But as we also discussed, there is a long run aspect. Q: What was that about? (think of both randomization and sampling)

Let’s do the randomization for some smaller subsets of our sample and see whether treated and control groups balance out regarding age.

# First with 5 units
sample_5 <- sample_n(sample, 5) %>% dplyr::select(-email)
sample_5$treatment <- simple_ra(N = nrow(sample_5))
sample_5

id	firstname	lastname	age	treatment
25563	Beni	Castillo	67	0
93469	Marceen	el-Fadel	77	0
48177	Isis	Leong	40	0
87522	Zubaida	Williams	46	1
24943	Diksha	Maldonado Ornelas	84	1

sample_5 %>% group_by(treatment) %>% summarise(mean_age = mean(age))

treatment	mean_age
0	61.33333
1	65.00000

# Then with 10 units
sample_10 <- sample_n(sample, 10) %>% dplyr::select(-email)
sample_10$treatment <- simple_ra(N = nrow(sample_10))
sample_10

id	firstname	lastname	age	treatment
52281	Anthony	Lujan	71	0
94430	Matthew	Pinon	21	1
40416	Michael	el-Younis	31	0
28764	Karleina	Harris	73	1
58478	Daniel	Kang	47	0
56628	Jason	Blackwater	81	1
84273	Tyme	Frahmann	48	1
18838	Nyeasha	Ostdiek	20	0
36332	Dominick	Richardson	37	0
95752	Alisa	Willow	69	1

sample_10 %>% group_by(treatment) %>% summarise(mean_age = mean(age))

treatment	mean_age
0	41.2
1	58.4

# Then with 50 units
sample_100 <- sample_n(sample, 100) %>% dplyr::select(-email)
sample_100$treatment <- simple_ra(N = nrow(sample_100))
head(sample_100)

id	firstname	lastname	age	treatment
81356	Hameeda	Dunn	49	0
84273	Tyme	Frahmann	48	1
38127	Christopher	Velie	29	1
35506	Kaatima	Panyavong	69	1
29358	Angelica	al-Allee	37	0
11597	Rachel	Molinar Rodriguez	45	1

sample_100 %>% group_by(treatment) %>% summarise(mean_age = mean(age))

treatment	mean_age
0	49.89583
1	54.17308

Ok, basically the examples above show that there is a certain long run effect in that we need to do the randomization with a large enough sample so that we really get get treatment and control group(s) that are balanced.

Q: What if we want to achieve balance on many variables that have many values (high-dimensionality). Is that more difficult to achieve?

5.4.1 Long-run randomization & balance (not finished)

We can also simulate when the randomization really takes effect in terms of balance. Below we first start with an example in which we repeatedly rerandomize followed by an example in which we randomize once. Both times we plot the difference in the average of a covarate (age) between treatment and control.

# Rerandomization all the time
  sample <- sample_n(population, 1000) # Take sample from population
  mean_age_control_all <- NULL # Object to store simulations
  mean_age_treatment_all <- NULL # Object to store simulations
  mean_age_difference_all <- NULL # Object to store simulations
  sample_size <- seq(8,200,1) # Define vector of sample sizes
  
for(i in 1:length(sample_size)){
  #print(i)
  sample_size.i <- sample_size[i]
  sample2 <- sample_n(sample, sample_size.i) %>% 
                      dplyr::select(-email) # Take sample from sample
  sample2$treatment <- simple_ra(N = nrow(sample2)) # Create treatment
  sample2 <- sample2 %>% 
             group_by(treatment) %>% 
             summarise(mean_age = mean(age)) # Calculate averages
  
  # Store averages
  mean_age_control <- sample2$mean_age[sample2$treatment==0]
  mean_age_treatment <-  sample2$mean_age[sample2$treatment==1]
  mean_age_difference <- mean_age_treatment- mean_age_control
  
  # Append to other simulations
  mean_age_control_all <- c(mean_age_control_all, mean_age_control)
  mean_age_treatment_all <- c(mean_age_treatment_all, mean_age_treatment)
  mean_age_difference_all <- c(mean_age_difference_all, mean_age_difference)
}

  data_sim <- data.frame(sample_size = sample_size,
                    mean_age_control = mean_age_control_all,
                    mean_age_treatment = mean_age_treatment_all,
                    mean_age_difference = mean_age_difference_all)
  
  plot_ly(data_sim, 
          x = ~sample_size, 
          y = ~mean_age_difference, 
          type = 'scatter', 
          mode = 'lines',
          name = "Mean difference") %>%
    add_trace(x = ~sample_size, y = ~mean_age_treatment, color ="red",
              name = "Mean in treatment") %>%
    add_trace(x = ~sample_size, y = ~mean_age_control, color ="green",
              name = "Mean in control") %>%
    layout(xaxis = list(title = "Sample sizes (simulated, rerandomization)"),
           yaxis = list(title = "Age: Means (treatment/control) and mean difference"))

# One-time randomization
  sample <- sample_n(population, 400)
  sample <- sample %>% dplyr::select(-email)
  sample$treatment <- simple_ra(N = nrow(sample))
  # Reshuffle rows
  sample <- sample_n(sample, 400)
  
  mean_age_control_all <- NULL
  mean_age_treatment_all <- NULL
  mean_age_difference_all <- NULL  
  subset <- seq(8,370,1)  # Define vector of sample sizes
  
  for(i in 1:length(subset)){
  #print(i)
  subset.i <- subset[i]
  sample2 <- sample[1:subset.i,] # no resampling, simply subsetting
  sample2 <- sample2 %>% group_by(treatment) %>% summarise(mean_age = mean(age))
  mean_age_control <- sample2$mean_age[sample2$treatment==0]
  mean_age_treatment <-  sample2$mean_age[sample2$treatment==1]
  mean_age_difference <- mean_age_treatment- mean_age_control
  
  mean_age_control_all <- c(mean_age_control_all, mean_age_control)
  mean_age_treatment_all <- c(mean_age_treatment_all, mean_age_treatment)
  mean_age_difference_all <- c(mean_age_difference_all, mean_age_difference)
}

  data_sim <- data.frame(sample_size = subset,
                    mean_age_control = mean_age_control_all,
                    mean_age_treatment = mean_age_treatment_all,
                    mean_age_difference = mean_age_difference_all)
  
  plot_ly(data_sim, x = ~sample_size, y = ~mean_age_difference, type = 'scatter', mode = 'lines',
          name = "Mean difference") %>%
    add_trace(x = ~sample_size, y = ~mean_age_treatment, color ="red",
              name = "Mean in treatment") %>%
    add_trace(x = ~sample_size, y = ~mean_age_control, color ="green",
              name = "Mean in control") %>%
    layout(xaxis = list(title = "Sample sizes (simulated, one-time randomization)"),
           yaxis = list(title = "Age: Means (treatment/control) and mean difference"))