Lab: List column workflow

Learning outcomes/objective

Understand logic underlying list column workflow in R and learn to use the corresponding functions

1 Package and functions

All the functions we use are contained in packages tied to the tidyverse (library(tidyverse)).

nest(): Create a list column
map(): Work with list columns
nest() and map_*(): Simplify the list columns

2 Load data and packages

# Load packages
  # install.packages("pacman")
  pacman::p_load(tidyverse, 
                 broom, 
                 tidymodels)

# Load data
  data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                           "1I0eVUFyw0yn9T5roxzr67hPEl_5ZClrL"))

# Explore the data
  table(data$education)


   0    1    2    3    4    5    6    7    8    9   10 
 380  806  194   89 2182  324  687  474  195  425  877

  table(data$victim)


   0    1 
5966  667

  table(data$trust)


   0    1    2    3    4    5    6    7    8    9   10 
 303   42  172  270  368 1281  852 1342 1294  353  356

  head(data) # Show first 6 rows

# A tibble: 6 x 5
  idpers trust victim education Name    
   <dbl> <dbl>  <dbl>     <dbl> <chr>   
1      1     4      0         8 Danika  
2      2     5      1         1 Imani   
3      3     0      0         0 Billy   
4      6     5      0         9 Benjamin
5     12     7      0         4 Austin  
6     19     5      0         1 Georgina

3 Explore list column workflow

# Nesting your data ####

  # Prepare the nested data frame data_nested
  data_nested <- data %>% 
    group_by(victim) %>% # ONLY 2 groups
    nest() %>%
    ungroup()

# Explore data_nested
  head(data_nested)

# A tibble: 2 x 2
  victim data                
   <dbl> <list>              
1      0 <tibble [5,966 x 4]>
2      1 <tibble [667 x 4]>

# Unnesting your data ####

  # Create the unnested data frame called gap_unnnested
  data_unnested <- data_nested %>% 
    unnest(data)
    
  # Confirm that your data was not modified  
  identical(data, data_unnested) # Why false?

[1] FALSE

# Explore a nested cell ####

  # Extract the data of non_victims
  non_victims_df <- data_nested$data[[1]]
  
  # Calculate the minimum of the education vector
  min(non_victims_df$education)

[1] 0

  # Calculate the mean of the education vector
  mean(non_victims_df$education)

[1] 5.085987

# Mapping your data ####
  # map() function
  # Takes vector or list - applies f to every element an returns list

  # Calculate the mean education for each group
  education_nested <- data_nested %>%
    mutate(mean_education = map(.x = data, ~mean(.x$education)))
  
  # Take a look at education_nested
  education_nested

# A tibble: 2 x 3
  victim data                 mean_education
   <dbl> <list>               <list>        
1      0 <tibble [5,966 x 4]> <dbl [1]>     
2      1 <tibble [667 x 4]>   <dbl [1]>

  # Extract the mean_education value by using unnest
  education_mean <- education_nested %>% 
    unnest(mean_education)
  
  # Take a look at education_mean
  education_mean

# A tibble: 2 x 3
  victim data                 mean_education
   <dbl> <list>                        <dbl>
1      0 <tibble [5,966 x 4]>           5.09
2      1 <tibble [667 x 4]>             4.59

# Expecting mapped output ####
  # Calculate mean education and store result as a double with map_dbl()
  education_mean <- data_nested %>%
    mutate(mean_education = map_dbl(.x = data, ~mean(.x$education)))
  
  # Take a look at education_mean
  education_mean

# A tibble: 2 x 3
  victim data                 mean_education
   <dbl> <list>                        <dbl>
1      0 <tibble [5,966 x 4]>           5.09
2      1 <tibble [667 x 4]>             4.59

# Mapping many models ####
  
  # Build a linear model for each group
  data_models <- data_nested %>%
      mutate(model = map(.x = data, ~lm(formula = trust ~ education, data = .x)))
  data_models

# A tibble: 2 x 3
  victim data                 model 
   <dbl> <list>               <list>
1      0 <tibble [5,966 x 4]> <lm>  
2      1 <tibble [667 x 4]>   <lm>

  # Extract the model for non_victims    
  non_victims_model <- data_models$model[[1]]
  non_victims_model


Call:
lm(formula = trust ~ education, data = .x)

Coefficients:
(Intercept)    education  
     5.5643       0.1258

# Tidy up the coefficients of your nested models ####

  # Extract the coefficient statistics of each model into nested data frames
  data_model_coefs <- data_models %>% 
      mutate(coef = map(model, ~tidy(.x, conf.int = TRUE)))
  data_model_coefs

# A tibble: 2 x 4
  victim data                 model  coef            
   <dbl> <list>               <list> <list>          
1      0 <tibble [5,966 x 4]> <lm>   <tibble [2 x 7]>
2      1 <tibble [667 x 4]>   <lm>   <tibble [2 x 7]>

  # Simplify the coef data frames for each model    
  data_model_coefs <- data_model_coefs %>%
      unnest(coef)
  data_model_coefs

# A tibble: 4 x 10
  victim data     model  term  estim~1 std.e~2 stati~3   p.value conf.~4 conf.~5
   <dbl> <list>   <list> <chr>   <dbl>   <dbl>   <dbl>     <dbl>   <dbl>   <dbl>
1      0 <tibble> <lm>   (Int~   5.56  0.0574    97.0  0          5.45     5.68 
2      0 <tibble> <lm>   educ~   0.126 0.00974   12.9  1.09e- 37  0.107    0.145
3      1 <tibble> <lm>   (Int~   4.85  0.167     29.0  2.04e-120  4.52     5.18 
4      1 <tibble> <lm>   educ~   0.136 0.0299     4.55 6.24e-  6  0.0775   0.195
# ... with abbreviated variable names 1: estimate, 2: std.error, 3: statistic,
#   4: conf.low, 5: conf.high

# Glance at the fit of your models ####
  
  # Extract the fit statistics of each model into data frames
  data_model_stats <- data_models %>% 
      mutate(fit = map(model, ~glance(.x)))
  data_model_stats

# A tibble: 2 x 4
  victim data                 model  fit              
   <dbl> <list>               <list> <list>           
1      0 <tibble [5,966 x 4]> <lm>   <tibble [1 x 12]>
2      1 <tibble [667 x 4]>   <lm>   <tibble [1 x 12]>

  # Simplify the fit data frames for each model    
  data_model_stats <- data_model_stats %>% 
      unnest(fit)
  data_model_stats

# A tibble: 2 x 15
  victim data     model  r.squared adj.r.~1 sigma stati~2  p.value    df  logLik
   <dbl> <list>   <list>     <dbl>    <dbl> <dbl>   <dbl>    <dbl> <dbl>   <dbl>
1      0 <tibble> <lm>      0.0272   0.0271  2.24   167.  1.09e-37     1 -13271.
2      1 <tibble> <lm>      0.0303   0.0288  2.46    20.7 6.24e- 6     1  -1545.
# ... with 5 more variables: AIC <dbl>, BIC <dbl>, deviance <dbl>,
#   df.residual <int>, nobs <int>, and abbreviated variable names
#   1: adj.r.squared, 2: statistic

# Augment the fitted values of each of the two models ####

  data_models_augmented <- data_models %>% 
    # Build the augmented data frame for each group model
    mutate(augmented = map(.x = model, ~augment(.x))) %>% 
    # Expand the augmented data frames
   unnest(augmented)

4 HOMEWORK: List Column Workflow

Next week we will work with the dataset underlying Dressel and Farid (2018). The homework serves as a first exposure to that dataset. Use the code under # 1. below to load the dataset.
Explore the dataset with nrow(), ncol() and str(). How many variables are there and which are the most interesting ones?
We will use the following variables:
- is_recid: Outcome recidivism $y$ (0,1,0,0,1,1,...), i.e., wether someone reoffends
- Various predictors $x^{'} s$
  - age: age in years
  - priors_count: Number of prior offenses
  - race = Ethnic background
- Use the code below under # 3. to subset the data and explore those variables using the summary function.
Nest the dataset according to the variable race and use the map() function to add the mean of age, priors_count and is_recid to the dataset. What difference does it make when you use map_dbl() or map()?
Can you think of a way of getting the same result using simple dplyr functions?

library(tidyverse)

# Load the dataset
  data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                           "1UjSP7qYNLVO85isGZaeq0PRIOAX_CJho"))

  
  
  
# 2. 
  

  
  
# 3.
# Keep the variable below
data <- data %>% select(is_recid, age, priors_count, race)





# 4.





# 5.

5 All the code

# Load packages
  # install.packages("pacman")
  pacman::p_load(tidyverse, 
                 broom, 
                 tidymodels)

# Load data
  data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                           "1I0eVUFyw0yn9T5roxzr67hPEl_5ZClrL"))

# Explore the data
  table(data$education)
  table(data$victim)
  table(data$trust)
  head(data) # Show first 6 rows


# Nesting your data ####

  # Prepare the nested data frame data_nested
  data_nested <- data %>% 
    group_by(victim) %>% # ONLY 2 groups
    nest() %>%
    ungroup()

# Explore data_nested
  head(data_nested)


  
# Unnesting your data ####

  # Create the unnested data frame called gap_unnnested
  data_unnested <- data_nested %>% 
    unnest(data)
    
  # Confirm that your data was not modified  
  identical(data, data_unnested) # Why false?


# Explore a nested cell ####

  # Extract the data of non_victims
  non_victims_df <- data_nested$data[[1]]
  
  # Calculate the minimum of the education vector
  min(non_victims_df$education)

  # Calculate the mean of the education vector
  mean(non_victims_df$education)



# Mapping your data ####
  # map() function
  # Takes vector or list - applies f to every element an returns list

  # Calculate the mean education for each group
  education_nested <- data_nested %>%
    mutate(mean_education = map(.x = data, ~mean(.x$education)))
  
  # Take a look at education_nested
  education_nested
  
  # Extract the mean_education value by using unnest
  education_mean <- education_nested %>% 
    unnest(mean_education)
  
  # Take a look at education_mean
  education_mean



# Expecting mapped output ####
  # Calculate mean education and store result as a double with map_dbl()
  education_mean <- data_nested %>%
    mutate(mean_education = map_dbl(.x = data, ~mean(.x$education)))
  
  # Take a look at education_mean
  education_mean
  

  
# Mapping many models ####
  
  # Build a linear model for each group
  data_models <- data_nested %>%
      mutate(model = map(.x = data, ~lm(formula = trust ~ education, data = .x)))
  data_models
      
  # Extract the model for non_victims    
  non_victims_model <- data_models$model[[1]]
  non_victims_model
  

# Tidy up the coefficients of your nested models ####

  # Extract the coefficient statistics of each model into nested data frames
  data_model_coefs <- data_models %>% 
      mutate(coef = map(model, ~tidy(.x, conf.int = TRUE)))
  data_model_coefs
      
  # Simplify the coef data frames for each model    
  data_model_coefs <- data_model_coefs %>%
      unnest(coef)
  data_model_coefs


# Glance at the fit of your models ####
  
  # Extract the fit statistics of each model into data frames
  data_model_stats <- data_models %>% 
      mutate(fit = map(model, ~glance(.x)))
  data_model_stats
  
  
  # Simplify the fit data frames for each model    
  data_model_stats <- data_model_stats %>% 
      unnest(fit)
  data_model_stats    





# Augment the fitted values of each of the two models ####

  data_models_augmented <- data_models %>% 
    # Build the augmented data frame for each group model
    mutate(augmented = map(.x = model, ~augment(.x))) %>% 
    # Expand the augmented data frames
   unnest(augmented)
library(tidyverse)

# Load the dataset
  data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                           "1UjSP7qYNLVO85isGZaeq0PRIOAX_CJho"))

  
  
  
# 2. 
  

  
  
# 3.
# Keep the variable below
data <- data %>% select(is_recid, age, priors_count, race)





# 4.





# 5.

References

Dressel, Julia, and Hany Farid. 2018. “The Accuracy, Fairness, and Limits of Predicting Recidivism.” Sci Adv 4 (1): eaao5580.