# Load packages
# pacman::p_load() attaches the listed packages (installing any that are
# missing), so pacman itself must be installed once beforehand:
# install.packages("pacman")
pacman::p_load(tidyverse,
               broom,
               tidymodels)

# Load data
# The Google Drive file id is inserted into the download URL via sprintf().
data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                         "1I0eVUFyw0yn9T5roxzr67hPEl_5ZClrL"))
Lab: List column workflow
Learning outcomes/objective
- Understand logic underlying list column workflow in R and learn to use the corresponding functions
1 Package and functions
All the functions we use are contained in packages tied to the tidyverse (`library(tidyverse)`).
- `nest()`: Create a list column
- `map()`: Work with list columns
- `unnest()` and `map_*()`: Simplify the list columns
2 Load data and packages
# Explore the data
table(data$education)
0 1 2 3 4 5 6 7 8 9 10
380 806 194 89 2182 324 687 474 195 425 877
table(data$victim)
0 1
5966 667
table(data$trust)
0 1 2 3 4 5 6 7 8 9 10
303 42 172 270 368 1281 852 1342 1294 353 356
head(data) # Show first 6 rows
# A tibble: 6 x 5
idpers trust victim education Name
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 4 0 8 Danika
2 2 5 1 1 Imani
3 3 0 0 0 Billy
4 6 5 0 9 Benjamin
5 12 7 0 4 Austin
6 19 5 0 1 Georgina
3 Explore list column workflow
# Nesting your data ####
# Prepare the nested data frame data_nested:
# one row per value of `victim`, with the remaining columns of that group
# stored as a tibble in the list column `data`.
data_nested <- data %>%
  group_by(victim) %>% # ONLY 2 groups
  nest() %>%
  ungroup()
# Explore data_nested
head(data_nested)
# A tibble: 2 x 2
victim data
<dbl> <list>
1 0 <tibble [5,966 x 4]>
2 1 <tibble [667 x 4]>
# Unnesting your data ####
# Create the unnested data frame called data_unnested
data_unnested <- data_nested %>%
  unnest(data)

# Confirm that your data was not modified
# Hint: compare the row order and column order of both data frames.
identical(data, data_unnested) # Why false?
[1] FALSE
# Explore a nested cell ####
# Extract the data of non_victims
# `[[1]]` pulls the first tibble out of the list column (the victim == 0 group).
non_victims_df <- data_nested$data[[1]]

# Calculate the minimum of the education vector
min(non_victims_df$education)
[1] 0
# Calculate the mean of the education vector
mean(non_victims_df$education)
[1] 5.085987
# Mapping your data ####
# map() function
# Takes a vector or list, applies f to every element and returns a list

# Calculate the mean education for each group
# `.x` is one nested tibble at a time; the result is a list column.
education_nested <- data_nested %>%
  mutate(mean_education = map(.x = data, ~mean(.x$education)))
# Take a look at education_nested
education_nested
# A tibble: 2 x 3
victim data mean_education
<dbl> <list> <list>
1 0 <tibble [5,966 x 4]> <dbl [1]>
2 1 <tibble [667 x 4]> <dbl [1]>
# Extract the mean_education value by using unnest
# (turns the list column of length-1 doubles into a plain double column)
education_mean <- education_nested %>%
  unnest(mean_education)
# Take a look at education_mean
education_mean
# A tibble: 2 x 3
victim data mean_education
<dbl> <list> <dbl>
1 0 <tibble [5,966 x 4]> 5.09
2 1 <tibble [667 x 4]> 4.59
# Expecting mapped output ####
# Calculate mean education and store result as a double with map_dbl()
# map_dbl() returns a double vector directly, so no unnest() step is needed.
education_mean <- data_nested %>%
  mutate(mean_education = map_dbl(.x = data, ~mean(.x$education)))
# Take a look at education_mean
education_mean
# A tibble: 2 x 3
victim data mean_education
<dbl> <list> <dbl>
1 0 <tibble [5,966 x 4]> 5.09
2 1 <tibble [667 x 4]> 4.59
# Mapping many models ####
# Build a linear model for each group
# Each nested tibble is passed to lm() as `.x`; the fitted models are stored
# in the list column `model`.
data_models <- data_nested %>%
  mutate(model = map(.x = data, ~lm(formula = trust ~ education, data = .x)))
data_models
# A tibble: 2 x 3
victim data model
<dbl> <list> <list>
1 0 <tibble [5,966 x 4]> <lm>
2 1 <tibble [667 x 4]> <lm>
# Extract the model for non_victims
non_victims_model <- data_models$model[[1]]
# Print the extracted lm object
non_victims_model
Call:
lm(formula = trust ~ education, data = .x)
Coefficients:
(Intercept) education
5.5643 0.1258
# Tidy up the coefficients of your nested models ####
# Extract the coefficient statistics of each model into nested data frames
# broom::tidy() returns one row per model term; conf.int = TRUE adds
# confidence-interval columns.
data_model_coefs <- data_models %>%
  mutate(coef = map(model, ~tidy(.x, conf.int = TRUE)))
data_model_coefs
# A tibble: 2 x 4
victim data model coef
<dbl> <list> <list> <list>
1 0 <tibble [5,966 x 4]> <lm> <tibble [2 x 7]>
2 1 <tibble [667 x 4]> <lm> <tibble [2 x 7]>
# Simplify the coef data frames for each model
# unnest() expands each nested coefficient table into regular rows/columns.
data_model_coefs <- data_model_coefs %>%
  unnest(coef)
data_model_coefs
# A tibble: 4 x 10
victim data model term estim~1 std.e~2 stati~3 p.value conf.~4 conf.~5
<dbl> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 <tibble> <lm> (Int~ 5.56 0.0574 97.0 0 5.45 5.68
2 0 <tibble> <lm> educ~ 0.126 0.00974 12.9 1.09e- 37 0.107 0.145
3 1 <tibble> <lm> (Int~ 4.85 0.167 29.0 2.04e-120 4.52 5.18
4 1 <tibble> <lm> educ~ 0.136 0.0299 4.55 6.24e- 6 0.0775 0.195
# ... with abbreviated variable names 1: estimate, 2: std.error, 3: statistic,
# 4: conf.low, 5: conf.high
# Glance at the fit of your models ####
# Extract the fit statistics of each model into data frames
# broom::glance() returns a one-row tibble of model-level statistics.
data_model_stats <- data_models %>%
  mutate(fit = map(model, ~glance(.x)))
data_model_stats
# A tibble: 2 x 4
victim data model fit
<dbl> <list> <list> <list>
1 0 <tibble [5,966 x 4]> <lm> <tibble [1 x 12]>
2 1 <tibble [667 x 4]> <lm> <tibble [1 x 12]>
# Simplify the fit data frames for each model
# unnest() spreads the one-row fit tibbles into ordinary columns.
data_model_stats <- data_model_stats %>%
  unnest(fit)
data_model_stats
# A tibble: 2 x 15
victim data model r.squared adj.r.~1 sigma stati~2 p.value df logLik
<dbl> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 <tibble> <lm> 0.0272 0.0271 2.24 167. 1.09e-37 1 -13271.
2 1 <tibble> <lm> 0.0303 0.0288 2.46 20.7 6.24e- 6 1 -1545.
# ... with 5 more variables: AIC <dbl>, BIC <dbl>, deviance <dbl>,
# df.residual <int>, nobs <int>, and abbreviated variable names
# 1: adj.r.squared, 2: statistic
# Augment the fitted values of each of the two models ####
data_models_augmented <- data_models %>%
  # Build the augmented data frame for each group model
  mutate(augmented = map(.x = model, ~augment(.x))) %>%
  # Expand the augmented data frames
  unnest(augmented)
4 HOMEWORK: List Column Workflow
- Next week we will work with the dataset underlying Dressel and Farid (2018). The homework serves as a first exposure to that dataset. Use the code under `# 1.` below to load the dataset.
- Explore the dataset with `nrow()`, `ncol()` and `str()`. How many variables are there and which are the most interesting ones?
- We will use the following variables:
  - `is_recid`: Outcome recidivism \(y\) (`0,1,0,0,1,1,...`), i.e., whether someone reoffends
  - Various predictors \(x's\)
    - `age`: Age in years
    - `priors_count`: Number of prior offenses
    - `race`: Ethnic background
- Use the code below under `# 3.` to subset the data and explore those variables using the summary function.
- Nest the dataset according to the variable `race` and use the `map()` function to add the mean of `age`, `priors_count` and `is_recid` to the dataset. What difference does it make when you use `map_dbl()` or `map()`?
- Can you think of a way of getting the same result using simple `dplyr` functions?
# 1.
library(tidyverse)

# Load the dataset
# The Google Drive file id is inserted into the download URL via sprintf().
data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                         "1UjSP7qYNLVO85isGZaeq0PRIOAX_CJho"))

# 2.

# 3.
# Keep the variables below
data <- data %>%
  select(is_recid, age, priors_count, race)

# 4.

# 5.
5 All the code
# Load packages
# pacman::p_load() attaches the listed packages (installing any that are
# missing), so pacman itself must be installed once beforehand:
# install.packages("pacman")
pacman::p_load(tidyverse,
               broom,
               tidymodels)

# Load data
# The Google Drive file id is inserted into the download URL via sprintf().
data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                         "1I0eVUFyw0yn9T5roxzr67hPEl_5ZClrL"))

# Explore the data
table(data$education)
table(data$victim)
table(data$trust)
head(data) # Show first 6 rows


# Nesting your data ####
# Prepare the nested data frame data_nested:
# one row per value of `victim`, with the remaining columns of that group
# stored as a tibble in the list column `data`.
data_nested <- data %>%
  group_by(victim) %>% # ONLY 2 groups
  nest() %>%
  ungroup()

# Explore data_nested
head(data_nested)


# Unnesting your data ####
# Create the unnested data frame called data_unnested
data_unnested <- data_nested %>%
  unnest(data)

# Confirm that your data was not modified
# Hint: compare the row order and column order of both data frames.
identical(data, data_unnested) # Why false?


# Explore a nested cell ####
# Extract the data of non_victims
# `[[1]]` pulls the first tibble out of the list column.
non_victims_df <- data_nested$data[[1]]

# Calculate the minimum of the education vector
min(non_victims_df$education)

# Calculate the mean of the education vector
mean(non_victims_df$education)


# Mapping your data ####
# map() function
# Takes a vector or list, applies f to every element and returns a list

# Calculate the mean education for each group
# `.x` is one nested tibble at a time; the result is a list column.
education_nested <- data_nested %>%
  mutate(mean_education = map(.x = data, ~mean(.x$education)))

# Take a look at education_nested
education_nested

# Extract the mean_education value by using unnest
education_mean <- education_nested %>%
  unnest(mean_education)

# Take a look at education_mean
education_mean


# Expecting mapped output ####
# Calculate mean education and store result as a double with map_dbl()
# map_dbl() returns a double vector directly, so no unnest() step is needed.
education_mean <- data_nested %>%
  mutate(mean_education = map_dbl(.x = data, ~mean(.x$education)))

# Take a look at education_mean
education_mean


# Mapping many models ####
# Build a linear model for each group
data_models <- data_nested %>%
  mutate(model = map(.x = data, ~lm(formula = trust ~ education, data = .x)))
data_models

# Extract the model for non_victims
non_victims_model <- data_models$model[[1]]
non_victims_model


# Tidy up the coefficients of your nested models ####
# Extract the coefficient statistics of each model into nested data frames
data_model_coefs <- data_models %>%
  mutate(coef = map(model, ~tidy(.x, conf.int = TRUE)))
data_model_coefs

# Simplify the coef data frames for each model
data_model_coefs <- data_model_coefs %>%
  unnest(coef)
data_model_coefs


# Glance at the fit of your models ####
# Extract the fit statistics of each model into data frames
data_model_stats <- data_models %>%
  mutate(fit = map(model, ~glance(.x)))
data_model_stats

# Simplify the fit data frames for each model
data_model_stats <- data_model_stats %>%
  unnest(fit)
data_model_stats


# Augment the fitted values of each of the two models ####
data_models_augmented <- data_models %>%
  # Build the augmented data frame for each group model
  mutate(augmented = map(.x = model, ~augment(.x))) %>%
  # Expand the augmented data frames
  unnest(augmented)
# 1.
library(tidyverse)

# Load the dataset
# The Google Drive file id is inserted into the download URL via sprintf().
data <- read_csv(sprintf("https://docs.google.com/uc?id=%s&export=download",
                         "1UjSP7qYNLVO85isGZaeq0PRIOAX_CJho"))

# 2.

# 3.
# Keep the variables below
data <- data %>%
  select(is_recid, age, priors_count, race)

# 4.

# 5.
References
Dressel, Julia, and Hany Farid. 2018. “The Accuracy, Fairness, and Limits of Predicting Recidivism.” Sci Adv 4 (1): eaao5580.