Chapter 23 Introduction to tidyr

What You’ll Learn:

  • Tidy data principles
  • Wide vs long format
  • pivot_longer() and pivot_wider()
  • Common reshaping errors
  • When to reshape data

Key Errors Covered: 15+ tidyr errors

Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced

23.1 Introduction

tidyr helps reshape data between wide and long formats:

library(tidyr)
library(dplyr)

# Wide format
wide_data <- tibble(
  id = 1:3,
  time_1 = c(10, 20, 30),
  time_2 = c(15, 25, 35),
  time_3 = c(20, 30, 40)
)

# Convert to long format
long_data <- wide_data %>%
  pivot_longer(
    cols = starts_with("time"),
    names_to = "time",
    values_to = "value"
  )

long_data
#> # A tibble: 9 × 3
#>      id time   value
#>   <int> <chr>  <dbl>
#> 1     1 time_1    10
#> 2     1 time_2    15
#> 3     1 time_3    20
#> 4     2 time_1    20
#> 5     2 time_2    25
#> 6     2 time_3    30
#> 7     3 time_1    30
#> 8     3 time_2    35
#> 9     3 time_3    40

Let’s master reshaping!

23.2 Tidy Data Principles

💡 Key Insight: What is Tidy Data?

# Tidy data rules:
# 1. Each variable is a column
# 2. Each observation is a row
# 3. Each type of observational unit is a table

# Example: UNTIDY (wide format)
untidy <- tibble(
  person = c("Alice", "Bob"),
  age = c(25, 30),
  income_2020 = c(50000, 60000),
  income_2021 = c(52000, 62000),
  income_2022 = c(54000, 64000)
)

untidy
#> # A tibble: 2 × 5
#>   person   age income_2020 income_2021 income_2022
#>   <chr>  <dbl>       <dbl>       <dbl>       <dbl>
#> 1 Alice     25       50000       52000       54000
#> 2 Bob       30       60000       62000       64000

# TIDY (long format)
tidy <- untidy %>%
  pivot_longer(
    cols = starts_with("income"),
    names_to = "year",
    names_prefix = "income_",
    values_to = "income"
  )

tidy
#> # A tibble: 6 × 4
#>   person   age year  income
#>   <chr>  <dbl> <chr>  <dbl>
#> 1 Alice     25 2020   50000
#> 2 Alice     25 2021   52000
#> 3 Alice     25 2022   54000
#> 4 Bob       30 2020   60000
#> 5 Bob       30 2021   62000
#> 6 Bob       30 2022   64000

# Now year is a variable, each row is person-year observation

Why tidy data? - Works naturally with dplyr and ggplot2 - Easy to filter, group, and summarize - Consistent structure for analysis

23.3 pivot_longer() Basics

💡 Key Insight: Wide to Long with pivot_longer()

# Basic pivot_longer
wide <- tibble(
  id = 1:3,
  q1 = c(5, 3, 4),
  q2 = c(4, 5, 3),
  q3 = c(3, 4, 5)
)

wide %>%
  pivot_longer(
    cols = q1:q3,           # Columns to pivot
    names_to = "quarter",   # New column for names
    values_to = "score"     # New column for values
  )
#> # A tibble: 9 × 3
#>      id quarter score
#>   <int> <chr>   <dbl>
#> 1     1 q1          5
#> 2     1 q2          4
#> 3     1 q3          3
#> 4     2 q1          3
#> 5     2 q2          5
#> 6     2 q3          4
#> 7     3 q1          4
#> 8     3 q2          3
#> 9     3 q3          5

# Using selection helpers
wide %>%
  pivot_longer(
    cols = starts_with("q"),
    names_to = "quarter",
    values_to = "score"
  )
#> # A tibble: 9 × 3
#>      id quarter score
#>   <int> <chr>   <dbl>
#> 1     1 q1          5
#> 2     1 q2          4
#> 3     1 q3          3
#> 4     2 q1          3
#> 5     2 q2          5
#> 6     2 q3          4
#> 7     3 q1          4
#> 8     3 q2          3
#> 9     3 q3          5

# Keep some columns
wide %>%
  pivot_longer(
    cols = -id,             # All except id
    names_to = "quarter",
    values_to = "score"
  )
#> # A tibble: 9 × 3
#>      id quarter score
#>   <int> <chr>   <dbl>
#> 1     1 q1          5
#> 2     1 q2          4
#> 3     1 q3          3
#> 4     2 q1          3
#> 5     2 q2          5
#> 6     2 q3          4
#> 7     3 q1          4
#> 8     3 q2          3
#> 9     3 q3          5

# Multiple value types
data <- tibble(
  id = 1:2,
  x_value = c(10, 20),
  x_error = c(1, 2),
  y_value = c(30, 40),
  y_error = c(3, 4)
)

data %>%
  pivot_longer(
    cols = -id,
    names_to = c("variable", ".value"),
    names_sep = "_"
  )
#> # A tibble: 4 × 4
#>      id variable value error
#>   <int> <chr>    <dbl> <dbl>
#> 1     1 x           10     1
#> 2     1 y           30     3
#> 3     2 x           20     2
#> 4     2 y           40     4

23.4 Error #1: cols must select at least one column

⭐ BEGINNER 📋 ARGS

23.4.1 The Error

data <- tibble(
  id = 1:3,
  value = c(10, 20, 30)
)

data %>%
  pivot_longer(
    cols = starts_with("time"),  # No columns match!
    names_to = "time",
    values_to = "measurement"
  )
#> Error in `pivot_longer()`:
#> ! `cols` must select at least one column.

🔴 ERROR

Error in pivot_longer():
! `cols` must select at least one column.

23.4.2 What It Means

The column selection didn’t match any columns.

23.4.3 Solutions

SOLUTION 1: Check Column Names

# Verify columns exist
names(data)
#> [1] "id"    "value"

# Use correct selection
data_wide <- tibble(
  id = 1:3,
  time_1 = c(10, 20, 30),
  time_2 = c(15, 25, 35)
)

data_wide %>%
  pivot_longer(
    cols = starts_with("time"),
    names_to = "time",
    values_to = "value"
  )
#> # A tibble: 6 × 3
#>      id time   value
#>   <int> <chr>  <dbl>
#> 1     1 time_1    10
#> 2     1 time_2    15
#> 3     2 time_1    20
#> 4     2 time_2    25
#> 5     3 time_1    30
#> 6     3 time_2    35

SOLUTION 2: Use Flexible Selection

# Select all numeric columns
data_wide %>%
  pivot_longer(
    cols = where(is.numeric),
    names_to = "variable",
    values_to = "value"
  )
#> # A tibble: 9 × 2
#>   variable value
#>   <chr>    <dbl>
#> 1 id           1
#> 2 time_1      10
#> 3 time_2      15
#> 4 id           2
#> 5 time_1      20
#> 6 time_2      25
#> 7 id           3
#> 8 time_1      30
#> 9 time_2      35

# Or exclude specific columns
data_wide %>%
  pivot_longer(
    cols = -id,
    names_to = "variable",
    values_to = "value"
  )
#> # A tibble: 6 × 3
#>      id variable value
#>   <int> <chr>    <dbl>
#> 1     1 time_1      10
#> 2     1 time_2      15
#> 3     2 time_1      20
#> 4     2 time_2      25
#> 5     3 time_1      30
#> 6     3 time_2      35

23.5 pivot_wider() Basics

💡 Key Insight: Long to Wide with pivot_wider()

# Basic pivot_wider
long <- tibble(
  id = rep(1:3, each = 2),
  time = rep(c("before", "after"), 3),
  value = c(10, 15, 20, 25, 30, 35)
)

long %>%
  pivot_wider(
    names_from = time,      # Column to get names from
    values_from = value     # Column to get values from
  )
#> # A tibble: 3 × 3
#>      id before after
#>   <int>  <dbl> <dbl>
#> 1     1     10    15
#> 2     2     20    25
#> 3     3     30    35

# Multiple value columns
long_multi <- tibble(
  id = rep(1:2, each = 2),
  time = rep(c("t1", "t2"), 2),
  score = c(10, 15, 20, 25),
  error = c(1, 2, 3, 4)
)

long_multi %>%
  pivot_wider(
    names_from = time,
    values_from = c(score, error)
  )
#> # A tibble: 2 × 5
#>      id score_t1 score_t2 error_t1 error_t2
#>   <int>    <dbl>    <dbl>    <dbl>    <dbl>
#> 1     1       10       15        1        2
#> 2     2       20       25        3        4

# With name formatting
long %>%
  pivot_wider(
    names_from = time,
    values_from = value,
    names_prefix = "time_"
  )
#> # A tibble: 3 × 3
#>      id time_before time_after
#>   <int>       <dbl>      <dbl>
#> 1     1          10         15
#> 2     2          20         25
#> 3     3          30         35

23.6 Error #2: Values are not uniquely identified

⭐⭐ INTERMEDIATE 🧠 LOGIC

23.6.1 The Error

# Data with duplicates
data <- tibble(
  id = c(1, 1, 2, 2),
  time = c("before", "before", "before", "after"),
  value = c(10, 15, 20, 25)
)

data %>%
  pivot_wider(
    names_from = time,
    values_from = value
  )
#> Warning: Values from `value` are not uniquely identified; output will contain list-cols.
#> • Use `values_fn = list` to suppress this warning.
#> • Use `values_fn = {summary_fun}` to summarise duplicates.
#> • Use the following dplyr code to identify duplicates.
#>   {data} |>
#>   dplyr::summarise(n = dplyr::n(), .by = c(id, time)) |>
#>   dplyr::filter(n > 1L)
#> # A tibble: 2 × 3
#>      id before    after    
#>   <dbl> <list>    <list>   
#> 1     1 <dbl [2]> <NULL>   
#> 2     2 <dbl [1]> <dbl [1]>

🔴 WARNING/ERROR

Warning: Values from `value` are not uniquely identified; output will contain list-cols.

23.6.2 What It Means

Multiple values exist for same id-time combination. Can’t decide which to use.

23.6.3 Common Causes

# Multiple measurements per combination
measurements <- tibble(
  person = c("Alice", "Alice", "Bob", "Bob"),
  test = c("test1", "test1", "test2", "test2"),
  score = c(85, 87, 90, 92)  # Two scores for Alice-test1
)

measurements %>%
  pivot_wider(
    names_from = test,
    values_from = score
  )
#> Warning: Values from `score` are not uniquely identified; output will contain list-cols.
#> • Use `values_fn = list` to suppress this warning.
#> • Use `values_fn = {summary_fun}` to summarise duplicates.
#> • Use the following dplyr code to identify duplicates.
#>   {data} |>
#>   dplyr::summarise(n = dplyr::n(), .by = c(person, test)) |>
#>   dplyr::filter(n > 1L)
#> # A tibble: 2 × 3
#>   person test1     test2    
#>   <chr>  <list>    <list>   
#> 1 Alice  <dbl [2]> <NULL>   
#> 2 Bob    <NULL>    <dbl [2]>
# Creates list columns!

23.6.4 Solutions

SOLUTION 1: Aggregate First

measurements %>%
  group_by(person, test) %>%
  summarize(score = mean(score), .groups = "drop") %>%
  pivot_wider(
    names_from = test,
    values_from = score
  )
#> # A tibble: 2 × 3
#>   person test1 test2
#>   <chr>  <dbl> <dbl>
#> 1 Alice     86    NA
#> 2 Bob       NA    91

SOLUTION 2: Add Identifier

# Add row number to make unique
measurements %>%
  group_by(person, test) %>%
  mutate(attempt = row_number()) %>%
  pivot_wider(
    names_from = c(test, attempt),
    values_from = score
  )
#> # A tibble: 2 × 5
#> # Groups:   person [2]
#>   person test1_1 test1_2 test2_1 test2_2
#>   <chr>    <dbl>   <dbl>   <dbl>   <dbl>
#> 1 Alice       85      87      NA      NA
#> 2 Bob         NA      NA      90      92

SOLUTION 3: Use values_fn

# Specify how to aggregate
measurements %>%
  pivot_wider(
    names_from = test,
    values_from = score,
    values_fn = list(score = mean)
  )
#> # A tibble: 2 × 3
#>   person test1 test2
#>   <chr>  <dbl> <dbl>
#> 1 Alice     86    NA
#> 2 Bob       NA    91

# Or use list for multiple aggregations
measurements %>%
  pivot_wider(
    names_from = test,
    values_from = score,
    values_fn = list(score = list)
  )
#> # A tibble: 2 × 3
#>   person test1     test2    
#>   <chr>  <list>    <list>   
#> 1 Alice  <dbl [2]> <NULL>   
#> 2 Bob    <NULL>    <dbl [2]>

23.7 separate() and unite()

💡 Key Insight: Splitting and Combining Columns

# separate: split one column into multiple
data <- tibble(
  id = 1:3,
  name = c("Alice_Smith", "Bob_Jones", "Charlie_Brown")
)

data %>%
  separate(name, into = c("first", "last"), sep = "_")
#> # A tibble: 3 × 3
#>      id first   last 
#>   <int> <chr>   <chr>
#> 1     1 Alice   Smith
#> 2     2 Bob     Jones
#> 3     3 Charlie Brown

# With different separator
data2 <- tibble(
  id = 1:3,
  date = c("2024-01-15", "2024-02-20", "2024-03-25")
)

data2 %>%
  separate(date, into = c("year", "month", "day"), sep = "-")
#> # A tibble: 3 × 4
#>      id year  month day  
#>   <int> <chr> <chr> <chr>
#> 1     1 2024  01    15   
#> 2     2 2024  02    20   
#> 3     3 2024  03    25

# Convert types
data2 %>%
  separate(date, into = c("year", "month", "day"), 
           sep = "-", convert = TRUE)
#> # A tibble: 3 × 4
#>      id  year month   day
#>   <int> <int> <int> <int>
#> 1     1  2024     1    15
#> 2     2  2024     2    20
#> 3     3  2024     3    25

# unite: combine multiple columns
data_split <- tibble(
  id = 1:3,
  first = c("Alice", "Bob", "Charlie"),
  last = c("Smith", "Jones", "Brown")
)

data_split %>%
  unite(name, first, last, sep = " ")
#> # A tibble: 3 × 2
#>      id name         
#>   <int> <chr>        
#> 1     1 Alice Smith  
#> 2     2 Bob Jones    
#> 3     3 Charlie Brown

# Keep original columns
data_split %>%
  unite(name, first, last, sep = " ", remove = FALSE)
#> # A tibble: 3 × 4
#>      id name          first   last 
#>   <int> <chr>         <chr>   <chr>
#> 1     1 Alice Smith   Alice   Smith
#> 2     2 Bob Jones     Bob     Jones
#> 3     3 Charlie Brown Charlie Brown

23.8 fill() for Missing Values

💡 Key Insight: Fill Missing Values

# Data with implicit missing values
data <- tibble(
  group = c("A", NA, NA, "B", NA, "C"),
  value = 1:6
)

# Fill down
data %>%
  fill(group)
#> # A tibble: 6 × 2
#>   group value
#>   <chr> <int>
#> 1 A         1
#> 2 A         2
#> 3 A         3
#> 4 B         4
#> 5 B         5
#> 6 C         6

# Fill up
data %>%
  fill(group, .direction = "up")
#> # A tibble: 6 × 2
#>   group value
#>   <chr> <int>
#> 1 A         1
#> 2 B         2
#> 3 B         3
#> 4 B         4
#> 5 C         5
#> 6 C         6

# Fill both directions
data %>%
  fill(group, .direction = "downup")
#> # A tibble: 6 × 2
#>   group value
#>   <chr> <int>
#> 1 A         1
#> 2 A         2
#> 3 A         3
#> 4 B         4
#> 5 B         5
#> 6 C         6

23.9 complete() for Implicit Missing Values

💡 Key Insight: Make Implicit NAs Explicit

# Incomplete data
sales <- tibble(
  product = c("A", "A", "B"),
  quarter = c(1, 2, 1),
  sales = c(100, 150, 200)
)

# Missing: product B quarter 2
# Make it explicit
sales %>%
  complete(product, quarter)
#> # A tibble: 4 × 3
#>   product quarter sales
#>   <chr>     <dbl> <dbl>
#> 1 A             1   100
#> 2 A             2   150
#> 3 B             1   200
#> 4 B             2    NA

# Fill with specific value
sales %>%
  complete(product, quarter, fill = list(sales = 0))
#> # A tibble: 4 × 3
#>   product quarter sales
#>   <chr>     <dbl> <dbl>
#> 1 A             1   100
#> 2 A             2   150
#> 3 B             1   200
#> 4 B             2     0

# With sequences
dates <- tibble(
  date = as.Date(c("2024-01-01", "2024-01-03", "2024-01-05")),
  value = c(10, 20, 30)
)

dates %>%
  complete(date = seq(min(date), max(date), by = "day"))
#> # A tibble: 5 × 2
#>   date       value
#>   <date>     <dbl>
#> 1 2024-01-01    10
#> 2 2024-01-02    NA
#> 3 2024-01-03    20
#> 4 2024-01-04    NA
#> 5 2024-01-05    30

23.10 Common Patterns

🎯 Best Practice: Reshaping Workflows

# 1. Survey data wide to long
survey_wide <- tibble(
  id = 1:3,
  q1_satisfaction = c(5, 4, 3),
  q1_importance = c(5, 5, 4),
  q2_satisfaction = c(4, 3, 5),
  q2_importance = c(4, 4, 5)
)

survey_long <- survey_wide %>%
  pivot_longer(
    cols = -id,
    names_to = c("question", "measure"),
    names_sep = "_",
    values_to = "rating"
  )

survey_long
#> # A tibble: 12 × 4
#>       id question measure      rating
#>    <int> <chr>    <chr>         <dbl>
#>  1     1 q1       satisfaction      5
#>  2     1 q1       importance        5
#>  3     1 q2       satisfaction      4
#>  4     1 q2       importance        4
#>  5     2 q1       satisfaction      4
#>  6     2 q1       importance        5
#>  7     2 q2       satisfaction      3
#>  8     2 q2       importance        4
#>  9     3 q1       satisfaction      3
#> 10     3 q1       importance        4
#> 11     3 q2       satisfaction      5
#> 12     3 q2       importance        5

# 2. Time series wide to long
stocks <- tibble(
  date = as.Date("2024-01-01") + 0:2,
  AAPL = c(150, 152, 149),
  GOOGL = c(140, 142, 138),
  MSFT = c(380, 385, 378)
)

stocks %>%
  pivot_longer(
    cols = -date,
    names_to = "stock",
    values_to = "price"
  )
#> # A tibble: 9 × 3
#>   date       stock price
#>   <date>     <chr> <dbl>
#> 1 2024-01-01 AAPL    150
#> 2 2024-01-01 GOOGL   140
#> 3 2024-01-01 MSFT    380
#> 4 2024-01-02 AAPL    152
#> 5 2024-01-02 GOOGL   142
#> 6 2024-01-02 MSFT    385
#> 7 2024-01-03 AAPL    149
#> 8 2024-01-03 GOOGL   138
#> 9 2024-01-03 MSFT    378

# 3. Pivot wider for comparison
comparisons <- tibble(
  id = rep(1:3, each = 2),
  condition = rep(c("control", "treatment"), 3),
  outcome = c(10, 12, 15, 18, 20, 22)
)

comparisons %>%
  pivot_wider(
    names_from = condition,
    values_from = outcome
  ) %>%
  mutate(difference = treatment - control)
#> # A tibble: 3 × 4
#>      id control treatment difference
#>   <int>   <dbl>     <dbl>      <dbl>
#> 1     1      10        12          2
#> 2     2      15        18          3
#> 3     3      20        22          2

# 4. Complex reshape
data <- tibble(
  id = 1:2,
  treatment_1_pre = c(10, 15),
  treatment_1_post = c(12, 17),
  treatment_2_pre = c(20, 25),
  treatment_2_post = c(22, 27)
)

data %>%
  pivot_longer(
    cols = -id,
    names_to = c("treatment", "time"),
    names_pattern = "treatment_(.*)_(.*)",
    values_to = "score"
  )
#> # A tibble: 8 × 4
#>      id treatment time  score
#>   <int> <chr>     <chr> <dbl>
#> 1     1 1         pre      10
#> 2     1 1         post     12
#> 3     1 2         pre      20
#> 4     1 2         post     22
#> 5     2 1         pre      15
#> 6     2 1         post     17
#> 7     2 2         pre      25
#> 8     2 2         post     27

23.11 Summary

Key Takeaways:

  1. Tidy data principles - Variables in columns, observations in rows
  2. pivot_longer() - Wide to long format
  3. pivot_wider() - Long to wide format
  4. Check for duplicates - Before pivot_wider()
  5. separate() and unite() - Split/combine columns
  6. fill() - Handle missing values
  7. complete() - Make implicit NAs explicit

Quick Reference:

Function Purpose Common Use
pivot_longer() Wide → Long Convert multiple columns to rows
pivot_wider() Long → Wide Spread values across columns
separate() Split column Parse combined data
unite() Combine columns Create composite keys
fill() Fill NAs Propagate values
complete() Add missing combos Explicit NAs

Reshaping:

# Wide to long
data %>%
  pivot_longer(
    cols = col1:col3,
    names_to = "variable",
    values_to = "value"
  )

# Long to wide
data %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

# Split column
data %>%
  separate(col, into = c("part1", "part2"), sep = "_")

# Combine columns
data %>%
  unite(new_col, col1, col2, sep = "_")

# Fill missing
data %>%
  fill(col, .direction = "down")

# Complete combinations
data %>%
  complete(var1, var2, fill = list(value = 0))

Best Practices:

# ✅ Good
Check data structure first                # Know what you have
Aggregate before pivot_wider              # Avoid list columns
Use descriptive names_to/values_to        # Clear column names
Test on small sample first                # Verify logic

# ❌ Avoid
Pivot_wider with duplicates               # Creates list columns
Complex reshapes in one step              # Hard to debug
Forgetting to specify cols                # Error or unexpected
Not checking result structure             # May not be what you want

23.12 Exercises

📝 Exercise 1: Wide to Long

Given:

grades <- tibble(
  student = c("Alice", "Bob", "Charlie"),
  math_q1 = c(85, 78, 92),
  math_q2 = c(88, 82, 95),
  english_q1 = c(90, 85, 88),
  english_q2 = c(92, 87, 90)
)

Convert to long format with columns: student, subject, quarter, grade

📝 Exercise 2: Long to Wide

Given:

measurements <- tibble(
  id = rep(1:3, each = 3),
  time = rep(c("t1", "t2", "t3"), 3),
  value = c(10, 12, 14, 20, 22, 24, 30, 32, 34)
)

Convert to wide format with time as columns

📝 Exercise 3: Safe Pivot

Write safe_pivot_wider(data, names_from, values_from) that: 1. Checks for duplicate combinations 2. Aggregates if needed 3. Performs pivot 4. Validates result

📝 Exercise 4: Complex Reshape

Given survey data:

survey <- tibble(
  id = 1:3,
  age = c(25, 30, 35),
  q1_pre = c(3, 4, 2),
  q1_post = c(4, 5, 3),
  q2_pre = c(2, 3, 4),
  q2_post = c(3, 4, 5)
)

Reshape to: id, age, question, time, response

23.13 Exercise Answers

Click to see answers

Exercise 1:

grades <- tibble(
  student = c("Alice", "Bob", "Charlie"),
  math_q1 = c(85, 78, 92),
  math_q2 = c(88, 82, 95),
  english_q1 = c(90, 85, 88),
  english_q2 = c(92, 87, 90)
)

# Method 1: Two-step pivot
result <- grades %>%
  pivot_longer(
    cols = -student,
    names_to = "subject_quarter",
    values_to = "grade"
  ) %>%
  separate(subject_quarter, into = c("subject", "quarter"), sep = "_")

result
#> # A tibble: 12 × 4
#>    student subject quarter grade
#>    <chr>   <chr>   <chr>   <dbl>
#>  1 Alice   math    q1         85
#>  2 Alice   math    q2         88
#>  3 Alice   english q1         90
#>  4 Alice   english q2         92
#>  5 Bob     math    q1         78
#>  6 Bob     math    q2         82
#>  7 Bob     english q1         85
#>  8 Bob     english q2         87
#>  9 Charlie math    q1         92
#> 10 Charlie math    q2         95
#> 11 Charlie english q1         88
#> 12 Charlie english q2         90

# Method 2: Use names_pattern
result2 <- grades %>%
  pivot_longer(
    cols = -student,
    names_to = c("subject", "quarter"),
    names_pattern = "(.*)_(.*)",
    values_to = "grade"
  )

result2
#> # A tibble: 12 × 4
#>    student subject quarter grade
#>    <chr>   <chr>   <chr>   <dbl>
#>  1 Alice   math    q1         85
#>  2 Alice   math    q2         88
#>  3 Alice   english q1         90
#>  4 Alice   english q2         92
#>  5 Bob     math    q1         78
#>  6 Bob     math    q2         82
#>  7 Bob     english q1         85
#>  8 Bob     english q2         87
#>  9 Charlie math    q1         92
#> 10 Charlie math    q2         95
#> 11 Charlie english q1         88
#> 12 Charlie english q2         90

Exercise 2:

measurements <- tibble(
  id = rep(1:3, each = 3),
  time = rep(c("t1", "t2", "t3"), 3),
  value = c(10, 12, 14, 20, 22, 24, 30, 32, 34)
)

result <- measurements %>%
  pivot_wider(
    names_from = time,
    values_from = value
  )

result
#> # A tibble: 3 × 4
#>      id    t1    t2    t3
#>   <int> <dbl> <dbl> <dbl>
#> 1     1    10    12    14
#> 2     2    20    22    24
#> 3     3    30    32    34

# Verify dimensions
cat("Original: ", nrow(measurements), "rows\n")
#> Original:  9 rows
cat("Wide: ", nrow(result), "rows,", ncol(result) - 1, "time columns\n")
#> Wide:  3 rows, 3 time columns

Exercise 3:

safe_pivot_wider <- function(data, names_from, values_from, 
                             aggregate_fn = mean, warn = TRUE) {
  
  names_col <- rlang::ensym(names_from)
  values_col <- rlang::ensym(values_from)
  
  # Get ID columns (everything except names_from and values_from)
  id_cols <- setdiff(names(data), c(as.character(names_col), 
                                     as.character(values_col)))
  
  # Check for duplicates
  dup_check <- data %>%
    group_by(across(all_of(c(id_cols, as.character(names_col))))) %>%
    filter(n() > 1) %>%
    ungroup()
  
  if (nrow(dup_check) > 0) {
    if (warn) {
      warning("Found ", nrow(dup_check), " duplicate rows. ",
              "Aggregating with ", deparse(substitute(aggregate_fn)))
      print(head(dup_check))
    }
    
    # Aggregate
    data <- data %>%
      group_by(across(all_of(c(id_cols, as.character(names_col))))) %>%
      summarize(
        !!values_col := aggregate_fn(!!values_col),
        .groups = "drop"
      )
  }
  
  # Perform pivot
  result <- data %>%
    pivot_wider(
      names_from = !!names_col,
      values_from = !!values_col
    )
  
  # Validate
  expected_rows <- length(unique(interaction(data[id_cols])))
  actual_rows <- nrow(result)
  
  if (expected_rows != actual_rows) {
    warning("Row count mismatch! Expected: ", expected_rows, 
            ", Got: ", actual_rows)
  }
  
  result
}

# Test with clean data
measurements <- tibble(
  id = rep(1:3, each = 2),
  time = rep(c("t1", "t2"), 3),
  value = c(10, 15, 20, 25, 30, 35)
)

safe_pivot_wider(measurements, time, value)
#> # A tibble: 3 × 3
#>      id    t1    t2
#>   <int> <dbl> <dbl>
#> 1     1    10    15
#> 2     2    20    25
#> 3     3    30    35

# Test with duplicates
measurements_dup <- tibble(
  id = c(1, 1, 2, 2, 2),
  time = c("t1", "t1", "t1", "t2", "t2"),
  value = c(10, 12, 20, 25, 27)
)

safe_pivot_wider(measurements_dup, time, value)
#> Warning in safe_pivot_wider(measurements_dup, time, value): Found 4 duplicate
#> rows. Aggregating with mean
#> # A tibble: 4 × 3
#>      id time  value
#>   <dbl> <chr> <dbl>
#> 1     1 t1       10
#> 2     1 t1       12
#> 3     2 t2       25
#> 4     2 t2       27
#> # A tibble: 2 × 3
#>      id    t1    t2
#>   <dbl> <dbl> <dbl>
#> 1     1    11    NA
#> 2     2    20    26

Exercise 4:

survey <- tibble(
  id = 1:3,
  age = c(25, 30, 35),
  q1_pre = c(3, 4, 2),
  q1_post = c(4, 5, 3),
  q2_pre = c(2, 3, 4),
  q2_post = c(3, 4, 5)
)

# Method 1: Step by step
result <- survey %>%
  pivot_longer(
    cols = starts_with("q"),
    names_to = "question_time",
    values_to = "response"
  ) %>%
  separate(question_time, into = c("question", "time"), sep = "_")

result
#> # A tibble: 12 × 5
#>       id   age question time  response
#>    <int> <dbl> <chr>    <chr>    <dbl>
#>  1     1    25 q1       pre          3
#>  2     1    25 q1       post         4
#>  3     1    25 q2       pre          2
#>  4     1    25 q2       post         3
#>  5     2    30 q1       pre          4
#>  6     2    30 q1       post         5
#>  7     2    30 q2       pre          3
#>  8     2    30 q2       post         4
#>  9     3    35 q1       pre          2
#> 10     3    35 q1       post         3
#> 11     3    35 q2       pre          4
#> 12     3    35 q2       post         5

# Method 2: Using names_pattern
result2 <- survey %>%
  pivot_longer(
    cols = starts_with("q"),
    names_to = c("question", "time"),
    names_pattern = "(q\\d+)_(.*)",
    values_to = "response"
  )

result2
#> # A tibble: 12 × 5
#>       id   age question time  response
#>    <int> <dbl> <chr>    <chr>    <dbl>
#>  1     1    25 q1       pre          3
#>  2     1    25 q1       post         4
#>  3     1    25 q2       pre          2
#>  4     1    25 q2       post         3
#>  5     2    30 q1       pre          4
#>  6     2    30 q1       post         5
#>  7     2    30 q2       pre          3
#>  8     2    30 q2       post         4
#>  9     3    35 q1       pre          2
#> 10     3    35 q1       post         3
#> 11     3    35 q2       pre          4
#> 12     3    35 q2       post         5

# Verify
cat("Original questions:", (ncol(survey) - 2), "\n")
#> Original questions: 4
cat("Long format rows:", nrow(result), "\n")
#> Long format rows: 12
cat("Should be:", nrow(survey) * (ncol(survey) - 2), "\n")
#> Should be: 12