Chapter 23 Introduction to tidyr
What You’ll Learn:
- Tidy data principles
- Wide vs long format
- pivot_longer() and pivot_wider()
- Common reshaping errors
- When to reshape data
Key Errors Covered: 15+ tidyr errors
Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced
23.1 Introduction
tidyr helps reshape data between wide and long formats:
library(tidyr)
library(dplyr)
# Wide format
wide_data <- tibble(
id = 1:3,
time_1 = c(10, 20, 30),
time_2 = c(15, 25, 35),
time_3 = c(20, 30, 40)
)
# Convert to long format
long_data <- wide_data %>%
pivot_longer(
cols = starts_with("time"),
names_to = "time",
values_to = "value"
)
long_data
#> # A tibble: 9 × 3
#> id time value
#> <int> <chr> <dbl>
#> 1 1 time_1 10
#> 2 1 time_2 15
#> 3 1 time_3 20
#> 4 2 time_1 20
#> 5 2 time_2 25
#> 6 2 time_3 30
#> 7 3 time_1 30
#> 8 3 time_2 35
#> 9 3 time_3 40Let’s master reshaping!
23.2 Tidy Data Principles
💡 Key Insight: What is Tidy Data?
# Tidy data rules:
# 1. Each variable is a column
# 2. Each observation is a row
# 3. Each type of observational unit is a table
# Example: UNTIDY (wide format)
untidy <- tibble(
person = c("Alice", "Bob"),
age = c(25, 30),
income_2020 = c(50000, 60000),
income_2021 = c(52000, 62000),
income_2022 = c(54000, 64000)
)
untidy
#> # A tibble: 2 × 5
#> person age income_2020 income_2021 income_2022
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Alice 25 50000 52000 54000
#> 2 Bob 30 60000 62000 64000
# TIDY (long format)
tidy <- untidy %>%
pivot_longer(
cols = starts_with("income"),
names_to = "year",
names_prefix = "income_",
values_to = "income"
)
tidy
#> # A tibble: 6 × 4
#> person age year income
#> <chr> <dbl> <chr> <dbl>
#> 1 Alice 25 2020 50000
#> 2 Alice 25 2021 52000
#> 3 Alice 25 2022 54000
#> 4 Bob 30 2020 60000
#> 5 Bob 30 2021 62000
#> 6 Bob 30 2022 64000
# Now year is a variable, each row is person-year observationWhy tidy data? - Works naturally with dplyr and ggplot2 - Easy to filter, group, and summarize - Consistent structure for analysis
23.3 pivot_longer() Basics
💡 Key Insight: Wide to Long with pivot_longer()
# Basic pivot_longer
wide <- tibble(
id = 1:3,
q1 = c(5, 3, 4),
q2 = c(4, 5, 3),
q3 = c(3, 4, 5)
)
wide %>%
pivot_longer(
cols = q1:q3, # Columns to pivot
names_to = "quarter", # New column for names
values_to = "score" # New column for values
)
#> # A tibble: 9 × 3
#> id quarter score
#> <int> <chr> <dbl>
#> 1 1 q1 5
#> 2 1 q2 4
#> 3 1 q3 3
#> 4 2 q1 3
#> 5 2 q2 5
#> 6 2 q3 4
#> 7 3 q1 4
#> 8 3 q2 3
#> 9 3 q3 5
# Using selection helpers
wide %>%
pivot_longer(
cols = starts_with("q"),
names_to = "quarter",
values_to = "score"
)
#> # A tibble: 9 × 3
#> id quarter score
#> <int> <chr> <dbl>
#> 1 1 q1 5
#> 2 1 q2 4
#> 3 1 q3 3
#> 4 2 q1 3
#> 5 2 q2 5
#> 6 2 q3 4
#> 7 3 q1 4
#> 8 3 q2 3
#> 9 3 q3 5
# Keep some columns
wide %>%
pivot_longer(
cols = -id, # All except id
names_to = "quarter",
values_to = "score"
)
#> # A tibble: 9 × 3
#> id quarter score
#> <int> <chr> <dbl>
#> 1 1 q1 5
#> 2 1 q2 4
#> 3 1 q3 3
#> 4 2 q1 3
#> 5 2 q2 5
#> 6 2 q3 4
#> 7 3 q1 4
#> 8 3 q2 3
#> 9 3 q3 5
# Multiple value types
data <- tibble(
id = 1:2,
x_value = c(10, 20),
x_error = c(1, 2),
y_value = c(30, 40),
y_error = c(3, 4)
)
data %>%
pivot_longer(
cols = -id,
names_to = c("variable", ".value"),
names_sep = "_"
)
#> # A tibble: 4 × 4
#> id variable value error
#> <int> <chr> <dbl> <dbl>
#> 1 1 x 10 1
#> 2 1 y 30 3
#> 3 2 x 20 2
#> 4 2 y 40 423.4 Error #1: cols must select at least one column
⭐ BEGINNER 📋 ARGS
23.4.1 The Error
data <- tibble(
id = 1:3,
value = c(10, 20, 30)
)
data %>%
pivot_longer(
cols = starts_with("time"), # No columns match!
names_to = "time",
values_to = "measurement"
)
#> Error in `pivot_longer()`:
#> ! `cols` must select at least one column.🔴 ERROR
Error in pivot_longer():
! `cols` must select at least one column.
23.4.3 Solutions
✅ SOLUTION 1: Check Column Names
# Verify columns exist
names(data)
#> [1] "id" "value"
# Use correct selection
data_wide <- tibble(
id = 1:3,
time_1 = c(10, 20, 30),
time_2 = c(15, 25, 35)
)
data_wide %>%
pivot_longer(
cols = starts_with("time"),
names_to = "time",
values_to = "value"
)
#> # A tibble: 6 × 3
#> id time value
#> <int> <chr> <dbl>
#> 1 1 time_1 10
#> 2 1 time_2 15
#> 3 2 time_1 20
#> 4 2 time_2 25
#> 5 3 time_1 30
#> 6 3 time_2 35✅ SOLUTION 2: Use Flexible Selection
# Select all numeric columns
data_wide %>%
pivot_longer(
cols = where(is.numeric),
names_to = "variable",
values_to = "value"
)
#> # A tibble: 9 × 2
#> variable value
#> <chr> <dbl>
#> 1 id 1
#> 2 time_1 10
#> 3 time_2 15
#> 4 id 2
#> 5 time_1 20
#> 6 time_2 25
#> 7 id 3
#> 8 time_1 30
#> 9 time_2 35
# Or exclude specific columns
data_wide %>%
pivot_longer(
cols = -id,
names_to = "variable",
values_to = "value"
)
#> # A tibble: 6 × 3
#> id variable value
#> <int> <chr> <dbl>
#> 1 1 time_1 10
#> 2 1 time_2 15
#> 3 2 time_1 20
#> 4 2 time_2 25
#> 5 3 time_1 30
#> 6 3 time_2 3523.5 pivot_wider() Basics
💡 Key Insight: Long to Wide with pivot_wider()
# Basic pivot_wider
long <- tibble(
id = rep(1:3, each = 2),
time = rep(c("before", "after"), 3),
value = c(10, 15, 20, 25, 30, 35)
)
long %>%
pivot_wider(
names_from = time, # Column to get names from
values_from = value # Column to get values from
)
#> # A tibble: 3 × 3
#> id before after
#> <int> <dbl> <dbl>
#> 1 1 10 15
#> 2 2 20 25
#> 3 3 30 35
# Multiple value columns
long_multi <- tibble(
id = rep(1:2, each = 2),
time = rep(c("t1", "t2"), 2),
score = c(10, 15, 20, 25),
error = c(1, 2, 3, 4)
)
long_multi %>%
pivot_wider(
names_from = time,
values_from = c(score, error)
)
#> # A tibble: 2 × 5
#> id score_t1 score_t2 error_t1 error_t2
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 15 1 2
#> 2 2 20 25 3 4
# With name formatting
long %>%
pivot_wider(
names_from = time,
values_from = value,
names_prefix = "time_"
)
#> # A tibble: 3 × 3
#> id time_before time_after
#> <int> <dbl> <dbl>
#> 1 1 10 15
#> 2 2 20 25
#> 3 3 30 3523.6 Error #2: Values are not uniquely identified
⭐⭐ INTERMEDIATE 🧠 LOGIC
23.6.1 The Error
# Data with duplicates
data <- tibble(
id = c(1, 1, 2, 2),
time = c("before", "before", "before", "after"),
value = c(10, 15, 20, 25)
)
data %>%
pivot_wider(
names_from = time,
values_from = value
)
#> Warning: Values from `value` are not uniquely identified; output will contain list-cols.
#> • Use `values_fn = list` to suppress this warning.
#> • Use `values_fn = {summary_fun}` to summarise duplicates.
#> • Use the following dplyr code to identify duplicates.
#> {data} |>
#> dplyr::summarise(n = dplyr::n(), .by = c(id, time)) |>
#> dplyr::filter(n > 1L)
#> # A tibble: 2 × 3
#> id before after
#> <dbl> <list> <list>
#> 1 1 <dbl [2]> <NULL>
#> 2 2 <dbl [1]> <dbl [1]>🔴 WARNING/ERROR
Warning: Values from `value` are not uniquely identified; output will contain list-cols.
23.6.3 Common Causes
# Multiple measurements per combination
measurements <- tibble(
person = c("Alice", "Alice", "Bob", "Bob"),
test = c("test1", "test1", "test2", "test2"),
score = c(85, 87, 90, 92) # Two scores for Alice-test1
)
measurements %>%
pivot_wider(
names_from = test,
values_from = score
)
#> Warning: Values from `score` are not uniquely identified; output will contain list-cols.
#> • Use `values_fn = list` to suppress this warning.
#> • Use `values_fn = {summary_fun}` to summarise duplicates.
#> • Use the following dplyr code to identify duplicates.
#> {data} |>
#> dplyr::summarise(n = dplyr::n(), .by = c(person, test)) |>
#> dplyr::filter(n > 1L)
#> # A tibble: 2 × 3
#> person test1 test2
#> <chr> <list> <list>
#> 1 Alice <dbl [2]> <NULL>
#> 2 Bob <NULL> <dbl [2]>
# Creates list columns!23.6.4 Solutions
✅ SOLUTION 1: Aggregate First
✅ SOLUTION 2: Add Identifier
# Add row number to make unique
measurements %>%
group_by(person, test) %>%
mutate(attempt = row_number()) %>%
pivot_wider(
names_from = c(test, attempt),
values_from = score
)
#> # A tibble: 2 × 5
#> # Groups: person [2]
#> person test1_1 test1_2 test2_1 test2_2
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Alice 85 87 NA NA
#> 2 Bob NA NA 90 92✅ SOLUTION 3: Use values_fn
# Specify how to aggregate
measurements %>%
pivot_wider(
names_from = test,
values_from = score,
values_fn = list(score = mean)
)
#> # A tibble: 2 × 3
#> person test1 test2
#> <chr> <dbl> <dbl>
#> 1 Alice 86 NA
#> 2 Bob NA 91
# Or use list for multiple aggregations
measurements %>%
pivot_wider(
names_from = test,
values_from = score,
values_fn = list(score = list)
)
#> # A tibble: 2 × 3
#> person test1 test2
#> <chr> <list> <list>
#> 1 Alice <dbl [2]> <NULL>
#> 2 Bob <NULL> <dbl [2]>23.7 separate() and unite()
💡 Key Insight: Splitting and Combining Columns
# separate: split one column into multiple
data <- tibble(
id = 1:3,
name = c("Alice_Smith", "Bob_Jones", "Charlie_Brown")
)
data %>%
separate(name, into = c("first", "last"), sep = "_")
#> # A tibble: 3 × 3
#> id first last
#> <int> <chr> <chr>
#> 1 1 Alice Smith
#> 2 2 Bob Jones
#> 3 3 Charlie Brown
# With different separator
data2 <- tibble(
id = 1:3,
date = c("2024-01-15", "2024-02-20", "2024-03-25")
)
data2 %>%
separate(date, into = c("year", "month", "day"), sep = "-")
#> # A tibble: 3 × 4
#> id year month day
#> <int> <chr> <chr> <chr>
#> 1 1 2024 01 15
#> 2 2 2024 02 20
#> 3 3 2024 03 25
# Convert types
data2 %>%
separate(date, into = c("year", "month", "day"),
sep = "-", convert = TRUE)
#> # A tibble: 3 × 4
#> id year month day
#> <int> <int> <int> <int>
#> 1 1 2024 1 15
#> 2 2 2024 2 20
#> 3 3 2024 3 25
# unite: combine multiple columns
data_split <- tibble(
id = 1:3,
first = c("Alice", "Bob", "Charlie"),
last = c("Smith", "Jones", "Brown")
)
data_split %>%
unite(name, first, last, sep = " ")
#> # A tibble: 3 × 2
#> id name
#> <int> <chr>
#> 1 1 Alice Smith
#> 2 2 Bob Jones
#> 3 3 Charlie Brown
# Keep original columns
data_split %>%
unite(name, first, last, sep = " ", remove = FALSE)
#> # A tibble: 3 × 4
#> id name first last
#> <int> <chr> <chr> <chr>
#> 1 1 Alice Smith Alice Smith
#> 2 2 Bob Jones Bob Jones
#> 3 3 Charlie Brown Charlie Brown23.8 fill() for Missing Values
💡 Key Insight: Fill Missing Values
# Data with implicit missing values
data <- tibble(
group = c("A", NA, NA, "B", NA, "C"),
value = 1:6
)
# Fill down
data %>%
fill(group)
#> # A tibble: 6 × 2
#> group value
#> <chr> <int>
#> 1 A 1
#> 2 A 2
#> 3 A 3
#> 4 B 4
#> 5 B 5
#> 6 C 6
# Fill up
data %>%
fill(group, .direction = "up")
#> # A tibble: 6 × 2
#> group value
#> <chr> <int>
#> 1 A 1
#> 2 B 2
#> 3 B 3
#> 4 B 4
#> 5 C 5
#> 6 C 6
# Fill both directions
data %>%
fill(group, .direction = "downup")
#> # A tibble: 6 × 2
#> group value
#> <chr> <int>
#> 1 A 1
#> 2 A 2
#> 3 A 3
#> 4 B 4
#> 5 B 5
#> 6 C 623.9 complete() for Implicit Missing Values
💡 Key Insight: Make Implicit NAs Explicit
# Incomplete data
sales <- tibble(
product = c("A", "A", "B"),
quarter = c(1, 2, 1),
sales = c(100, 150, 200)
)
# Missing: product B quarter 2
# Make it explicit
sales %>%
complete(product, quarter)
#> # A tibble: 4 × 3
#> product quarter sales
#> <chr> <dbl> <dbl>
#> 1 A 1 100
#> 2 A 2 150
#> 3 B 1 200
#> 4 B 2 NA
# Fill with specific value
sales %>%
complete(product, quarter, fill = list(sales = 0))
#> # A tibble: 4 × 3
#> product quarter sales
#> <chr> <dbl> <dbl>
#> 1 A 1 100
#> 2 A 2 150
#> 3 B 1 200
#> 4 B 2 0
# With sequences
dates <- tibble(
date = as.Date(c("2024-01-01", "2024-01-03", "2024-01-05")),
value = c(10, 20, 30)
)
dates %>%
complete(date = seq(min(date), max(date), by = "day"))
#> # A tibble: 5 × 2
#> date value
#> <date> <dbl>
#> 1 2024-01-01 10
#> 2 2024-01-02 NA
#> 3 2024-01-03 20
#> 4 2024-01-04 NA
#> 5 2024-01-05 3023.10 Common Patterns
🎯 Best Practice: Reshaping Workflows
# 1. Survey data wide to long
survey_wide <- tibble(
id = 1:3,
q1_satisfaction = c(5, 4, 3),
q1_importance = c(5, 5, 4),
q2_satisfaction = c(4, 3, 5),
q2_importance = c(4, 4, 5)
)
survey_long <- survey_wide %>%
pivot_longer(
cols = -id,
names_to = c("question", "measure"),
names_sep = "_",
values_to = "rating"
)
survey_long
#> # A tibble: 12 × 4
#> id question measure rating
#> <int> <chr> <chr> <dbl>
#> 1 1 q1 satisfaction 5
#> 2 1 q1 importance 5
#> 3 1 q2 satisfaction 4
#> 4 1 q2 importance 4
#> 5 2 q1 satisfaction 4
#> 6 2 q1 importance 5
#> 7 2 q2 satisfaction 3
#> 8 2 q2 importance 4
#> 9 3 q1 satisfaction 3
#> 10 3 q1 importance 4
#> 11 3 q2 satisfaction 5
#> 12 3 q2 importance 5
# 2. Time series wide to long
stocks <- tibble(
date = as.Date("2024-01-01") + 0:2,
AAPL = c(150, 152, 149),
GOOGL = c(140, 142, 138),
MSFT = c(380, 385, 378)
)
stocks %>%
pivot_longer(
cols = -date,
names_to = "stock",
values_to = "price"
)
#> # A tibble: 9 × 3
#> date stock price
#> <date> <chr> <dbl>
#> 1 2024-01-01 AAPL 150
#> 2 2024-01-01 GOOGL 140
#> 3 2024-01-01 MSFT 380
#> 4 2024-01-02 AAPL 152
#> 5 2024-01-02 GOOGL 142
#> 6 2024-01-02 MSFT 385
#> 7 2024-01-03 AAPL 149
#> 8 2024-01-03 GOOGL 138
#> 9 2024-01-03 MSFT 378
# 3. Pivot wider for comparison
comparisons <- tibble(
id = rep(1:3, each = 2),
condition = rep(c("control", "treatment"), 3),
outcome = c(10, 12, 15, 18, 20, 22)
)
comparisons %>%
pivot_wider(
names_from = condition,
values_from = outcome
) %>%
mutate(difference = treatment - control)
#> # A tibble: 3 × 4
#> id control treatment difference
#> <int> <dbl> <dbl> <dbl>
#> 1 1 10 12 2
#> 2 2 15 18 3
#> 3 3 20 22 2
# 4. Complex reshape
data <- tibble(
id = 1:2,
treatment_1_pre = c(10, 15),
treatment_1_post = c(12, 17),
treatment_2_pre = c(20, 25),
treatment_2_post = c(22, 27)
)
data %>%
pivot_longer(
cols = -id,
names_to = c("treatment", "time"),
names_pattern = "treatment_(.*)_(.*)",
values_to = "score"
)
#> # A tibble: 8 × 4
#> id treatment time score
#> <int> <chr> <chr> <dbl>
#> 1 1 1 pre 10
#> 2 1 1 post 12
#> 3 1 2 pre 20
#> 4 1 2 post 22
#> 5 2 1 pre 15
#> 6 2 1 post 17
#> 7 2 2 pre 25
#> 8 2 2 post 2723.11 Summary
Key Takeaways:
- Tidy data principles - Variables in columns, observations in rows
- pivot_longer() - Wide to long format
- pivot_wider() - Long to wide format
- Check for duplicates - Before pivot_wider()
- separate() and unite() - Split/combine columns
- fill() - Handle missing values
- complete() - Make implicit NAs explicit
Quick Reference:
| Function | Purpose | Common Use |
|---|---|---|
| pivot_longer() | Wide → Long | Convert multiple columns to rows |
| pivot_wider() | Long → Wide | Spread values across columns |
| separate() | Split column | Parse combined data |
| unite() | Combine columns | Create composite keys |
| fill() | Fill NAs | Propagate values |
| complete() | Add missing combos | Explicit NAs |
Reshaping:
# Wide to long
data %>%
pivot_longer(
cols = col1:col3,
names_to = "variable",
values_to = "value"
)
# Long to wide
data %>%
pivot_wider(
names_from = variable,
values_from = value
)
# Split column
data %>%
separate(col, into = c("part1", "part2"), sep = "_")
# Combine columns
data %>%
unite(new_col, col1, col2, sep = "_")
# Fill missing
data %>%
fill(col, .direction = "down")
# Complete combinations
data %>%
complete(var1, var2, fill = list(value = 0))Best Practices:
# ✅ Good
Check data structure first # Know what you have
Aggregate before pivot_wider # Avoid list columns
Use descriptive names_to/values_to # Clear column names
Test on small sample first # Verify logic
# ❌ Avoid
Pivot_wider with duplicates # Creates list columns
Complex reshapes in one step # Hard to debug
Forgetting to specify cols # Error or unexpected
Not checking result structure # May not be what you want23.12 Exercises
📝 Exercise 1: Wide to Long
Given:
grades <- tibble(
student = c("Alice", "Bob", "Charlie"),
math_q1 = c(85, 78, 92),
math_q2 = c(88, 82, 95),
english_q1 = c(90, 85, 88),
english_q2 = c(92, 87, 90)
)Convert to long format with columns: student, subject, quarter, grade
📝 Exercise 2: Long to Wide
Given:
measurements <- tibble(
id = rep(1:3, each = 3),
time = rep(c("t1", "t2", "t3"), 3),
value = c(10, 12, 14, 20, 22, 24, 30, 32, 34)
)Convert to wide format with time as columns
📝 Exercise 3: Safe Pivot
Write safe_pivot_wider(data, names_from, values_from) that:
1. Checks for duplicate combinations
2. Aggregates if needed
3. Performs pivot
4. Validates result
23.13 Exercise Answers
Click to see answers
Exercise 1:
grades <- tibble(
student = c("Alice", "Bob", "Charlie"),
math_q1 = c(85, 78, 92),
math_q2 = c(88, 82, 95),
english_q1 = c(90, 85, 88),
english_q2 = c(92, 87, 90)
)
# Method 1: Two-step pivot
result <- grades %>%
pivot_longer(
cols = -student,
names_to = "subject_quarter",
values_to = "grade"
) %>%
separate(subject_quarter, into = c("subject", "quarter"), sep = "_")
result
#> # A tibble: 12 × 4
#> student subject quarter grade
#> <chr> <chr> <chr> <dbl>
#> 1 Alice math q1 85
#> 2 Alice math q2 88
#> 3 Alice english q1 90
#> 4 Alice english q2 92
#> 5 Bob math q1 78
#> 6 Bob math q2 82
#> 7 Bob english q1 85
#> 8 Bob english q2 87
#> 9 Charlie math q1 92
#> 10 Charlie math q2 95
#> 11 Charlie english q1 88
#> 12 Charlie english q2 90
# Method 2: Use names_pattern
result2 <- grades %>%
pivot_longer(
cols = -student,
names_to = c("subject", "quarter"),
names_pattern = "(.*)_(.*)",
values_to = "grade"
)
result2
#> # A tibble: 12 × 4
#> student subject quarter grade
#> <chr> <chr> <chr> <dbl>
#> 1 Alice math q1 85
#> 2 Alice math q2 88
#> 3 Alice english q1 90
#> 4 Alice english q2 92
#> 5 Bob math q1 78
#> 6 Bob math q2 82
#> 7 Bob english q1 85
#> 8 Bob english q2 87
#> 9 Charlie math q1 92
#> 10 Charlie math q2 95
#> 11 Charlie english q1 88
#> 12 Charlie english q2 90Exercise 2:
measurements <- tibble(
id = rep(1:3, each = 3),
time = rep(c("t1", "t2", "t3"), 3),
value = c(10, 12, 14, 20, 22, 24, 30, 32, 34)
)
result <- measurements %>%
pivot_wider(
names_from = time,
values_from = value
)
result
#> # A tibble: 3 × 4
#> id t1 t2 t3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 10 12 14
#> 2 2 20 22 24
#> 3 3 30 32 34
# Verify dimensions
cat("Original: ", nrow(measurements), "rows\n")
#> Original: 9 rows
cat("Wide: ", nrow(result), "rows,", ncol(result) - 1, "time columns\n")
#> Wide: 3 rows, 3 time columnsExercise 3:
safe_pivot_wider <- function(data, names_from, values_from,
aggregate_fn = mean, warn = TRUE) {
names_col <- rlang::ensym(names_from)
values_col <- rlang::ensym(values_from)
# Get ID columns (everything except names_from and values_from)
id_cols <- setdiff(names(data), c(as.character(names_col),
as.character(values_col)))
# Check for duplicates
dup_check <- data %>%
group_by(across(all_of(c(id_cols, as.character(names_col))))) %>%
filter(n() > 1) %>%
ungroup()
if (nrow(dup_check) > 0) {
if (warn) {
warning("Found ", nrow(dup_check), " duplicate rows. ",
"Aggregating with ", deparse(substitute(aggregate_fn)))
print(head(dup_check))
}
# Aggregate
data <- data %>%
group_by(across(all_of(c(id_cols, as.character(names_col))))) %>%
summarize(
!!values_col := aggregate_fn(!!values_col),
.groups = "drop"
)
}
# Perform pivot
result <- data %>%
pivot_wider(
names_from = !!names_col,
values_from = !!values_col
)
# Validate
expected_rows <- length(unique(interaction(data[id_cols])))
actual_rows <- nrow(result)
if (expected_rows != actual_rows) {
warning("Row count mismatch! Expected: ", expected_rows,
", Got: ", actual_rows)
}
result
}
# Test with clean data
measurements <- tibble(
id = rep(1:3, each = 2),
time = rep(c("t1", "t2"), 3),
value = c(10, 15, 20, 25, 30, 35)
)
safe_pivot_wider(measurements, time, value)
#> # A tibble: 3 × 3
#> id t1 t2
#> <int> <dbl> <dbl>
#> 1 1 10 15
#> 2 2 20 25
#> 3 3 30 35
# Test with duplicates
measurements_dup <- tibble(
id = c(1, 1, 2, 2, 2),
time = c("t1", "t1", "t1", "t2", "t2"),
value = c(10, 12, 20, 25, 27)
)
safe_pivot_wider(measurements_dup, time, value)
#> Warning in safe_pivot_wider(measurements_dup, time, value): Found 4 duplicate
#> rows. Aggregating with mean
#> # A tibble: 4 × 3
#> id time value
#> <dbl> <chr> <dbl>
#> 1 1 t1 10
#> 2 1 t1 12
#> 3 2 t2 25
#> 4 2 t2 27
#> # A tibble: 2 × 3
#> id t1 t2
#> <dbl> <dbl> <dbl>
#> 1 1 11 NA
#> 2 2 20 26Exercise 4:
survey <- tibble(
id = 1:3,
age = c(25, 30, 35),
q1_pre = c(3, 4, 2),
q1_post = c(4, 5, 3),
q2_pre = c(2, 3, 4),
q2_post = c(3, 4, 5)
)
# Method 1: Step by step
result <- survey %>%
pivot_longer(
cols = starts_with("q"),
names_to = "question_time",
values_to = "response"
) %>%
separate(question_time, into = c("question", "time"), sep = "_")
result
#> # A tibble: 12 × 5
#> id age question time response
#> <int> <dbl> <chr> <chr> <dbl>
#> 1 1 25 q1 pre 3
#> 2 1 25 q1 post 4
#> 3 1 25 q2 pre 2
#> 4 1 25 q2 post 3
#> 5 2 30 q1 pre 4
#> 6 2 30 q1 post 5
#> 7 2 30 q2 pre 3
#> 8 2 30 q2 post 4
#> 9 3 35 q1 pre 2
#> 10 3 35 q1 post 3
#> 11 3 35 q2 pre 4
#> 12 3 35 q2 post 5
# Method 2: Using names_pattern
result2 <- survey %>%
pivot_longer(
cols = starts_with("q"),
names_to = c("question", "time"),
names_pattern = "(q\\d+)_(.*)",
values_to = "response"
)
result2
#> # A tibble: 12 × 5
#> id age question time response
#> <int> <dbl> <chr> <chr> <dbl>
#> 1 1 25 q1 pre 3
#> 2 1 25 q1 post 4
#> 3 1 25 q2 pre 2
#> 4 1 25 q2 post 3
#> 5 2 30 q1 pre 4
#> 6 2 30 q1 post 5
#> 7 2 30 q2 pre 3
#> 8 2 30 q2 post 4
#> 9 3 35 q1 pre 2
#> 10 3 35 q1 post 3
#> 11 3 35 q2 pre 4
#> 12 3 35 q2 post 5
# Verify
cat("Original questions:", (ncol(survey) - 2), "\n")
#> Original questions: 4
cat("Long format rows:", nrow(result), "\n")
#> Long format rows: 12
cat("Should be:", nrow(survey) * (ncol(survey) - 2), "\n")
#> Should be: 12