Chapter 25 Data Manipulation Best Practices
What You’ll Learn:
- Efficient dplyr workflows
- Performance optimization
- Code organization
- Common anti-patterns
- Debugging strategies
Key Errors Covered: 10+ workflow errors
Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced
25.1 Introduction
Best practices make code faster, clearer, and more maintainable:
library(dplyr)
library(tidyr)
# Good: Clear pipeline
result <- mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp) %>%
  mutate(hp_per_cyl = hp / cyl) %>%
  arrange(desc(hp_per_cyl))

# The top rows are the 4-cylinder cars with the most horsepower per cylinder
head(result, 3)
25.2 Performance Optimization
🎯 Best Practice: Optimize for Speed
# 1. Filter early
# A small lookup table used by both versions below
cyl_summary <- mtcars %>%
  group_by(cyl) %>%
  summarize(avg_cyl_hp = mean(hp), .groups = "drop")

# Bad: mutate and join run on all 32 rows, then most of the result is thrown away
mtcars %>%
  mutate(complex = mpg * hp / wt) %>%
  left_join(cyl_summary, by = "cyl") %>%
  filter(mpg > 20)

# Good: filter first, so the heavier steps only process the rows you keep
mtcars %>%
  filter(mpg > 20) %>%
  mutate(complex = mpg * hp / wt) %>%
  left_join(cyl_summary, by = "cyl")

# Both versions return the same 14 rows; only the amount of work differs
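On 32 rows the reordering makes no measurable difference. To see the effect, both versions can be timed on a larger input with base system.time(); the big data frame below is invented purely for illustration:
# Hypothetical larger input, built only to make the timing visible
big <- mtcars[rep(seq_len(nrow(mtcars)), 5000), ]

# Filter late: the mutate and join process every row of `big`
system.time(
  big %>%
    mutate(complex = mpg * hp / wt) %>%
    left_join(cyl_summary, by = "cyl") %>%
    filter(mpg > 20)
)

# Filter early: downstream steps only see the rows that pass the filter
system.time(
  big %>%
    filter(mpg > 20) %>%
    mutate(complex = mpg * hp / wt) %>%
    left_join(cyl_summary, by = "cyl")
)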
# 2. Select early
# Bad: keep all columns until the end
mtcars %>%
  mutate(new = mpg * 2) %>%
  filter(cyl == 4) %>%
  select(mpg, cyl, new)

# Good: select needed columns early
mtcars %>%
  select(mpg, cyl, hp, wt) %>%
  filter(cyl == 4) %>%
  mutate(new = mpg * 2)
# 3. Use data.table for large data
# library(dtplyr)
# lazy_dt(large_data) %>%
#   filter(...) %>%
#   as_tibble()
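A concrete version of that sketch, assuming the dtplyr and data.table packages are installed (the dplyr verbs stay the same; only the backend changes):
library(dtplyr)

mtcars %>%
  lazy_dt() %>%                  # translate the following verbs to data.table
  filter(mpg > 20) %>%
  group_by(cyl) %>%
  summarize(mean_hp = mean(hp)) %>%
  as_tibble()                    # compute and collect back into a tibble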
25.3 Code Organization
🎯 Best Practice: Readable Pipelines
# Good: One operation per line
result <- mtcars %>%
  filter(mpg > 20) %>%
  mutate(
    hp_per_ton = hp / wt,
    efficiency = mpg / hp
  ) %>%
  group_by(cyl) %>%
  summarize(
    avg_efficiency = mean(efficiency),
    n = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_efficiency))
# Good: Use intermediate variables for clarity
filtered_cars <- mtcars %>%
  filter(mpg > 20, cyl %in% c(4, 6))

efficiency_metrics <- filtered_cars %>%
  mutate(
    hp_per_ton = hp / wt,
    efficiency = mpg / hp
  )

final_summary <- efficiency_metrics %>%
  group_by(cyl) %>%
  summarize(
    avg_efficiency = mean(efficiency),
    n = n(),
    .groups = "drop"
  )
# Bad: Everything in one long pipe
# result <- mtcars %>% filter(mpg > 20) %>% mutate(hp_per_ton = hp / wt, efficiency = mpg / hp) %>% group_by(cyl) %>% summarize(avg_efficiency = mean(efficiency), n = n(), .groups = "drop") %>% arrange(desc(avg_efficiency))
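A third option, when the same transformation recurs across analyses, is to pull it into a small helper function; add_efficiency_metrics() below is a hypothetical name, not part of the examples above:
# Hypothetical helper that wraps the repeated metric calculation
add_efficiency_metrics <- function(data) {
  data %>%
    mutate(
      hp_per_ton = hp / wt,
      efficiency = mpg / hp
    )
}

mtcars %>%
  filter(mpg > 20, cyl %in% c(4, 6)) %>%
  add_efficiency_metrics() %>%
  group_by(cyl) %>%
  summarize(avg_efficiency = mean(efficiency), n = n(), .groups = "drop")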
25.4 Common Anti-Patterns
⚠️ Avoid These Patterns
# Anti-pattern 1: Growing data frame in loop
# Bad
result <- tibble()
for (i in 1:100) {
  result <- bind_rows(result, tibble(x = i, y = i^2))
}

# Good
result <- tibble(
  x = 1:100,
  y = x^2
)
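If each element really does require a function call (for example, reading one file per iteration), build the pieces in a list and bind them once at the end rather than growing the result row by row; a small sketch:
# Build all pieces first, then bind once
pieces <- lapply(1:100, function(i) tibble(x = i, y = i^2))
result <- bind_rows(pieces)

# The same idea with purrr, if it is available
# result <- purrr::map_dfr(1:100, ~ tibble(x = .x, y = .x^2))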
# Anti-pattern 2: Unnecessary grouping
# Bad
mtcars %>%
  group_by(cyl) %>%
  select(mpg, cyl, hp) # select doesn't need groups

# Good
mtcars %>%
  select(mpg, cyl, hp) %>%
  group_by(cyl) # Group only when needed
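A related trap is leaving a grouping in place after you are done with it: later verbs silently operate within each group. A short sketch of the difference, and why an explicit ungroup() helps:
grouped <- mtcars %>% group_by(cyl)

# Still grouped: mean(mpg) is computed within each cyl group
grouped %>% mutate(mean_mpg = mean(mpg))

# Ungrouped: mean(mpg) is the overall mean, as a later step may expect
grouped %>% ungroup() %>% mutate(mean_mpg = mean(mpg))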
# Anti-pattern 3: Repeated similar code
# Bad
mtcars %>%
  summarize(
    mean_mpg = mean(mpg),
    mean_hp = mean(hp),
    mean_wt = mean(wt),
    mean_disp = mean(disp)
  )
#> mean_mpg mean_hp mean_wt mean_disp
#> 1 20.09062 146.6875 3.21725 230.7219
# Good
mtcars %>%
  summarize(across(c(mpg, hp, wt, disp), mean, .names = "mean_{.col}"))
#> mean_mpg mean_hp mean_wt mean_disp
#> 1 20.09062 146.6875 3.21725 230.7219
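across() also accepts tidyselect helpers, so the columns do not have to be listed by name; for example, to summarize every numeric column at once:
mtcars %>%
  summarize(across(where(is.numeric), mean, .names = "mean_{.col}"))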
# Anti-pattern 4: Ignoring NAs
# Bad
data_with_na <- tibble(x = c(1, 2, NA, 4))
data_with_na %>%
  summarize(mean_x = mean(x)) # Returns NA!
#> # A tibble: 1 × 1
#> mean_x
#> <dbl>
#> 1 NA
# Good
data_with_na %>%
  summarize(mean_x = mean(x, na.rm = TRUE))
#> # A tibble: 1 × 1
#> mean_x
#> <dbl>
#> 1   2.33
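Before reaching for na.rm = TRUE, it is often worth counting the missing values so you know how much data is being dropped; one way to do that:
# Count missing values per column before deciding how to handle them
data_with_na %>%
  summarize(across(everything(), ~ sum(is.na(.x)), .names = "n_na_{.col}"))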
25.5 Debugging Strategies
🎯 Best Practice: Debug Effectively
# 1. Check intermediate steps
mtcars %>%
  filter(mpg > 20) %>%
  {cat("After filter:", nrow(.), "rows\n"); .} %>%
  select(mpg, cyl, hp) %>%
  {cat("After select:", ncol(.), "cols\n"); .} %>%
  head(3)
# 2. Use View() in RStudio
# mtcars %>%
# filter(mpg > 20) %>%
# View()
# 3. Print summary statistics
debug_data <- function(data, label = "") {
  cat("\n=== Debug:", label, "===\n")
  cat("Rows:", nrow(data), "\n")
  cat("Cols:", ncol(data), "\n")
  cat("Columns:", paste(names(data), collapse = ", "), "\n")
  data
}

# Note the output order below: the outer "after filter" label prints first
# because the piped `data` argument is evaluated lazily, only when nrow(data)
# is first needed.
mtcars %>%
  debug_data("original") %>%
  filter(mpg > 20) %>%
  debug_data("after filter")
#>
#> === Debug: after filter ===
#>
#> === Debug: original ===
#> Rows: 32
#> Cols: 12
#> Columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor
#> Rows: 14
#> Cols: 12
#> Columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# 4. Check for common issues
check_pipeline <- function(data) {
  cat("=== Pipeline Check ===\n")
  cat("Rows:", nrow(data), "\n")
  cat("NA counts:\n")
  print(colSums(is.na(data)))
  cat("\nGrouping:", paste(group_vars(data), collapse = ", "), "\n")
  cat("Grouped:", is_grouped_df(data), "\n")
  data
}
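Because check_pipeline() returns its input, it can be dropped into the middle of any pipe; for example:
mtcars %>%
  group_by(cyl) %>%
  check_pipeline() %>%   # reports rows, NA counts, and the active grouping
  summarize(mean_mpg = mean(mpg), .groups = "drop")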
25.6 Documentation
🎯 Best Practice: Document Complex Operations
# Good: Comments explain WHY, not WHAT
clean_sales_data <- function(data) {
  data %>%
    # Remove test transactions (internal use only)
    filter(customer_id >= 1000) %>%
    # Standardize date format for downstream systems
    mutate(date = as.Date(date)) %>%
    # Calculate revenue (excluding tax per accounting rules)
    mutate(revenue = quantity * price) %>%
    # Remove cancelled orders (status code 99)
    filter(status != 99)
}
# Good: Function documentation
#' Calculate customer lifetime value
#'
#' @param orders Data frame with customer_id, date, amount
#' @param window_days Number of days to consider (default: 365)
#' @return Data frame with customer_id and lifetime_value
calculate_ltv <- function(orders, window_days = 365) {
  cutoff_date <- Sys.Date() - window_days
  orders %>%
    filter(date >= cutoff_date) %>%
    group_by(customer_id) %>%
    summarize(
      lifetime_value = sum(amount),
      n_orders = n(),
      .groups = "drop"
    )
}
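A quick sanity check of the documented contract, using a tiny invented orders table (the values are made up for illustration):
orders <- tibble(
  customer_id = c(1, 1, 2),
  date = Sys.Date() - c(10, 400, 5),   # the 400-day-old order falls outside the window
  amount = c(100, 250, 80)
)

# Customer 1 keeps only the recent order; customer 2 keeps its single order
calculate_ltv(orders)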
25.7 Error Handling
🎯 Best Practice: Handle Errors Gracefully
# Validate inputs
safe_summarize <- function(data, col) {
  # Check data frame
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }
  # Check column exists
  if (!col %in% names(data)) {
    stop("Column '", col, "' not found. ",
         "Available: ", paste(names(data), collapse = ", "))
  }
  # Check numeric
  if (!is.numeric(data[[col]])) {
    stop("Column '", col, "' must be numeric")
  }
  # Perform operation
  data %>%
    summarize(
      mean = mean(.data[[col]], na.rm = TRUE),
      sd = sd(.data[[col]], na.rm = TRUE),
      n = sum(!is.na(.data[[col]]))
    )
}
# Test
safe_summarize(mtcars, "mpg")
#> mean sd n
#> 1 20.09062 6.026948 32
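The validation branches can be exercised directly; wrapping the call in try() shows the informative message without stopping the script (the column name "price" is made up):
# Triggers the column-not-found check and reports the available columns
try(safe_summarize(mtcars, "price"))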
25.8 Testing Pipelines
🎯 Best Practice: Test Your Code
# Create test function
test_clean_data <- function() {
  # Setup test data
  test_data <- tibble(
    id = 1:5,
    value = c(10, NA, 30, 40, 50),
    group = c("A", "A", "B", "B", "C")
  )
  # Run pipeline
  result <- test_data %>%
    filter(!is.na(value)) %>%
    group_by(group) %>%
    summarize(mean_value = mean(value), .groups = "drop")
  # Check results
  stopifnot(nrow(result) == 3)                # 3 groups
  stopifnot(all(!is.na(result$mean_value)))   # No NAs
  stopifnot("group" %in% names(result))       # Has group column
  message("✓ All tests passed")
  TRUE
}
test_clean_data()
#> ✓ All tests passed
#> [1] TRUE
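If the testthat package is available, the same checks translate directly into test_that() expectations, which give more informative failure messages than stopifnot(); a sketch:
library(testthat)

test_that("cleaning pipeline summarizes groups without NAs", {
  test_data <- tibble(
    id = 1:5,
    value = c(10, NA, 30, 40, 50),
    group = c("A", "A", "B", "B", "C")
  )

  result <- test_data %>%
    filter(!is.na(value)) %>%
    group_by(group) %>%
    summarize(mean_value = mean(value), .groups = "drop")

  expect_equal(nrow(result), 3)                # 3 groups
  expect_false(any(is.na(result$mean_value)))  # No NAs
  expect_true("group" %in% names(result))      # Has group column
})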
25.9 Summary
Key Takeaways:
- Filter early - Reduce data size first
- Select needed columns - Don’t carry extra data
- One operation per line - Readable pipelines
- Use across() - Avoid repetition
- Handle NAs - Always specify na.rm
- Ungroup - Clean up after group operations
- Document WHY - Not what code does
- Test code - Catch errors early
Best Practices:
# ✅ Excellent pipeline
result <- data %>%
  # Filter first (reduce data)
  filter(important_condition) %>%
  # Select needed columns
  select(needed_cols) %>%
  # Transform
  mutate(new_col = calculation) %>%
  # Aggregate
  group_by(category) %>%
  summarize(
    metric = mean(value, na.rm = TRUE),
    n = n(),
    .groups = "drop" # Always ungroup
  ) %>%
  # Sort
  arrange(desc(metric))
# ✅ Use functions for repeated logic
calculate_stats <- function(data, group_var, value_var) {
  data %>%
    group_by({{ group_var }}) %>%
    summarize(
      across({{ value_var }},
             list(mean = ~ mean(., na.rm = TRUE),
                  sd = ~ sd(., na.rm = TRUE)),
             .names = "{.col}_{.fn}"),
      .groups = "drop"
    )
}
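For example, calculate_stats(mtcars, cyl, mpg) returns one row per cylinder count, with mpg_mean and mpg_sd columns:
# One row per cyl value, with mpg_mean and mpg_sd
calculate_stats(mtcars, cyl, mpg)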
# ✅ Document complex operations
# Remove outliers using IQR method (1.5 * IQR)
clean_data <- data %>%
  group_by(category) %>%
  filter(
    value >= quantile(value, 0.25) - 1.5 * IQR(value),
    value <= quantile(value, 0.75) + 1.5 * IQR(value)
  ) %>%
  ungroup()
Anti-Patterns to Avoid:
❌ Bad:
- Growing data in loops
- Unnecessary grouping
- Repeated similar code
- Ignoring NAs
- No error checking
- Complex one-liners
- No comments on complex logic
✅ Good:
- Vectorized operations
- Group only when needed
- Use across() for repetition
- Always handle NAs
- Validate inputs
- Break into readable steps
- Document WHY, not WHAT
25.11 Exercise Answer
# Refactored version
result <- data %>%
  # Filter early for better performance
  filter(category %in% c("A", "B", "C")) %>%
  # Use across() to avoid repetition
  mutate(
    across(c(a, b), ~ .x + .x, .names = "sum_{.col}")
  ) %>%
  # Group for aggregation
  group_by(category) %>%
  # Use across() in summarize; wrap mean() in a lambda so na.rm is passed
  # to mean() directly rather than through across()'s deprecated `...`
  summarize(
    across(starts_with("sum_"),
           ~ mean(.x, na.rm = TRUE),
           .names = "mean_{.col}"),
    .groups = "drop"
  ) %>%
  # Filter on summarized values
  filter(mean_sum_a > 0)