Chapter 25 Data Manipulation Best Practices

What You’ll Learn:

  • Efficient dplyr workflows
  • Performance optimization
  • Code organization
  • Common anti-patterns
  • Debugging strategies

Key Errors Covered: 10+ workflow errors

Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced

25.1 Introduction

Best practices make code faster, clearer, and more maintainable:

library(dplyr)
library(tidyr)

# Good: Clear pipeline
result <- mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp) %>%
  mutate(hp_per_cyl = hp / cyl) %>%
  arrange(desc(hp_per_cyl))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

head(result, 3)
#> # A tibble: 3 × 5
#>      id   age question time  response
#>   <int> <dbl> <chr>    <chr>    <dbl>
#> 1     1    25 q1       pre          3
#> 2     1    25 q1       post         4
#> 3     1    25 q2       pre          2

25.2 Performance Optimization

🎯 Best Practice: Optimize for Speed

# 1. Filter early
# Bad: filter after heavy operations
mtcars %>%
  mutate(complex = mpg * hp / wt) %>%
  left_join(mtcars, by = c("cyl")) %>%
  filter(mpg > 20)
#> Warning in left_join(., mtcars, by = c("cyl")): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.
#> Error in `filter()`:
#> ℹ In argument: `mpg > 20`.
#> Caused by error:
#> ! `..1` must be of size 366 or 1, not size 234.

# Good: filter first
mtcars %>%
  filter(mpg > 20) %>%
  mutate(complex = mpg * hp / wt) %>%
  left_join(mtcars, by = c("cyl"))
#> Warning in left_join(., mtcars, by = c("cyl")): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.
#>   mpg.x cyl disp.x hp.x drat.x wt.x qsec.x vs.x am.x gear.x carb.x cyl_factor.x
#> 1    21   6    160  110    3.9 2.62  16.46    0    1      4      4            6
#> 2    21   6    160  110    3.9 2.62  16.46    0    1      4      4            6
#> 3    21   6    160  110    3.9 2.62  16.46    0    1      4      4            6
#> 4    21   6    160  110    3.9 2.62  16.46    0    1      4      4            6
#>    complex mpg.y disp.y hp.y drat.y  wt.y qsec.y vs.y am.y gear.y carb.y
#> 1 881.6794  21.0    160  110   3.90 2.620  16.46    0    1      4      4
#> 2 881.6794  21.0    160  110   3.90 2.875  17.02    0    1      4      4
#> 3 881.6794  21.4    258  110   3.08 3.215  19.44    1    0      3      1
#> 4 881.6794  18.1    225  105   2.76 3.460  20.22    1    0      3      1
#>   cyl_factor.y
#> 1            6
#> 2            6
#> 3            6
#> 4            6
#>  [ reached 'max' / getOption("max.print") -- omitted 138 rows ]

# 2. Select early
# Bad: keep all columns
mtcars %>%
  mutate(new = mpg * 2) %>%
  filter(cyl == 4) %>%
  select(mpg, cyl, new)
#> Error in select(., mpg, cyl, new): unused arguments (mpg, cyl, new)

# Good: select needed columns early
mtcars %>%
  select(mpg, cyl, hp, wt) %>%
  filter(cyl == 4) %>%
  mutate(new = mpg * 2)
#> Error in select(., mpg, cyl, hp, wt): unused arguments (mpg, cyl, hp, wt)

# 3. Use data.table for large data
# library(dtplyr)
# lazy_dt(large_data) %>%
#   filter(...) %>%
#   as_tibble()

25.3 Code Organization

🎯 Best Practice: Readable Pipelines

# Good: One operation per line
result <- mtcars %>%
  filter(mpg > 20) %>%
  mutate(
    hp_per_ton = hp / wt,
    efficiency = mpg / hp
  ) %>%
  group_by(cyl) %>%
  summarize(
    avg_efficiency = mean(efficiency),
    n = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_efficiency))

# Good: Use intermediate variables for clarity
filtered_cars <- mtcars %>%
  filter(mpg > 20, cyl %in% c(4, 6))

efficiency_metrics <- filtered_cars %>%
  mutate(
    hp_per_ton = hp / wt,
    efficiency = mpg / hp
  )

final_summary <- efficiency_metrics %>%
  group_by(cyl) %>%
  summarize(
    avg_efficiency = mean(efficiency),
    n = n(),
    .groups = "drop"
  )

# Bad: Everything in one long pipe
# result <- mtcars %>% filter(mpg > 20) %>% mutate(hp_per_ton = hp / wt, efficiency = mpg / hp) %>% group_by(cyl) %>% summarize(avg_efficiency = mean(efficiency), n = n(), .groups = "drop") %>% arrange(desc(avg_efficiency))

25.4 Common Anti-Patterns

⚠️ Avoid These Patterns

# Anti-pattern 1: Growing data frame in loop
# Bad
result <- tibble()
for (i in 1:100) {
  result <- bind_rows(result, tibble(x = i, y = i^2))
}

# Good
result <- tibble(
  x = 1:100,
  y = x^2
)

# Anti-pattern 2: Unnecessary grouping
# Bad
mtcars %>%
  group_by(cyl) %>%
  select(mpg, cyl, hp)  # select doesn't need groups
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Good
mtcars %>%
  select(mpg, cyl, hp) %>%
  group_by(cyl)  # Group only when needed
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Anti-pattern 3: Repeated similar code
# Bad
mtcars %>%
  summarize(
    mean_mpg = mean(mpg),
    mean_hp = mean(hp),
    mean_wt = mean(wt),
    mean_disp = mean(disp)
  )
#>   mean_mpg  mean_hp mean_wt mean_disp
#> 1 20.09062 146.6875 3.21725  230.7219

# Good
mtcars %>%
  summarize(across(c(mpg, hp, wt, disp), mean, .names = "mean_{.col}"))
#>   mean_mpg  mean_hp mean_wt mean_disp
#> 1 20.09062 146.6875 3.21725  230.7219

# Anti-pattern 4: Ignoring NAs
# Bad
data_with_na <- tibble(x = c(1, 2, NA, 4))
data_with_na %>%
  summarize(mean_x = mean(x))  # Returns NA!
#> # A tibble: 1 × 1
#>   mean_x
#>    <dbl>
#> 1     NA

# Good
data_with_na %>%
  summarize(mean_x = mean(x, na.rm = TRUE))
#> # A tibble: 1 × 1
#>   mean_x
#>    <dbl>
#> 1   2.33

25.5 Debugging Strategies

🎯 Best Practice: Debug Effectively

# 1. Check intermediate steps
mtcars %>%
  filter(mpg > 20) %>%
  {cat("After filter:", nrow(.), "rows\n"); .} %>%
  select(mpg, cyl, hp) %>%
  {cat("After select:", ncol(.), "cols\n"); .} %>%
  head(3)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# 2. Use View() in RStudio
# mtcars %>%
#   filter(mpg > 20) %>%
#   View()

# 3. Print summary statistics
debug_data <- function(data, label = "") {
  cat("\n=== Debug:", label, "===\n")
  cat("Rows:", nrow(data), "\n")
  cat("Cols:", ncol(data), "\n")
  cat("Columns:", paste(names(data), collapse = ", "), "\n")
  data
}

mtcars %>%
  debug_data("original") %>%
  filter(mpg > 20) %>%
  debug_data("after filter")
#> 
#> === Debug: after filter ===
#> 
#> === Debug: original ===
#> Rows: 32 
#> Cols: 12 
#> Columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor 
#> Rows: 14 
#> Cols: 12 
#> Columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# 4. Check for common issues
check_pipeline <- function(data) {
  cat("=== Pipeline Check ===\n")
  cat("Rows:", nrow(data), "\n")
  cat("NA counts:\n")
  print(colSums(is.na(data)))
  cat("\nGrouping:", paste(group_vars(data), collapse = ", "), "\n")
  cat("Grouped:", is_grouped_df(data), "\n")
  data
}

25.6 Documentation

🎯 Best Practice: Document Complex Operations

# Good: Comments explain WHY, not WHAT
clean_sales_data <- function(data) {
  data %>%
    # Remove test transactions (internal use only)
    filter(customer_id >= 1000) %>%
    # Standardize date format for downstream systems
    mutate(date = as.Date(date)) %>%
    # Calculate revenue (excluding tax per accounting rules)
    mutate(revenue = quantity * price) %>%
    # Remove cancelled orders (status code 99)
    filter(status != 99)
}

# Good: Function documentation
#' Calculate customer lifetime value
#'
#' @param orders Data frame with customer_id, date, amount
#' @param window_days Number of days to consider (default: 365)
#' @return Data frame with customer_id and lifetime_value
calculate_ltv <- function(orders, window_days = 365) {
  cutoff_date <- Sys.Date() - window_days
  
  orders %>%
    filter(date >= cutoff_date) %>%
    group_by(customer_id) %>%
    summarize(
      lifetime_value = sum(amount),
      n_orders = n(),
      .groups = "drop"
    )
}

25.7 Error Handling

🎯 Best Practice: Handle Errors Gracefully

# Validate inputs
safe_summarize <- function(data, col) {
  # Check data frame
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }
  
  # Check column exists
  if (!col %in% names(data)) {
    stop("Column '", col, "' not found. ",
         "Available: ", paste(names(data), collapse = ", "))
  }
  
  # Check numeric
  if (!is.numeric(data[[col]])) {
    stop("Column '", col, "' must be numeric")
  }
  
  # Perform operation
  data %>%
    summarize(
      mean = mean(.data[[col]], na.rm = TRUE),
      sd = sd(.data[[col]], na.rm = TRUE),
      n = sum(!is.na(.data[[col]]))
    )
}

# Test
safe_summarize(mtcars, "mpg")
#>       mean       sd  n
#> 1 20.09062 6.026948 32
safe_summarize(mtcars, "missing_col")
#> Error in safe_summarize(mtcars, "missing_col"): Column 'missing_col' not found. Available: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor

25.8 Testing Pipelines

🎯 Best Practice: Test Your Code

# Create test function
test_clean_data <- function() {
  # Setup test data
  test_data <- tibble(
    id = 1:5,
    value = c(10, NA, 30, 40, 50),
    group = c("A", "A", "B", "B", "C")
  )
  
  # Run pipeline
  result <- test_data %>%
    filter(!is.na(value)) %>%
    group_by(group) %>%
    summarize(mean_value = mean(value), .groups = "drop")
  
  # Check results
  stopifnot(nrow(result) == 3)  # 3 groups
  stopifnot(all(!is.na(result$mean_value)))  # No NAs
  stopifnot("group" %in% names(result))  # Has group column
  
  message("✓ All tests passed")
  TRUE
}

test_clean_data()
#> ✓ All tests passed
#> [1] TRUE

25.9 Summary

Key Takeaways:

  1. Filter early - Reduce data size first
  2. Select needed columns - Don’t carry extra data
  3. One operation per line - Readable pipelines
  4. Use across() - Avoid repetition
  5. Handle NAs - Always specify na.rm
  6. Ungroup - Clean up after group operations
  7. Document WHY - Not what code does
  8. Test code - Catch errors early

Best Practices:

# ✅ Excellent pipeline
result <- data %>%
  # Filter first (reduce data)
  filter(important_condition) %>%
  # Select needed columns
  select(needed_cols) %>%
  # Transform
  mutate(new_col = calculation) %>%
  # Aggregate
  group_by(category) %>%
  summarize(
    metric = mean(value, na.rm = TRUE),
    n = n(),
    .groups = "drop"  # Always ungroup
  ) %>%
  # Sort
  arrange(desc(metric))

# ✅ Use functions for repeated logic
calculate_stats <- function(data, group_var, value_var) {
  data %>%
    group_by({{ group_var }}) %>%
    summarize(
      across({{ value_var }},
             list(mean = ~mean(., na.rm = TRUE),
                  sd = ~sd(., na.rm = TRUE)),
             .names = "{.col}_{.fn}"),
      .groups = "drop"
    )
}

# ✅ Document complex operations
# Remove outliers using IQR method (1.5 * IQR)
clean_data <- data %>%
  group_by(category) %>%
  filter(
    value >= quantile(value, 0.25) - 1.5 * IQR(value),
    value <= quantile(value, 0.75) + 1.5 * IQR(value)
  ) %>%
  ungroup()

Anti-Patterns to Avoid:

# ❌ Bad
Growing data in loops
Unnecessary grouping
Repeated similar code
Ignoring NAs
No error checking
Complex one-liners
No comments on complex logic

# ✅ Good
Vectorized operations
Group only when needed
Use across() for repetition
Always handle NAs
Validate inputs
Break into readable steps
Document WHY not WHAT

25.10 Exercise

📝 Exercise: Refactor Pipeline

Improve this code:

result <- data %>%
  mutate(x = a + b, y = c + d, z = e + f) %>%
  filter(category == "A" | category == "B" | category == "C") %>%
  group_by(category) %>%
  summarize(mean_x = mean(x), mean_y = mean(y), mean_z = mean(z)) %>%
  filter(mean_x > 0)

25.11 Exercise Answer

Click to see answer
# Refactored version
result <- data %>%
  # Filter early for better performance
  filter(category %in% c("A", "B", "C")) %>%
  # Use across() to avoid repetition
  mutate(
    across(c(a, b), .names = "sum_{.col}",
           .fns = ~ . + .)
  ) %>%
  # Group for aggregation
  group_by(category) %>%
  # Use across() in summarize
  summarize(
    across(starts_with("sum_"),
           mean,
           na.rm = TRUE,
           .names = "mean_{.col}"),
    .groups = "drop"
  ) %>%
  # Filter on summarized values
  filter(mean_sum_a > 0)

25.12 Completion

Part VIII Complete!

You’ve mastered: - dplyr fundamentals (filter, select, mutate, etc.) - Grouping and summarizing - Joining data - Reshaping with tidyr - Advanced tidy evaluation - Best practices and optimization

Ready for: Part IX (Graphics) or other advanced topics!