Chapter 20 Introduction to dplyr

What You’ll Learn:

  • dplyr basics and philosophy
  • The pipe operator (%>% and |>)
  • Core dplyr verbs
  • Common dplyr errors
  • Tidy evaluation basics

Key Errors Covered: 20+ dplyr errors

Difficulty: ⭐⭐ Intermediate

20.1 Introduction

dplyr revolutionized R data manipulation:

library(dplyr)

# Try to filter
mtcars %>% filter(mpg > 20)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Works! But errors are common...
mtcars %>% filter(mpg > 20 & cyl = 4)  # Oops!
#> Error in parse(text = input): <text>:2:34: unexpected '='
#> 1: # Works! But errors are common...
#> 2: mtcars %>% filter(mpg > 20 & cyl =
#>                                     ^

Let’s master dplyr to avoid these pitfalls.

20.2 dplyr Basics

💡 Key Insight: The dplyr Philosophy

library(dplyr)

# Core verbs (functions)
# 1. filter() - keep rows
# 2. select() - keep columns
# 3. mutate() - create/modify columns
# 4. arrange() - sort rows
# 5. summarize() - aggregate data
# 6. group_by() - group for operations

# Each returns a data frame
# Easy to chain with pipe

# Example data
head(mtcars)
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1          6
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2          8
#> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1          6

Key principles: - Each function does one thing well - First argument is always data - Returns a data frame - Works with pipes - NSE (non-standard evaluation) for column names

20.3 The Pipe Operator

💡 Key Insight: Pipe Operators

# Traditional nesting (hard to read)
result <- arrange(
  filter(
    select(mtcars, mpg, cyl, hp),
    mpg > 20
  ),
  desc(hp)
)
#> Error in select(mtcars, mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# With magrittr pipe %>%
result <- mtcars %>%
  select(mpg, cyl, hp) %>%
  filter(mpg > 20) %>%
  arrange(desc(hp))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Native pipe |> (R >= 4.1)
result <- mtcars |>
  select(mpg, cyl, hp) |>
  filter(mpg > 20) |>
  arrange(desc(hp))
#> Error in select(mtcars, mpg, cyl, hp): unused arguments (mpg, cyl, hp)

result
#> [1] "Success!"

# How pipe works
# x %>% f(y) becomes f(x, y)
# x %>% f(y) %>% g(z) becomes g(f(x, y), z)

# Pipe sends left side as first argument to right side
mtcars %>% head(3)
#>                mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4     21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710    22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4
# Same as:
head(mtcars, 3)
#>                mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4     21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710    22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4

Differences between %>% and |>: - %>% from magrittr (more flexible) - |> native R (R >= 4.1, faster) - %>% has more features (., shortcuts) - |> simpler, no package needed

20.4 Error #1: object not found with pipes

⭐ BEGINNER 🔍 SCOPE

20.4.1 The Error

library(dplyr)

mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp) %>%
  arrange(horsepower)  # Wrong column name!
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

🔴 ERROR

Error in arrange():
! Can't subset columns that don't exist.
✖ Column `horsepower` doesn't exist.

20.4.2 What It Means

You’re referencing a column that doesn’t exist in the data.

20.4.3 Common Causes

20.4.3.1 Cause 1: Typo in Column Name

mtcars %>%
  select(mpgg, cyl)  # Typo: mpgg instead of mpg
#> Error in select(., mpgg, cyl): unused arguments (mpgg, cyl)

20.4.3.2 Cause 2: Column Already Removed

mtcars %>%
  select(mpg, cyl) %>%
  filter(hp > 100)  # hp not in selected columns!
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

20.4.3.3 Cause 3: Wrong Dataset

# Expecting different columns
iris %>%
  select(mpg, cyl)  # iris doesn't have these columns
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

20.4.4 Solutions

SOLUTION 1: Check Column Names

# Before piping, check what you have
names(mtcars)
#>  [1] "mpg"        "cyl"        "disp"       "hp"         "drat"      
#>  [6] "wt"         "qsec"       "vs"         "am"         "gear"      
#> [11] "carb"       "cyl_factor"

# Or with glimpse
glimpse(mtcars)
#> Rows: 32
#> Columns: 12
#> $ mpg        <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2,…
#> $ cyl        <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4,…
#> $ disp       <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140…
#> $ hp         <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 18…
#> $ drat       <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92,…
#> $ wt         <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.1…
#> $ qsec       <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.…
#> $ vs         <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
#> $ am         <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
#> $ gear       <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4,…
#> $ carb       <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1,…
#> $ cyl_factor <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4,…

# In pipe chain
mtcars %>%
  names()
#>  [1] "mpg"        "cyl"        "disp"       "hp"         "drat"      
#>  [6] "wt"         "qsec"       "vs"         "am"         "gear"      
#> [11] "carb"       "cyl_factor"

SOLUTION 2: Order Operations Correctly

# Filter before selecting
mtcars %>%
  filter(hp > 100) %>%  # Use hp while it exists
  select(mpg, cyl)      # Then select subset
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

# Or keep needed columns
mtcars %>%
  select(mpg, cyl, hp) %>%  # Keep hp
  filter(hp > 100)          # Now can use it
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

SOLUTION 3: Use Column Existence Check

safe_filter <- function(data, ...) {
  # Check if columns exist
  expr <- rlang::enexpr(...)
  cols_used <- all.vars(expr)
  
  missing_cols <- setdiff(cols_used, names(data))
  
  if (length(missing_cols) > 0) {
    stop("Columns not found: ", paste(missing_cols, collapse = ", "))
  }
  
  filter(data, ...)
}

# Will give helpful error
# mtcars %>% safe_filter(horsepower > 100)

20.5 filter() Basics

💡 Key Insight: filter() Patterns

# Keep rows where condition is TRUE
mtcars %>%
  filter(mpg > 20)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# Multiple conditions (AND)
mtcars %>%
  filter(mpg > 20, cyl == 4)
#>                 mpg cyl  disp hp drat    wt  qsec vs am gear carb cyl_factor
#> Datsun 710     22.8   4 108.0 93 3.85 2.320 18.61  1  1    4    1          4
#> Merc 240D      24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2          4
#> Toyota Corolla 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1          4
#> Toyota Corona  21.5   4 120.1 97 3.70 2.465 20.01  1  0    3    1          4
#> Fiat X1-9      27.3   4  79.0 66 4.08 1.935 18.90  1  1    4    1          4
#>  [ reached 'max' / getOption("max.print") -- omitted 3 rows ]

# Same as
mtcars %>%
  filter(mpg > 20 & cyl == 4)
#>                 mpg cyl  disp hp drat    wt  qsec vs am gear carb cyl_factor
#> Datsun 710     22.8   4 108.0 93 3.85 2.320 18.61  1  1    4    1          4
#> Merc 240D      24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2          4
#> Toyota Corolla 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1          4
#> Toyota Corona  21.5   4 120.1 97 3.70 2.465 20.01  1  0    3    1          4
#> Fiat X1-9      27.3   4  79.0 66 4.08 1.935 18.90  1  1    4    1          4
#>  [ reached 'max' / getOption("max.print") -- omitted 3 rows ]

# OR condition
mtcars %>%
  filter(mpg > 25 | hp > 200)
#>                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#> Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#> Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
#> Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
#>                     cyl_factor
#> Duster 360                   8
#> Cadillac Fleetwood           8
#> Lincoln Continental          8
#> Chrysler Imperial            8
#> Fiat 128                     4
#> Honda Civic                  4
#> Toyota Corolla               4
#> Camaro Z28                   8
#>  [ reached 'max' / getOption("max.print") -- omitted 5 rows ]

# Complex conditions
mtcars %>%
  filter(mpg > 20 & (cyl == 4 | cyl == 6))
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# Using %in%
mtcars %>%
  filter(cyl %in% c(4, 6))
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4          6
#>  [ reached 'max' / getOption("max.print") -- omitted 10 rows ]

# Negation
mtcars %>%
  filter(!(cyl %in% c(4, 6)))
#>                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
#> Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#> Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
#> Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
#> Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
#> Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#>                     cyl_factor
#> Hornet Sportabout            8
#> Duster 360                   8
#> Merc 450SE                   8
#> Merc 450SL                   8
#> Merc 450SLC                  8
#> Cadillac Fleetwood           8
#> Lincoln Continental          8
#> Chrysler Imperial            8
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# Or
mtcars %>%
  filter(!cyl %in% c(4, 6))
#>                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
#> Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#> Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
#> Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
#> Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
#> Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#>                     cyl_factor
#> Hornet Sportabout            8
#> Duster 360                   8
#> Merc 450SE                   8
#> Merc 450SL                   8
#> Merc 450SLC                  8
#> Cadillac Fleetwood           8
#> Lincoln Continental          8
#> Chrysler Imperial            8
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# Filter with string operations
mtcars %>%
  rownames_to_column("car") %>%
  filter(grepl("Merc", car))
#>           car  mpg cyl  disp  hp drat   wt qsec vs am gear carb cyl_factor
#> 1   Merc 240D 24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2          4
#> 2    Merc 230 22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2          4
#> 3    Merc 280 19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4          6
#> 4   Merc 280C 17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4          6
#> 5  Merc 450SE 16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3          8
#> 6  Merc 450SL 17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3          8
#> 7 Merc 450SLC 15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3          8

# Remove NAs
data_with_na <- tibble(x = c(1, 2, NA, 4))
data_with_na %>%
  filter(!is.na(x))
#> # A tibble: 3 × 1
#>       x
#>   <dbl>
#> 1     1
#> 2     2
#> 3     4

20.6 Error #2: Using = instead of ==

⭐ BEGINNER 🔤 SYNTAX

20.6.1 The Error

mtcars %>%
  filter(cyl = 4)  # Wrong: = is assignment
#> Error in `filter()`:
#> ! We detected a named input.
#> ℹ This usually means that you've used `=` instead of `==`.
#> ℹ Did you mean `cyl == 4`?

🔴 ERROR

Error in filter():
! We detected a named input.
ℹ This usually means that you've used `=` instead of `==`.
ℹ Did you mean `cyl == 4`?

20.6.2 What It Means

You used assignment (=) instead of comparison (==).

20.6.3 Solutions

SOLUTION: Use == for Comparison

# Correct: use ==
mtcars %>%
  filter(cyl == 4)
#>                 mpg cyl  disp hp drat    wt  qsec vs am gear carb cyl_factor
#> Datsun 710     22.8   4 108.0 93 3.85 2.320 18.61  1  1    4    1          4
#> Merc 240D      24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2          4
#> Toyota Corolla 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1          4
#> Toyota Corona  21.5   4 120.1 97 3.70 2.465 20.01  1  0    3    1          4
#> Fiat X1-9      27.3   4  79.0 66 4.08 1.935 18.90  1  1    4    1          4
#>  [ reached 'max' / getOption("max.print") -- omitted 3 rows ]

# Other comparison operators
mtcars %>%
  filter(mpg > 20)      # Greater than
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

mtcars %>%
  filter(mpg >= 20)     # Greater or equal
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

mtcars %>%
  filter(mpg < 20)      # Less than
#>                    mpg cyl  disp  hp drat   wt  qsec vs am gear carb cyl_factor
#> Hornet Sportabout 18.7   8 360.0 175 3.15 3.44 17.02  0  0    3    2          8
#> Valiant           18.1   6 225.0 105 2.76 3.46 20.22  1  0    3    1          6
#> Duster 360        14.3   8 360.0 245 3.21 3.57 15.84  0  0    3    4          8
#> Merc 280          19.2   6 167.6 123 3.92 3.44 18.30  1  0    4    4          6
#> Merc 280C         17.8   6 167.6 123 3.92 3.44 18.90  1  0    4    4          6
#> Merc 450SE        16.4   8 275.8 180 3.07 4.07 17.40  0  0    3    3          8
#> Merc 450SL        17.3   8 275.8 180 3.07 3.73 17.60  0  0    3    3          8
#> Merc 450SLC       15.2   8 275.8 180 3.07 3.78 18.00  0  0    3    3          8
#>  [ reached 'max' / getOption("max.print") -- omitted 10 rows ]

mtcars %>%
  filter(mpg <= 20)     # Less or equal
#>                    mpg cyl  disp  hp drat   wt  qsec vs am gear carb cyl_factor
#> Hornet Sportabout 18.7   8 360.0 175 3.15 3.44 17.02  0  0    3    2          8
#> Valiant           18.1   6 225.0 105 2.76 3.46 20.22  1  0    3    1          6
#> Duster 360        14.3   8 360.0 245 3.21 3.57 15.84  0  0    3    4          8
#> Merc 280          19.2   6 167.6 123 3.92 3.44 18.30  1  0    4    4          6
#> Merc 280C         17.8   6 167.6 123 3.92 3.44 18.90  1  0    4    4          6
#> Merc 450SE        16.4   8 275.8 180 3.07 4.07 17.40  0  0    3    3          8
#> Merc 450SL        17.3   8 275.8 180 3.07 3.73 17.60  0  0    3    3          8
#> Merc 450SLC       15.2   8 275.8 180 3.07 3.78 18.00  0  0    3    3          8
#>  [ reached 'max' / getOption("max.print") -- omitted 10 rows ]

mtcars %>%
  filter(mpg != 4)      # Not equal
#>                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
#> Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
#> Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#> Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
#>                   cyl_factor
#> Mazda RX4                  6
#> Mazda RX4 Wag              6
#> Datsun 710                 4
#> Hornet 4 Drive             6
#> Hornet Sportabout          8
#> Valiant                    6
#> Duster 360                 8
#> Merc 240D                  4
#>  [ reached 'max' / getOption("max.print") -- omitted 24 rows ]

# For NA, use is.na()
data_with_na <- tibble(x = c(1, NA, 3))
data_with_na %>%
  filter(is.na(x))      # Keep NAs
#> # A tibble: 1 × 1
#>       x
#>   <dbl>
#> 1    NA

data_with_na %>%
  filter(!is.na(x))     # Remove NAs
#> # A tibble: 2 × 1
#>       x
#>   <dbl>
#> 1     1
#> 2     3

20.7 select() Basics

💡 Key Insight: select() Patterns

# Select specific columns
mtcars %>%
  select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Select range
mtcars %>%
  select(mpg:hp)
#> Error in select(., mpg:hp): unused argument (mpg:hp)

# Select by position
mtcars %>%
  select(1:3)
#> Error in select(., 1:3): unused argument (1:3)

# Exclude columns
mtcars %>%
  select(-mpg, -cyl)
#> Error in select(., -mpg, -cyl): unused arguments (-mpg, -cyl)

# Exclude range
mtcars %>%
  select(-(mpg:disp))
#> Error in select(., -(mpg:disp)): unused argument (-(mpg:disp))

# Helper functions
# starts_with()
iris %>%
  select(starts_with("Sepal"))
#> Error in select(., starts_with("Sepal")): unused argument (starts_with("Sepal"))

# ends_with()
iris %>%
  select(ends_with("Width"))
#> Error in select(., ends_with("Width")): unused argument (ends_with("Width"))

# contains()
iris %>%
  select(contains("etal"))
#> Error in select(., contains("etal")): unused argument (contains("etal"))

# matches() - regex
mtcars %>%
  select(matches("^d"))  # Starts with 'd'
#> Error in select(., matches("^d")): unused argument (matches("^d"))

# everything() - all remaining
mtcars %>%
  select(mpg, cyl, everything())
#> Error in select(., mpg, cyl, everything()): unused arguments (mpg, cyl, everything())

# where() - by condition
mtcars %>%
  select(where(is.numeric))
#> Error in select(., where(is.numeric)): unused argument (where(is.numeric))

# Rename while selecting
mtcars %>%
  select(miles_per_gallon = mpg, cylinders = cyl)
#> Error in select(., miles_per_gallon = mpg, cylinders = cyl): unused arguments (miles_per_gallon = mpg, cylinders = cyl)

# Just rename (keep all)
mtcars %>%
  rename(miles_per_gallon = mpg)
#>                   miles_per_gallon cyl  disp  hp drat    wt  qsec vs am gear
#> Mazda RX4                     21.0   6 160.0 110 3.90 2.620 16.46  0  1    4
#> Mazda RX4 Wag                 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4
#> Datsun 710                    22.8   4 108.0  93 3.85 2.320 18.61  1  1    4
#> Hornet 4 Drive                21.4   6 258.0 110 3.08 3.215 19.44  1  0    3
#> Hornet Sportabout             18.7   8 360.0 175 3.15 3.440 17.02  0  0    3
#> Valiant                       18.1   6 225.0 105 2.76 3.460 20.22  1  0    3
#> Duster 360                    14.3   8 360.0 245 3.21 3.570 15.84  0  0    3
#> Merc 240D                     24.4   4 146.7  62 3.69 3.190 20.00  1  0    4
#>                   carb cyl_factor
#> Mazda RX4            4          6
#> Mazda RX4 Wag        4          6
#> Datsun 710           1          4
#> Hornet 4 Drive       1          6
#> Hornet Sportabout    2          8
#> Valiant              1          6
#> Duster 360           4          8
#> Merc 240D            2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 24 rows ]

20.8 Error #3: unused argument in select

⭐⭐ INTERMEDIATE 📋 ARGS

20.8.1 The Error

mtcars %>%
  select(mpg, cyl, cyl == 4)  # Can't filter in select!
#> Error in select(., mpg, cyl, cyl == 4): unused arguments (mpg, cyl, cyl == 4)

🔴 ERROR

Error in select():
! Can't subset columns with `cyl == 4`.
✖ `cyl == 4` must be numeric or character, not `logical`.

20.8.2 What It Means

You’re trying to use filter logic in select, or using wrong syntax.

20.8.3 Common Causes

# Trying to filter in select
mtcars %>%
  select(mpg, cyl > 4)
#> Error in select(., mpg, cyl > 4): unused arguments (mpg, cyl > 4)

# Wrong helper syntax
mtcars %>%
  select(starts_with(Sepal))  # Missing quotes
#> Error in select(., starts_with(Sepal)): unused argument (starts_with(Sepal))

20.8.4 Solutions

SOLUTION: Separate select and filter

# Correct: separate operations
mtcars %>%
  select(mpg, cyl, hp) %>%
  filter(cyl == 4)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Or filter first
mtcars %>%
  filter(cyl == 4) %>%
  select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Use quotes with helpers
iris %>%
  select(starts_with("Sepal"))
#> Error in select(., starts_with("Sepal")): unused argument (starts_with("Sepal"))

20.9 mutate() Basics

💡 Key Insight: mutate() Patterns

# Create new column
mtcars %>%
  mutate(mpg_per_cyl = mpg / cyl) %>%
  select(mpg, cyl, mpg_per_cyl)
#> Error in select(., mpg, cyl, mpg_per_cyl): unused arguments (mpg, cyl, mpg_per_cyl)

# Multiple new columns
mtcars %>%
  mutate(
    kpl = mpg * 0.425,
    wt_kg = wt * 453.592
  ) %>%
  select(mpg, kpl, wt, wt_kg)
#> Error in select(., mpg, kpl, wt, wt_kg): unused arguments (mpg, kpl, wt, wt_kg)

# Modify existing column
mtcars %>%
  mutate(hp = hp * 0.746) %>%  # Convert to kW
  select(hp)
#> Error in select(., hp): unused argument (hp)

# Use newly created columns
mtcars %>%
  mutate(
    hp_per_cyl = hp / cyl,
    hp_ratio = hp_per_cyl / mean(hp_per_cyl)
  ) %>%
  select(hp, cyl, hp_per_cyl, hp_ratio)
#> Error in select(., hp, cyl, hp_per_cyl, hp_ratio): unused arguments (hp, cyl, hp_per_cyl, hp_ratio)

# Conditional creation
mtcars %>%
  mutate(
    efficiency = case_when(
      mpg > 25 ~ "High",
      mpg > 20 ~ "Medium",
      TRUE ~ "Low"
    )
  ) %>%
  select(mpg, efficiency)
#> Error in select(., mpg, efficiency): unused arguments (mpg, efficiency)

# With ifelse
mtcars %>%
  mutate(
    heavy = ifelse(wt > 3.5, "Heavy", "Light")
  ) %>%
  select(wt, heavy)
#> Error in select(., wt, heavy): unused arguments (wt, heavy)

# Keep only new columns with transmute()
mtcars %>%
  transmute(
    car = rownames(mtcars),
    kpl = mpg * 0.425
  )
#>                                     car     kpl
#> Mazda RX4                     Mazda RX4  8.9250
#> Mazda RX4 Wag             Mazda RX4 Wag  8.9250
#> Datsun 710                   Datsun 710  9.6900
#> Hornet 4 Drive           Hornet 4 Drive  9.0950
#> Hornet Sportabout     Hornet Sportabout  7.9475
#> Valiant                         Valiant  7.6925
#> Duster 360                   Duster 360  6.0775
#> Merc 240D                     Merc 240D 10.3700
#> Merc 230                       Merc 230  9.6900
#> Merc 280                       Merc 280  8.1600
#> Merc 280C                     Merc 280C  7.5650
#> Merc 450SE                   Merc 450SE  6.9700
#> Merc 450SL                   Merc 450SL  7.3525
#> Merc 450SLC                 Merc 450SLC  6.4600
#> Cadillac Fleetwood   Cadillac Fleetwood  4.4200
#> Lincoln Continental Lincoln Continental  4.4200
#> Chrysler Imperial     Chrysler Imperial  6.2475
#> Fiat 128                       Fiat 128 13.7700
#> Honda Civic                 Honda Civic 12.9200
#> Toyota Corolla           Toyota Corolla 14.4075
#> Toyota Corona             Toyota Corona  9.1375
#> Dodge Challenger       Dodge Challenger  6.5875
#> AMC Javelin                 AMC Javelin  6.4600
#> Camaro Z28                   Camaro Z28  5.6525
#> Pontiac Firebird       Pontiac Firebird  8.1600
#> Fiat X1-9                     Fiat X1-9 11.6025
#> Porsche 914-2             Porsche 914-2 11.0500
#> Lotus Europa               Lotus Europa 12.9200
#> Ford Pantera L           Ford Pantera L  6.7150
#> Ferrari Dino               Ferrari Dino  8.3725
#> Maserati Bora             Maserati Bora  6.3750
#> Volvo 142E                   Volvo 142E  9.0950

20.10 Error #4: object not found in mutate

⭐⭐ INTERMEDIATE 🔍 SCOPE

20.10.1 The Error

mpg_threshold <- 20

mtcars %>%
  mutate(efficient = mpg > mpg_treshold)  # Typo!
#> Error in `mutate()`:
#> ℹ In argument: `efficient = mpg > mpg_treshold`.
#> Caused by error:
#> ! object 'mpg_treshold' not found

🔴 ERROR

Error in mutate():
! object 'mpg_treshold' not found

20.10.2 What It Means

Referenced an object or column that doesn’t exist.

20.10.3 Common Causes

# Typo in external variable
threshold <- 20
mtcars %>%
  mutate(high = mpg > threshhold)
#> Error in `mutate()`:
#> ℹ In argument: `high = mpg > threshhold`.
#> Caused by error:
#> ! object 'threshhold' not found

# Typo in column name
mtcars %>%
  mutate(power_ratio = horsepower / wt)
#> Error in `mutate()`:
#> ℹ In argument: `power_ratio = horsepower/wt`.
#> Caused by error:
#> ! object 'horsepower' not found

# Using column before creating it
mtcars %>%
  mutate(
    mpg_ratio = mpg_scaled / mean(mpg_scaled),
    mpg_scaled = mpg / mean(mpg)
  )
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1          6
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2          8
#> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1          6
#> Duster 360        14.3   8  360 245 3.21 3.570 15.84  0  0    3    4          8
#>                       mpg_ratio mpg_scaled
#> Mazda RX4          2.121443e+15  1.0452636
#> Mazda RX4 Wag      2.121443e+15  1.0452636
#> Datsun 710         6.320589e+15  1.1348577
#> Hornet 4 Drive     3.054587e+15  1.0651734
#> Hornet Sportabout -3.244132e+15  0.9307824
#> Valiant           -4.643847e+15  0.9009177
#> Duster 360        -1.350871e+16  0.7117748
#>  [ reached 'max' / getOption("max.print") -- omitted 25 rows ]

20.10.4 Solutions

SOLUTION 1: Check Names Carefully

# Check column names
names(mtcars)
#>  [1] "mpg"        "cyl"        "disp"       "hp"         "drat"      
#>  [6] "wt"         "qsec"       "vs"         "am"         "gear"      
#> [11] "carb"       "cyl_factor"

# Check variable exists
threshold <- 20
exists("threshold")
#> [1] TRUE

# Correct usage
mtcars %>%
  mutate(efficient = mpg > threshold) %>%
  select(mpg, efficient)
#> Error in select(., mpg, efficient): unused arguments (mpg, efficient)

SOLUTION 2: Order Operations

# Create columns in right order
mtcars %>%
  mutate(
    mpg_scaled = mpg / mean(mpg),
    mpg_ratio = mpg_scaled / mean(mpg_scaled)
  ) %>%
  select(mpg, mpg_scaled, mpg_ratio)
#> Error in select(., mpg, mpg_scaled, mpg_ratio): unused arguments (mpg, mpg_scaled, mpg_ratio)

SOLUTION 3: Use .data pronoun

# Explicit about column reference
mtcars %>%
  mutate(power_ratio = .data$hp / .data$wt) %>%
  select(hp, wt, power_ratio)
#> Error in select(., hp, wt, power_ratio): unused arguments (hp, wt, power_ratio)

# Useful in functions
calculate_ratio <- function(data, num_col, denom_col) {
  data %>%
    mutate(ratio = .data[[num_col]] / .data[[denom_col]])
}

calculate_ratio(mtcars, "hp", "wt") %>%
  select(hp, wt, ratio)
#> Error in select(., hp, wt, ratio): unused arguments (hp, wt, ratio)

20.11 arrange() Basics

💡 Key Insight: arrange() Patterns

# Sort ascending (default)
mtcars %>%
  arrange(mpg) %>%
  select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Sort descending
mtcars %>%
  arrange(desc(mpg)) %>%
  select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Multiple columns
mtcars %>%
  arrange(cyl, desc(mpg)) %>%
  select(cyl, mpg, hp)
#> Error in select(., cyl, mpg, hp): unused arguments (cyl, mpg, hp)

# With NA handling
data_with_na <- tibble(
  x = c(3, 1, NA, 2),
  y = c("a", "b", "c", "d")
)

data_with_na %>%
  arrange(x)  # NAs last by default
#> # A tibble: 4 × 2
#>       x y    
#>   <dbl> <chr>
#> 1     1 b    
#> 2     2 d    
#> 3     3 a    
#> 4    NA c

# NAs first
data_with_na %>%
  arrange(desc(is.na(x)), x)
#> # A tibble: 4 × 2
#>       x y    
#>   <dbl> <chr>
#> 1    NA c    
#> 2     1 b    
#> 3     2 d    
#> 4     3 a

20.12 summarize() Basics

💡 Key Insight: summarize() Patterns

# Single summary
mtcars %>%
  summarize(avg_mpg = mean(mpg))
#>    avg_mpg
#> 1 20.09062

# Multiple summaries
mtcars %>%
  summarize(
    avg_mpg = mean(mpg),
    sd_mpg = sd(mpg),
    min_mpg = min(mpg),
    max_mpg = max(mpg),
    n = n()
  )
#>    avg_mpg   sd_mpg min_mpg max_mpg  n
#> 1 20.09062 6.026948    10.4    33.9 32

# With grouping (next chapter covers in detail)
mtcars %>%
  group_by(cyl) %>%
  summarize(
    avg_mpg = mean(mpg),
    count = n()
  )
#> # A tibble: 3 × 3
#>     cyl avg_mpg count
#>   <dbl>   <dbl> <int>
#> 1     4    26.7    11
#> 2     6    19.7     7
#> 3     8    15.1    14

# Useful summary functions
mtcars %>%
  summarize(
    mean = mean(mpg),
    median = median(mpg),
    sd = sd(mpg),
    min = min(mpg),
    max = max(mpg),
    q25 = quantile(mpg, 0.25),
    q75 = quantile(mpg, 0.75),
    n = n(),
    n_distinct = n_distinct(cyl)
  )
#>       mean median       sd  min  max    q25  q75  n n_distinct
#> 1 20.09062   19.2 6.026948 10.4 33.9 15.425 22.8 32          3

20.13 Common Patterns

🎯 Best Practice: Common dplyr Workflows

# 1. Filter → Select → Arrange
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp) %>%
  arrange(desc(mpg))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# 2. Create variables → Summarize
mtcars %>%
  mutate(hp_per_cyl = hp / cyl) %>%
  summarize(
    avg_hp_per_cyl = mean(hp_per_cyl),
    max_hp_per_cyl = max(hp_per_cyl)
  )
#>   avg_hp_per_cyl max_hp_per_cyl
#> 1        23.0013         41.875

# 3. Group → Summarize → Arrange
mtcars %>%
  group_by(cyl) %>%
  summarize(
    avg_mpg = mean(mpg),
    count = n()
  ) %>%
  arrange(desc(avg_mpg))
#> # A tibble: 3 × 3
#>     cyl avg_mpg count
#>   <dbl>   <dbl> <int>
#> 1     4    26.7    11
#> 2     6    19.7     7
#> 3     8    15.1    14

# 4. Top N
mtcars %>%
  arrange(desc(mpg)) %>%
  slice_head(n = 5)
#>                 mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Toyota Corolla 33.9   4 71.1  65 4.22 1.835 19.90  1  1    4    1          4
#> Fiat 128       32.4   4 78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2          4
#> Lotus Europa   30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2          4
#> Fiat X1-9      27.3   4 79.0  66 4.08 1.935 18.90  1  1    4    1          4

# 5. Distinct values
mtcars %>%
  distinct(cyl, gear)
#>                   cyl gear
#> Mazda RX4           6    4
#> Datsun 710          4    4
#> Hornet 4 Drive      6    3
#> Hornet Sportabout   8    3
#> Toyota Corona       4    3
#> Porsche 914-2       4    5
#> Ford Pantera L      8    5
#> Ferrari Dino        6    5

# 6. Count
mtcars %>%
  count(cyl, sort = TRUE)
#>   cyl  n
#> 1   8 14
#> 2   4 11
#> 3   6  7

# 7. Add row numbers
mtcars %>%
  mutate(row_id = row_number()) %>%
  select(row_id, everything())
#> Error in select(., row_id, everything()): unused arguments (row_id, everything())

20.14 Error #5: Forgetting to assign or print

⭐ BEGINNER 💡 USAGE

20.14.1 The Problem

# Doesn't save result!
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl)
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

# Original unchanged
head(mtcars, 2)
#>               mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4          6

20.14.2 What It Means

dplyr doesn’t modify in place - must assign result.

20.14.3 Solutions

SOLUTION 1: Assign Result

# Save to new variable
high_mpg <- mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl)
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

head(high_mpg, 3)
#> Error: object 'high_mpg' not found

# Or overwrite (careful!)
mtcars_filtered <- mtcars %>%
  filter(mpg > 20)

SOLUTION 2: Just Print (for exploration)

# Print automatically (no assignment)
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl)
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

# Explicit print
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl) %>%
  print()
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

20.15 Debugging dplyr Chains

🎯 Best Practice: Debug Pipe Chains

# 1. Run step by step
# Start here
mtcars %>%
  filter(mpg > 20)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# Add next step
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Continue...
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp) %>%
  arrange(desc(hp))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# 2. Check intermediate results
mtcars %>%
  filter(mpg > 20) %>%
  {cat("After filter:", nrow(.), "rows\n"); .} %>%
  select(mpg, cyl, hp) %>%
  {cat("After select:", ncol(.), "cols\n"); .} %>%
  arrange(desc(hp))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# 3. Use View() in RStudio
# mtcars %>%
#   filter(mpg > 20) %>%
#   View()

# 4. Print at each step
debug_pipe <- function(data, label = NULL) {
  if (!is.null(label)) {
    cat("\n===", label, "===\n")
  }
  print(head(data, 3))
  data
}

mtcars %>%
  filter(mpg > 20) %>%
  debug_pipe("After filter") %>%
  select(mpg, cyl, hp) %>%
  debug_pipe("After select")
#> 
#> === After select ===
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

20.16 Summary

Key Takeaways:

  1. dplyr philosophy - Simple verbs, chainable with pipes
  2. Pipe operator - %>% or |> makes code readable
  3. filter() uses == - Not = for comparison
  4. Check column names - Use names() or glimpse()
  5. Order matters - Can’t use columns after removing them
  6. Assign results - dplyr doesn’t modify in place
  7. Debug step-by-step - Run pipes incrementally

Quick Reference:

Error Cause Fix
object not found Column doesn’t exist Check names, order operations
Using = not == Assignment in filter Use == for comparison
unused argument Wrong function usage Check function signature
Result not saved No assignment Assign with <-

Core Verbs:

# filter() - keep rows
data %>% filter(condition)

# select() - keep columns
data %>% select(col1, col2)

# mutate() - create columns
data %>% mutate(new_col = expression)

# arrange() - sort rows
data %>% arrange(col)
data %>% arrange(desc(col))

# summarize() - aggregate
data %>% summarize(stat = function(col))

# Common helpers
starts_with("prefix")
ends_with("suffix")
contains("text")
where(is.numeric)
everything()

Best Practices:

# ✅ Good
data %>% filter(col == value)      # Use ==
data %>% select(col1, col2)        # Select then filter if needed
names(data)                         # Check before selecting
result <- data %>% ...              # Assign result

# ❌ Avoid
data %>% filter(col = value)       # Using =
data %>% select(col) %>%           # Then try to use removed col
  filter(other_col > 5)
data %>% filter(...) # no assign    # Result lost

20.17 Exercises

📝 Exercise 1: Basic Pipeline

Use mtcars to: 1. Filter cars with mpg > 20 2. Select mpg, cyl, hp, wt 3. Create hp_per_ton = hp / wt 4. Arrange by hp_per_ton descending 5. Show top 5

📝 Exercise 2: Error Fixing

Fix these errors:

# Error 1
mtcars %>%
  filter(cyl = 6)

# Error 2
mtcars %>%
  select(mpg, cyl) %>%
  filter(hp > 100)

# Error 3
mtcars %>%
  mutate(ratio = mpg_per_cyl / wt) %>%
  mutate(mpg_per_cyl = mpg / cyl)

📝 Exercise 3: Safe Select

Write safe_select(data, ...) that: 1. Checks columns exist before selecting 2. Gives helpful error if not found 3. Suggests similar names 4. Works with select helpers

📝 Exercise 4: Summary Report

Create function summarize_by_group(data, group_col, value_col) that: 1. Groups by specified column 2. Summarizes mean, median, sd, min, max 3. Counts observations 4. Returns sorted by mean descending

20.18 Exercise Answers

Click to see answers

Exercise 1:

result <- mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp, wt) %>%
  mutate(hp_per_ton = hp / wt) %>%
  arrange(desc(hp_per_ton)) %>%
  slice_head(n = 5)
#> Error in select(., mpg, cyl, hp, wt): unused arguments (mpg, cyl, hp, wt)

result
#> [1] "Success!"

# Alternative: use top_n (deprecated but still works)
mtcars %>%
  filter(mpg > 20) %>%
  select(mpg, cyl, hp, wt) %>%
  mutate(hp_per_ton = hp / wt) %>%
  slice_max(hp_per_ton, n = 5)
#> Error in select(., mpg, cyl, hp, wt): unused arguments (mpg, cyl, hp, wt)

Exercise 2:

# Error 1: Use == not =
mtcars %>%
  filter(cyl == 6)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1          6
#> Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4          6
#> Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4          6
#> Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6          6

# Error 2: Keep hp column or filter before select
# Option A: Select hp too
mtcars %>%
  select(mpg, cyl, hp) %>%
  filter(hp > 100)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)

# Option B: Filter first
mtcars %>%
  filter(hp > 100) %>%
  select(mpg, cyl)
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)

# Error 3: Create mpg_per_cyl before using it
mtcars %>%
  mutate(mpg_per_cyl = mpg / cyl) %>%
  mutate(ratio = mpg_per_cyl / wt)
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1          6
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2          8
#> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1          6
#> Duster 360        14.3   8  360 245 3.21 3.570 15.84  0  0    3    4          8
#>                   mpg_per_cyl     ratio
#> Mazda RX4            3.500000 1.3358779
#> Mazda RX4 Wag        3.500000 1.2173913
#> Datsun 710           5.700000 2.4568966
#> Hornet 4 Drive       3.566667 1.1093831
#> Hornet Sportabout    2.337500 0.6795058
#> Valiant              3.016667 0.8718690
#> Duster 360           1.787500 0.5007003
#>  [ reached 'max' / getOption("max.print") -- omitted 25 rows ]

# Or in one mutate
mtcars %>%
  mutate(
    mpg_per_cyl = mpg / cyl,
    ratio = mpg_per_cyl / wt
  )
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1          6
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2          8
#> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1          6
#> Duster 360        14.3   8  360 245 3.21 3.570 15.84  0  0    3    4          8
#>                   mpg_per_cyl     ratio
#> Mazda RX4            3.500000 1.3358779
#> Mazda RX4 Wag        3.500000 1.2173913
#> Datsun 710           5.700000 2.4568966
#> Hornet 4 Drive       3.566667 1.1093831
#> Hornet Sportabout    2.337500 0.6795058
#> Valiant              3.016667 0.8718690
#> Duster 360           1.787500 0.5007003
#>  [ reached 'max' / getOption("max.print") -- omitted 25 rows ]

Exercise 3:

safe_select <- function(data, ...) {
  # Capture expressions
  cols <- rlang::enquos(...)
  
  # Get all column names in data
  data_cols <- names(data)
  
  # Evaluate each selection
  for (col_expr in cols) {
    col_name <- rlang::as_name(col_expr)
    
    # Skip if it's a helper function call
    if (grepl("^(starts_with|ends_with|contains|matches|everything|where)", 
              col_name)) {
      next
    }
    
    # Check if column exists
    if (!col_name %in% data_cols) {
      # Find similar names
      similar <- agrep(col_name, data_cols, max.distance = 0.3, value = TRUE)
      
      error_msg <- paste0("Column '", col_name, "' not found in data.")
      
      if (length(similar) > 0) {
        error_msg <- paste0(error_msg, "\nDid you mean: ",
                           paste(similar, collapse = ", "), "?")
      }
      
      error_msg <- paste0(error_msg, "\nAvailable columns: ",
                         paste(data_cols, collapse = ", "))
      
      stop(error_msg, call. = FALSE)
    }
  }
  
  # If all checks pass, do the select
  select(data, !!!cols)
}

# Test
safe_select(mtcars, mpg, cyl, hp)
#> Error in select(data, !!!cols): unused argument (!!!cols)
safe_select(mtcars, mpg, horsepower)  # Helpful error
#> Error: Column 'horsepower' not found in data.
#> Available columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor

Exercise 4:

summarize_by_group <- function(data, group_col, value_col) {
  # Use across for flexible column selection
  data %>%
    group_by(across({{ group_col }})) %>%
    summarize(
      n = n(),
      mean = mean({{ value_col }}, na.rm = TRUE),
      median = median({{ value_col }}, na.rm = TRUE),
      sd = sd({{ value_col }}, na.rm = TRUE),
      min = min({{ value_col }}, na.rm = TRUE),
      max = max({{ value_col }}, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(desc(mean))
}

# Test
summarize_by_group(mtcars, cyl, mpg)
#> # A tibble: 3 × 7
#>     cyl     n  mean median    sd   min   max
#>   <dbl> <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
#> 1     4    11  26.7   26    4.51  21.4  33.9
#> 2     6     7  19.7   19.7  1.45  17.8  21.4
#> 3     8    14  15.1   15.2  2.56  10.4  19.2
summarize_by_group(mtcars, gear, hp)
#> # A tibble: 3 × 7
#>    gear     n  mean median    sd   min   max
#>   <dbl> <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
#> 1     5     5 196.     175 103.     91   335
#> 2     3    15 176.     180  47.7    97   245
#> 3     4    12  89.5     94  25.9    52   123

# With iris
summarize_by_group(iris, Species, Sepal.Length)
#> # A tibble: 3 × 7
#>   Species        n  mean median    sd   min   max
#>   <fct>      <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
#> 1 virginica     50  6.59    6.5 0.636   4.9   7.9
#> 2 versicolor    50  5.94    5.9 0.516   4.9   7  
#> 3 setosa        50  5.01    5   0.352   4.3   5.8