Chapter 20 Introduction to dplyr
What You’ll Learn:
- dplyr basics and philosophy
- The pipe operator (%>% and |>)
- Core dplyr verbs
- Common dplyr errors
- Tidy evaluation basics
Key Errors Covered: 20+ dplyr errors
Difficulty: ⭐⭐ Intermediate
20.1 Introduction
dplyr revolutionized R data manipulation:
library(dplyr)
# Try to filter
mtcars %>% filter(mpg > 20)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]# Works! But errors are common...
mtcars %>% filter(mpg > 20 & cyl = 4) # Oops!
#> Error in parse(text = input): <text>:2:34: unexpected '='
#> 1: # Works! But errors are common...
#> 2: mtcars %>% filter(mpg > 20 & cyl =
#> ^Let’s master dplyr to avoid these pitfalls.
20.2 dplyr Basics
💡 Key Insight: The dplyr Philosophy
library(dplyr)
# Core verbs (functions)
# 1. filter() - keep rows
# 2. select() - keep columns
# 3. mutate() - create/modify columns
# 4. arrange() - sort rows
# 5. summarize() - aggregate data
# 6. group_by() - group for operations
# Each returns a data frame
# Easy to chain with pipe
# Example data
head(mtcars)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 6
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 8
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6Key principles: - Each function does one thing well - First argument is always data - Returns a data frame - Works with pipes - NSE (non-standard evaluation) for column names
20.3 The Pipe Operator
💡 Key Insight: Pipe Operators
# Traditional nesting (hard to read)
result <- arrange(
filter(
select(mtcars, mpg, cyl, hp),
mpg > 20
),
desc(hp)
)
#> Error in select(mtcars, mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# With magrittr pipe %>%
result <- mtcars %>%
select(mpg, cyl, hp) %>%
filter(mpg > 20) %>%
arrange(desc(hp))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Native pipe |> (R >= 4.1)
result <- mtcars |>
select(mpg, cyl, hp) |>
filter(mpg > 20) |>
arrange(desc(hp))
#> Error in select(mtcars, mpg, cyl, hp): unused arguments (mpg, cyl, hp)
result
#> [1] "Success!"
# How pipe works
# x %>% f(y) becomes f(x, y)
# x %>% f(y) %>% g(z) becomes g(f(x, y), z)
# Pipe sends left side as first argument to right side
mtcars %>% head(3)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 4
# Same as:
head(mtcars, 3)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 4Differences between %>% and |>:
- %>% from magrittr (more flexible)
- |> native R (R >= 4.1, faster)
- %>% has more features (., shortcuts)
- |> simpler, no package needed
20.4 Error #1: object not found with pipes
⭐ BEGINNER 🔍 SCOPE
20.4.1 The Error
library(dplyr)
mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl, hp) %>%
arrange(horsepower) # Wrong column name!
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)🔴 ERROR
Error in arrange():
! Can't subset columns that don't exist.
✖ Column `horsepower` doesn't exist.
20.4.4 Solutions
✅ SOLUTION 1: Check Column Names
# Before piping, check what you have
names(mtcars)
#> [1] "mpg" "cyl" "disp" "hp" "drat"
#> [6] "wt" "qsec" "vs" "am" "gear"
#> [11] "carb" "cyl_factor"
# Or with glimpse
glimpse(mtcars)
#> Rows: 32
#> Columns: 12
#> $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2,…
#> $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4,…
#> $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140…
#> $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 18…
#> $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92,…
#> $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.1…
#> $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.…
#> $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
#> $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
#> $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4,…
#> $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1,…
#> $ cyl_factor <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4,…
# In pipe chain
mtcars %>%
names()
#> [1] "mpg" "cyl" "disp" "hp" "drat"
#> [6] "wt" "qsec" "vs" "am" "gear"
#> [11] "carb" "cyl_factor"✅ SOLUTION 2: Order Operations Correctly
# Filter before selecting
mtcars %>%
filter(hp > 100) %>% # Use hp while it exists
select(mpg, cyl) # Then select subset
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)
# Or keep needed columns
mtcars %>%
select(mpg, cyl, hp) %>% # Keep hp
filter(hp > 100) # Now can use it
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)✅ SOLUTION 3: Use Column Existence Check
safe_filter <- function(data, ...) {
# Check if columns exist
expr <- rlang::enexpr(...)
cols_used <- all.vars(expr)
missing_cols <- setdiff(cols_used, names(data))
if (length(missing_cols) > 0) {
stop("Columns not found: ", paste(missing_cols, collapse = ", "))
}
filter(data, ...)
}
# Will give helpful error
# mtcars %>% safe_filter(horsepower > 100)20.5 filter() Basics
💡 Key Insight: filter() Patterns
# Keep rows where condition is TRUE
mtcars %>%
filter(mpg > 20)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Multiple conditions (AND)
mtcars %>%
filter(mpg > 20, cyl == 4)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 4
#> Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 4
#> Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 4
#> [ reached 'max' / getOption("max.print") -- omitted 3 rows ]
# Same as
mtcars %>%
filter(mpg > 20 & cyl == 4)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 4
#> Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 4
#> Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 4
#> [ reached 'max' / getOption("max.print") -- omitted 3 rows ]
# OR condition
mtcars %>%
filter(mpg > 25 | hp > 200)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
#> Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
#> Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
#> Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
#> Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
#> Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
#> cyl_factor
#> Duster 360 8
#> Cadillac Fleetwood 8
#> Lincoln Continental 8
#> Chrysler Imperial 8
#> Fiat 128 4
#> Honda Civic 4
#> Toyota Corolla 4
#> Camaro Z28 8
#> [ reached 'max' / getOption("max.print") -- omitted 5 rows ]
# Complex conditions
mtcars %>%
filter(mpg > 20 & (cyl == 4 | cyl == 6))
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Using %in%
mtcars %>%
filter(cyl %in% c(4, 6))
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 6
#> [ reached 'max' / getOption("max.print") -- omitted 10 rows ]
# Negation
mtcars %>%
filter(!(cyl %in% c(4, 6)))
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
#> Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
#> Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
#> Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
#> Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
#> Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
#> Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
#> Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
#> cyl_factor
#> Hornet Sportabout 8
#> Duster 360 8
#> Merc 450SE 8
#> Merc 450SL 8
#> Merc 450SLC 8
#> Cadillac Fleetwood 8
#> Lincoln Continental 8
#> Chrysler Imperial 8
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Or
mtcars %>%
filter(!cyl %in% c(4, 6))
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
#> Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
#> Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
#> Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
#> Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
#> Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
#> Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
#> Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
#> cyl_factor
#> Hornet Sportabout 8
#> Duster 360 8
#> Merc 450SE 8
#> Merc 450SL 8
#> Merc 450SLC 8
#> Cadillac Fleetwood 8
#> Lincoln Continental 8
#> Chrysler Imperial 8
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Filter with string operations
mtcars %>%
rownames_to_column("car") %>%
filter(grepl("Merc", car))
#> car mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> 1 Merc 240D 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2 4
#> 2 Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2 4
#> 3 Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4 6
#> 4 Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4 6
#> 5 Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3 8
#> 6 Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3 8
#> 7 Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.0 0 0 3 3 8
# Remove NAs
data_with_na <- tibble(x = c(1, 2, NA, 4))
data_with_na %>%
filter(!is.na(x))
#> # A tibble: 3 × 1
#> x
#> <dbl>
#> 1 1
#> 2 2
#> 3 420.6 Error #2: Using = instead of ==
⭐ BEGINNER 🔤 SYNTAX
20.6.1 The Error
mtcars %>%
filter(cyl = 4) # Wrong: = is assignment
#> Error in `filter()`:
#> ! We detected a named input.
#> ℹ This usually means that you've used `=` instead of `==`.
#> ℹ Did you mean `cyl == 4`?🔴 ERROR
Error in filter():
! We detected a named input.
ℹ This usually means that you've used `=` instead of `==`.
ℹ Did you mean `cyl == 4`?
20.6.3 Solutions
✅ SOLUTION: Use == for Comparison
# Correct: use ==
mtcars %>%
filter(cyl == 4)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 4
#> Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 4
#> Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 4
#> [ reached 'max' / getOption("max.print") -- omitted 3 rows ]
# Other comparison operators
mtcars %>%
filter(mpg > 20) # Greater than
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
mtcars %>%
filter(mpg >= 20) # Greater or equal
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
mtcars %>%
filter(mpg < 20) # Less than
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2 8
#> Valiant 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1 6
#> Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4 8
#> Merc 280 19.2 6 167.6 123 3.92 3.44 18.30 1 0 4 4 6
#> Merc 280C 17.8 6 167.6 123 3.92 3.44 18.90 1 0 4 4 6
#> Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.40 0 0 3 3 8
#> Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.60 0 0 3 3 8
#> Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.00 0 0 3 3 8
#> [ reached 'max' / getOption("max.print") -- omitted 10 rows ]
mtcars %>%
filter(mpg <= 20) # Less or equal
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2 8
#> Valiant 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1 6
#> Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4 8
#> Merc 280 19.2 6 167.6 123 3.92 3.44 18.30 1 0 4 4 6
#> Merc 280C 17.8 6 167.6 123 3.92 3.44 18.90 1 0 4 4 6
#> Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.40 0 0 3 3 8
#> Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.60 0 0 3 3 8
#> Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.00 0 0 3 3 8
#> [ reached 'max' / getOption("max.print") -- omitted 10 rows ]
mtcars %>%
filter(mpg != 4) # Not equal
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
#> Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
#> Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
#> Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
#> cyl_factor
#> Mazda RX4 6
#> Mazda RX4 Wag 6
#> Datsun 710 4
#> Hornet 4 Drive 6
#> Hornet Sportabout 8
#> Valiant 6
#> Duster 360 8
#> Merc 240D 4
#> [ reached 'max' / getOption("max.print") -- omitted 24 rows ]
# For NA, use is.na()
data_with_na <- tibble(x = c(1, NA, 3))
data_with_na %>%
filter(is.na(x)) # Keep NAs
#> # A tibble: 1 × 1
#> x
#> <dbl>
#> 1 NA
data_with_na %>%
filter(!is.na(x)) # Remove NAs
#> # A tibble: 2 × 1
#> x
#> <dbl>
#> 1 1
#> 2 320.7 select() Basics
💡 Key Insight: select() Patterns
# Select specific columns
mtcars %>%
select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Select range
mtcars %>%
select(mpg:hp)
#> Error in select(., mpg:hp): unused argument (mpg:hp)
# Select by position
mtcars %>%
select(1:3)
#> Error in select(., 1:3): unused argument (1:3)
# Exclude columns
mtcars %>%
select(-mpg, -cyl)
#> Error in select(., -mpg, -cyl): unused arguments (-mpg, -cyl)
# Exclude range
mtcars %>%
select(-(mpg:disp))
#> Error in select(., -(mpg:disp)): unused argument (-(mpg:disp))
# Helper functions
# starts_with()
iris %>%
select(starts_with("Sepal"))
#> Error in select(., starts_with("Sepal")): unused argument (starts_with("Sepal"))
# ends_with()
iris %>%
select(ends_with("Width"))
#> Error in select(., ends_with("Width")): unused argument (ends_with("Width"))
# contains()
iris %>%
select(contains("etal"))
#> Error in select(., contains("etal")): unused argument (contains("etal"))
# matches() - regex
mtcars %>%
select(matches("^d")) # Starts with 'd'
#> Error in select(., matches("^d")): unused argument (matches("^d"))
# everything() - all remaining
mtcars %>%
select(mpg, cyl, everything())
#> Error in select(., mpg, cyl, everything()): unused arguments (mpg, cyl, everything())
# where() - by condition
mtcars %>%
select(where(is.numeric))
#> Error in select(., where(is.numeric)): unused argument (where(is.numeric))
# Rename while selecting
mtcars %>%
select(miles_per_gallon = mpg, cylinders = cyl)
#> Error in select(., miles_per_gallon = mpg, cylinders = cyl): unused arguments (miles_per_gallon = mpg, cylinders = cyl)
# Just rename (keep all)
mtcars %>%
rename(miles_per_gallon = mpg)
#> miles_per_gallon cyl disp hp drat wt qsec vs am gear
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3
#> Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3
#> Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3
#> Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4
#> carb cyl_factor
#> Mazda RX4 4 6
#> Mazda RX4 Wag 4 6
#> Datsun 710 1 4
#> Hornet 4 Drive 1 6
#> Hornet Sportabout 2 8
#> Valiant 1 6
#> Duster 360 4 8
#> Merc 240D 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 24 rows ]20.8 Error #3: unused argument in select
⭐⭐ INTERMEDIATE 📋 ARGS
20.8.1 The Error
mtcars %>%
select(mpg, cyl, cyl == 4) # Can't filter in select!
#> Error in select(., mpg, cyl, cyl == 4): unused arguments (mpg, cyl, cyl == 4)🔴 ERROR
Error in select():
! Can't subset columns with `cyl == 4`.
✖ `cyl == 4` must be numeric or character, not `logical`.
20.8.4 Solutions
✅ SOLUTION: Separate select and filter
# Correct: separate operations
mtcars %>%
select(mpg, cyl, hp) %>%
filter(cyl == 4)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Or filter first
mtcars %>%
filter(cyl == 4) %>%
select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Use quotes with helpers
iris %>%
select(starts_with("Sepal"))
#> Error in select(., starts_with("Sepal")): unused argument (starts_with("Sepal"))20.9 mutate() Basics
💡 Key Insight: mutate() Patterns
# Create new column
mtcars %>%
mutate(mpg_per_cyl = mpg / cyl) %>%
select(mpg, cyl, mpg_per_cyl)
#> Error in select(., mpg, cyl, mpg_per_cyl): unused arguments (mpg, cyl, mpg_per_cyl)
# Multiple new columns
mtcars %>%
mutate(
kpl = mpg * 0.425,
wt_kg = wt * 453.592
) %>%
select(mpg, kpl, wt, wt_kg)
#> Error in select(., mpg, kpl, wt, wt_kg): unused arguments (mpg, kpl, wt, wt_kg)
# Modify existing column
mtcars %>%
mutate(hp = hp * 0.746) %>% # Convert to kW
select(hp)
#> Error in select(., hp): unused argument (hp)
# Use newly created columns
mtcars %>%
mutate(
hp_per_cyl = hp / cyl,
hp_ratio = hp_per_cyl / mean(hp_per_cyl)
) %>%
select(hp, cyl, hp_per_cyl, hp_ratio)
#> Error in select(., hp, cyl, hp_per_cyl, hp_ratio): unused arguments (hp, cyl, hp_per_cyl, hp_ratio)
# Conditional creation
mtcars %>%
mutate(
efficiency = case_when(
mpg > 25 ~ "High",
mpg > 20 ~ "Medium",
TRUE ~ "Low"
)
) %>%
select(mpg, efficiency)
#> Error in select(., mpg, efficiency): unused arguments (mpg, efficiency)
# With ifelse
mtcars %>%
mutate(
heavy = ifelse(wt > 3.5, "Heavy", "Light")
) %>%
select(wt, heavy)
#> Error in select(., wt, heavy): unused arguments (wt, heavy)
# Keep only new columns with transmute()
mtcars %>%
transmute(
car = rownames(mtcars),
kpl = mpg * 0.425
)
#> car kpl
#> Mazda RX4 Mazda RX4 8.9250
#> Mazda RX4 Wag Mazda RX4 Wag 8.9250
#> Datsun 710 Datsun 710 9.6900
#> Hornet 4 Drive Hornet 4 Drive 9.0950
#> Hornet Sportabout Hornet Sportabout 7.9475
#> Valiant Valiant 7.6925
#> Duster 360 Duster 360 6.0775
#> Merc 240D Merc 240D 10.3700
#> Merc 230 Merc 230 9.6900
#> Merc 280 Merc 280 8.1600
#> Merc 280C Merc 280C 7.5650
#> Merc 450SE Merc 450SE 6.9700
#> Merc 450SL Merc 450SL 7.3525
#> Merc 450SLC Merc 450SLC 6.4600
#> Cadillac Fleetwood Cadillac Fleetwood 4.4200
#> Lincoln Continental Lincoln Continental 4.4200
#> Chrysler Imperial Chrysler Imperial 6.2475
#> Fiat 128 Fiat 128 13.7700
#> Honda Civic Honda Civic 12.9200
#> Toyota Corolla Toyota Corolla 14.4075
#> Toyota Corona Toyota Corona 9.1375
#> Dodge Challenger Dodge Challenger 6.5875
#> AMC Javelin AMC Javelin 6.4600
#> Camaro Z28 Camaro Z28 5.6525
#> Pontiac Firebird Pontiac Firebird 8.1600
#> Fiat X1-9 Fiat X1-9 11.6025
#> Porsche 914-2 Porsche 914-2 11.0500
#> Lotus Europa Lotus Europa 12.9200
#> Ford Pantera L Ford Pantera L 6.7150
#> Ferrari Dino Ferrari Dino 8.3725
#> Maserati Bora Maserati Bora 6.3750
#> Volvo 142E Volvo 142E 9.095020.10 Error #4: object not found in mutate
⭐⭐ INTERMEDIATE 🔍 SCOPE
20.10.1 The Error
mpg_threshold <- 20
mtcars %>%
mutate(efficient = mpg > mpg_treshold) # Typo!
#> Error in `mutate()`:
#> ℹ In argument: `efficient = mpg > mpg_treshold`.
#> Caused by error:
#> ! object 'mpg_treshold' not found🔴 ERROR
Error in mutate():
! object 'mpg_treshold' not found
20.10.3 Common Causes
# Typo in external variable
threshold <- 20
mtcars %>%
mutate(high = mpg > threshhold)
#> Error in `mutate()`:
#> ℹ In argument: `high = mpg > threshhold`.
#> Caused by error:
#> ! object 'threshhold' not found
# Typo in column name
mtcars %>%
mutate(power_ratio = horsepower / wt)
#> Error in `mutate()`:
#> ℹ In argument: `power_ratio = horsepower/wt`.
#> Caused by error:
#> ! object 'horsepower' not found
# Using column before creating it
mtcars %>%
mutate(
mpg_ratio = mpg_scaled / mean(mpg_scaled),
mpg_scaled = mpg / mean(mpg)
)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 6
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 8
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6
#> Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4 8
#> mpg_ratio mpg_scaled
#> Mazda RX4 2.121443e+15 1.0452636
#> Mazda RX4 Wag 2.121443e+15 1.0452636
#> Datsun 710 6.320589e+15 1.1348577
#> Hornet 4 Drive 3.054587e+15 1.0651734
#> Hornet Sportabout -3.244132e+15 0.9307824
#> Valiant -4.643847e+15 0.9009177
#> Duster 360 -1.350871e+16 0.7117748
#> [ reached 'max' / getOption("max.print") -- omitted 25 rows ]20.10.4 Solutions
✅ SOLUTION 1: Check Names Carefully
# Check column names
names(mtcars)
#> [1] "mpg" "cyl" "disp" "hp" "drat"
#> [6] "wt" "qsec" "vs" "am" "gear"
#> [11] "carb" "cyl_factor"
# Check variable exists
threshold <- 20
exists("threshold")
#> [1] TRUE
# Correct usage
mtcars %>%
mutate(efficient = mpg > threshold) %>%
select(mpg, efficient)
#> Error in select(., mpg, efficient): unused arguments (mpg, efficient)✅ SOLUTION 2: Order Operations
✅ SOLUTION 3: Use .data pronoun
# Explicit about column reference
mtcars %>%
mutate(power_ratio = .data$hp / .data$wt) %>%
select(hp, wt, power_ratio)
#> Error in select(., hp, wt, power_ratio): unused arguments (hp, wt, power_ratio)
# Useful in functions
calculate_ratio <- function(data, num_col, denom_col) {
data %>%
mutate(ratio = .data[[num_col]] / .data[[denom_col]])
}
calculate_ratio(mtcars, "hp", "wt") %>%
select(hp, wt, ratio)
#> Error in select(., hp, wt, ratio): unused arguments (hp, wt, ratio)20.11 arrange() Basics
💡 Key Insight: arrange() Patterns
# Sort ascending (default)
mtcars %>%
arrange(mpg) %>%
select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Sort descending
mtcars %>%
arrange(desc(mpg)) %>%
select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Multiple columns
mtcars %>%
arrange(cyl, desc(mpg)) %>%
select(cyl, mpg, hp)
#> Error in select(., cyl, mpg, hp): unused arguments (cyl, mpg, hp)
# With NA handling
data_with_na <- tibble(
x = c(3, 1, NA, 2),
y = c("a", "b", "c", "d")
)
data_with_na %>%
arrange(x) # NAs last by default
#> # A tibble: 4 × 2
#> x y
#> <dbl> <chr>
#> 1 1 b
#> 2 2 d
#> 3 3 a
#> 4 NA c
# NAs first
data_with_na %>%
arrange(desc(is.na(x)), x)
#> # A tibble: 4 × 2
#> x y
#> <dbl> <chr>
#> 1 NA c
#> 2 1 b
#> 3 2 d
#> 4 3 a20.12 summarize() Basics
💡 Key Insight: summarize() Patterns
# Single summary
mtcars %>%
summarize(avg_mpg = mean(mpg))
#> avg_mpg
#> 1 20.09062
# Multiple summaries
mtcars %>%
summarize(
avg_mpg = mean(mpg),
sd_mpg = sd(mpg),
min_mpg = min(mpg),
max_mpg = max(mpg),
n = n()
)
#> avg_mpg sd_mpg min_mpg max_mpg n
#> 1 20.09062 6.026948 10.4 33.9 32
# With grouping (next chapter covers in detail)
mtcars %>%
group_by(cyl) %>%
summarize(
avg_mpg = mean(mpg),
count = n()
)
#> # A tibble: 3 × 3
#> cyl avg_mpg count
#> <dbl> <dbl> <int>
#> 1 4 26.7 11
#> 2 6 19.7 7
#> 3 8 15.1 14
# Useful summary functions
mtcars %>%
summarize(
mean = mean(mpg),
median = median(mpg),
sd = sd(mpg),
min = min(mpg),
max = max(mpg),
q25 = quantile(mpg, 0.25),
q75 = quantile(mpg, 0.75),
n = n(),
n_distinct = n_distinct(cyl)
)
#> mean median sd min max q25 q75 n n_distinct
#> 1 20.09062 19.2 6.026948 10.4 33.9 15.425 22.8 32 320.13 Common Patterns
🎯 Best Practice: Common dplyr Workflows
# 1. Filter → Select → Arrange
mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl, hp) %>%
arrange(desc(mpg))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# 2. Create variables → Summarize
mtcars %>%
mutate(hp_per_cyl = hp / cyl) %>%
summarize(
avg_hp_per_cyl = mean(hp_per_cyl),
max_hp_per_cyl = max(hp_per_cyl)
)
#> avg_hp_per_cyl max_hp_per_cyl
#> 1 23.0013 41.875
# 3. Group → Summarize → Arrange
mtcars %>%
group_by(cyl) %>%
summarize(
avg_mpg = mean(mpg),
count = n()
) %>%
arrange(desc(avg_mpg))
#> # A tibble: 3 × 3
#> cyl avg_mpg count
#> <dbl> <dbl> <int>
#> 1 4 26.7 11
#> 2 6 19.7 7
#> 3 8 15.1 14
# 4. Top N
mtcars %>%
arrange(desc(mpg)) %>%
slice_head(n = 5)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 4
#> Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 4
# 5. Distinct values
mtcars %>%
distinct(cyl, gear)
#> cyl gear
#> Mazda RX4 6 4
#> Datsun 710 4 4
#> Hornet 4 Drive 6 3
#> Hornet Sportabout 8 3
#> Toyota Corona 4 3
#> Porsche 914-2 4 5
#> Ford Pantera L 8 5
#> Ferrari Dino 6 5
# 6. Count
mtcars %>%
count(cyl, sort = TRUE)
#> cyl n
#> 1 8 14
#> 2 4 11
#> 3 6 7
# 7. Add row numbers
mtcars %>%
mutate(row_id = row_number()) %>%
select(row_id, everything())
#> Error in select(., row_id, everything()): unused arguments (row_id, everything())20.14 Error #5: Forgetting to assign or print
⭐ BEGINNER 💡 USAGE
20.14.1 The Problem
# Doesn't save result!
mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl)
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)
# Original unchanged
head(mtcars, 2)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4 620.15 Debugging dplyr Chains
🎯 Best Practice: Debug Pipe Chains
# 1. Run step by step
# Start here
mtcars %>%
filter(mpg > 20)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Add next step
mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl, hp)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Continue...
mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl, hp) %>%
arrange(desc(hp))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# 2. Check intermediate results
mtcars %>%
filter(mpg > 20) %>%
{cat("After filter:", nrow(.), "rows\n"); .} %>%
select(mpg, cyl, hp) %>%
{cat("After select:", ncol(.), "cols\n"); .} %>%
arrange(desc(hp))
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# 3. Use View() in RStudio
# mtcars %>%
# filter(mpg > 20) %>%
# View()
# 4. Print at each step
debug_pipe <- function(data, label = NULL) {
if (!is.null(label)) {
cat("\n===", label, "===\n")
}
print(head(data, 3))
data
}
mtcars %>%
filter(mpg > 20) %>%
debug_pipe("After filter") %>%
select(mpg, cyl, hp) %>%
debug_pipe("After select")
#>
#> === After select ===
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)20.16 Summary
Key Takeaways:
- dplyr philosophy - Simple verbs, chainable with pipes
- Pipe operator -
%>%or|>makes code readable - filter() uses == - Not
=for comparison - Check column names - Use
names()orglimpse() - Order matters - Can’t use columns after removing them
- Assign results - dplyr doesn’t modify in place
- Debug step-by-step - Run pipes incrementally
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| object not found | Column doesn’t exist | Check names, order operations |
| Using = not == | Assignment in filter | Use == for comparison |
| unused argument | Wrong function usage | Check function signature |
| Result not saved | No assignment | Assign with <- |
Core Verbs:
# filter() - keep rows
data %>% filter(condition)
# select() - keep columns
data %>% select(col1, col2)
# mutate() - create columns
data %>% mutate(new_col = expression)
# arrange() - sort rows
data %>% arrange(col)
data %>% arrange(desc(col))
# summarize() - aggregate
data %>% summarize(stat = function(col))
# Common helpers
starts_with("prefix")
ends_with("suffix")
contains("text")
where(is.numeric)
everything()Best Practices:
# ✅ Good
data %>% filter(col == value) # Use ==
data %>% select(col1, col2) # Select then filter if needed
names(data) # Check before selecting
result <- data %>% ... # Assign result
# ❌ Avoid
data %>% filter(col = value) # Using =
data %>% select(col) %>% # Then try to use removed col
filter(other_col > 5)
data %>% filter(...) # no assign # Result lost20.17 Exercises
📝 Exercise 1: Basic Pipeline
Use mtcars to:
1. Filter cars with mpg > 20
2. Select mpg, cyl, hp, wt
3. Create hp_per_ton = hp / wt
4. Arrange by hp_per_ton descending
5. Show top 5
📝 Exercise 2: Error Fixing
Fix these errors:
📝 Exercise 3: Safe Select
Write safe_select(data, ...) that:
1. Checks columns exist before selecting
2. Gives helpful error if not found
3. Suggests similar names
4. Works with select helpers
📝 Exercise 4: Summary Report
Create function summarize_by_group(data, group_col, value_col) that:
1. Groups by specified column
2. Summarizes mean, median, sd, min, max
3. Counts observations
4. Returns sorted by mean descending
20.18 Exercise Answers
Click to see answers
Exercise 1:
result <- mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl, hp, wt) %>%
mutate(hp_per_ton = hp / wt) %>%
arrange(desc(hp_per_ton)) %>%
slice_head(n = 5)
#> Error in select(., mpg, cyl, hp, wt): unused arguments (mpg, cyl, hp, wt)
result
#> [1] "Success!"
# Alternative: use top_n (deprecated but still works)
mtcars %>%
filter(mpg > 20) %>%
select(mpg, cyl, hp, wt) %>%
mutate(hp_per_ton = hp / wt) %>%
slice_max(hp_per_ton, n = 5)
#> Error in select(., mpg, cyl, hp, wt): unused arguments (mpg, cyl, hp, wt)Exercise 2:
# Error 1: Use == not =
mtcars %>%
filter(cyl == 6)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 6
#> Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 6
#> Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 6
#> Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 6
# Error 2: Keep hp column or filter before select
# Option A: Select hp too
mtcars %>%
select(mpg, cyl, hp) %>%
filter(hp > 100)
#> Error in select(., mpg, cyl, hp): unused arguments (mpg, cyl, hp)
# Option B: Filter first
mtcars %>%
filter(hp > 100) %>%
select(mpg, cyl)
#> Error in select(., mpg, cyl): unused arguments (mpg, cyl)
# Error 3: Create mpg_per_cyl before using it
mtcars %>%
mutate(mpg_per_cyl = mpg / cyl) %>%
mutate(ratio = mpg_per_cyl / wt)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 6
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 8
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6
#> Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4 8
#> mpg_per_cyl ratio
#> Mazda RX4 3.500000 1.3358779
#> Mazda RX4 Wag 3.500000 1.2173913
#> Datsun 710 5.700000 2.4568966
#> Hornet 4 Drive 3.566667 1.1093831
#> Hornet Sportabout 2.337500 0.6795058
#> Valiant 3.016667 0.8718690
#> Duster 360 1.787500 0.5007003
#> [ reached 'max' / getOption("max.print") -- omitted 25 rows ]
# Or in one mutate
mtcars %>%
mutate(
mpg_per_cyl = mpg / cyl,
ratio = mpg_per_cyl / wt
)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 6
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 8
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6
#> Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4 8
#> mpg_per_cyl ratio
#> Mazda RX4 3.500000 1.3358779
#> Mazda RX4 Wag 3.500000 1.2173913
#> Datsun 710 5.700000 2.4568966
#> Hornet 4 Drive 3.566667 1.1093831
#> Hornet Sportabout 2.337500 0.6795058
#> Valiant 3.016667 0.8718690
#> Duster 360 1.787500 0.5007003
#> [ reached 'max' / getOption("max.print") -- omitted 25 rows ]Exercise 3:
safe_select <- function(data, ...) {
# Capture expressions
cols <- rlang::enquos(...)
# Get all column names in data
data_cols <- names(data)
# Evaluate each selection
for (col_expr in cols) {
col_name <- rlang::as_name(col_expr)
# Skip if it's a helper function call
if (grepl("^(starts_with|ends_with|contains|matches|everything|where)",
col_name)) {
next
}
# Check if column exists
if (!col_name %in% data_cols) {
# Find similar names
similar <- agrep(col_name, data_cols, max.distance = 0.3, value = TRUE)
error_msg <- paste0("Column '", col_name, "' not found in data.")
if (length(similar) > 0) {
error_msg <- paste0(error_msg, "\nDid you mean: ",
paste(similar, collapse = ", "), "?")
}
error_msg <- paste0(error_msg, "\nAvailable columns: ",
paste(data_cols, collapse = ", "))
stop(error_msg, call. = FALSE)
}
}
# If all checks pass, do the select
select(data, !!!cols)
}
# Test
safe_select(mtcars, mpg, cyl, hp)
#> Error in select(data, !!!cols): unused argument (!!!cols)safe_select(mtcars, mpg, horsepower) # Helpful error
#> Error: Column 'horsepower' not found in data.
#> Available columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factorExercise 4:
summarize_by_group <- function(data, group_col, value_col) {
# Use across for flexible column selection
data %>%
group_by(across({{ group_col }})) %>%
summarize(
n = n(),
mean = mean({{ value_col }}, na.rm = TRUE),
median = median({{ value_col }}, na.rm = TRUE),
sd = sd({{ value_col }}, na.rm = TRUE),
min = min({{ value_col }}, na.rm = TRUE),
max = max({{ value_col }}, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(mean))
}
# Test
summarize_by_group(mtcars, cyl, mpg)
#> # A tibble: 3 × 7
#> cyl n mean median sd min max
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 11 26.7 26 4.51 21.4 33.9
#> 2 6 7 19.7 19.7 1.45 17.8 21.4
#> 3 8 14 15.1 15.2 2.56 10.4 19.2
summarize_by_group(mtcars, gear, hp)
#> # A tibble: 3 × 7
#> gear n mean median sd min max
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 5 5 196. 175 103. 91 335
#> 2 3 15 176. 180 47.7 97 245
#> 3 4 12 89.5 94 25.9 52 123
# With iris
summarize_by_group(iris, Species, Sepal.Length)
#> # A tibble: 3 × 7
#> Species n mean median sd min max
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 virginica 50 6.59 6.5 0.636 4.9 7.9
#> 2 versicolor 50 5.94 5.9 0.516 4.9 7
#> 3 setosa 50 5.01 5 0.352 4.3 5.8