Chapter 21 Grouping & Summarizing

What You’ll Learn:

  • group_by() mechanics
  • Grouped operations
  • summarize() with groups
  • Common grouping errors
  • Multiple grouping variables
  • Ungrouping data

Key Errors Covered: 18+ grouping errors

Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced

21.1 Introduction

Grouping is one of dplyr’s most powerful features:

library(dplyr)

# Without grouping
mtcars %>%
  summarize(avg_mpg = mean(mpg))
#>    avg_mpg
#> 1 20.09062

# With grouping
mtcars %>%
  group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg))
#> # A tibble: 3 × 2
#>     cyl avg_mpg
#>   <dbl>   <dbl>
#> 1     4    26.7
#> 2     6    19.7
#> 3     8    15.1

But grouping has surprising behaviors. Let’s master it!

21.2 group_by() Basics

💡 Key Insight: group_by() Creates Invisible Groups

# Create grouped data
grouped_data <- mtcars %>%
  group_by(cyl)

# Looks the same when printed
head(grouped_data, 3)
#> # A tibble: 3 × 12
#> # Groups:   cyl [2]
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb cyl_factor
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>     
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4 6         
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4 6         
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1 4

# But it's grouped!
class(grouped_data)
#> [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"
groups(grouped_data)
#> [[1]]
#> cyl

# Check if grouped
is_grouped_df(grouped_data)
#> [1] TRUE

# See grouping variables
group_vars(grouped_data)
#> [1] "cyl"

# Count groups
n_groups(grouped_data)
#> [1] 3

# Subsequent operations work within groups
grouped_data %>%
  summarize(avg_mpg = mean(mpg))
#> # A tibble: 3 × 2
#>     cyl avg_mpg
#>   <dbl>   <dbl>
#> 1     4    26.7
#> 2     6    19.7
#> 3     8    15.1

Key insight: Grouping doesn’t change data, just how operations apply!

21.3 Error #1: Forgetting Data is Grouped

⭐⭐ INTERMEDIATE 🧠 LOGIC

21.3.1 The Problem

# Group the data
cars_grouped <- mtcars %>%
  group_by(cyl)

# Later in code... expecting ungrouped behavior
cars_grouped %>%
  mutate(rank = row_number())  # Ranks within groups!
#> # A tibble: 32 × 13
#> # Groups:   cyl [3]
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb cyl_factor
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>     
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4 6         
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4 6         
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1 4         
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1 6         
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2 8         
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1 6         
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4 8         
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2 4         
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2 4         
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4 6         
#> # ℹ 22 more rows
#> # ℹ 1 more variable: rank <int>

21.3.2 What Happened

row_number() operated within each group, not across all data.

21.3.3 Common Causes

# Expecting overall statistics
mtcars %>%
  group_by(cyl) %>%
  mutate(
    overall_mean = mean(mpg)  # Actually group means!
  ) %>%
  select(cyl, mpg, overall_mean)
#> Error in select(., cyl, mpg, overall_mean): unused arguments (cyl, mpg, overall_mean)

# Expecting all rows
mtcars %>%
  group_by(cyl) %>%
  slice(1)  # Gets first row of EACH group, not overall first
#> # A tibble: 3 × 12
#> # Groups:   cyl [3]
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb cyl_factor
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>     
#> 1  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1 4         
#> 2  21       6   160   110  3.9   2.62  16.5     0     1     4     4 6         
#> 3  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2 8

# Filtering
mtcars %>%
  group_by(cyl) %>%
  filter(mpg == max(mpg))  # Max within each group
#> # A tibble: 3 × 12
#> # Groups:   cyl [3]
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb cyl_factor
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>     
#> 1  21.4     6 258     110  3.08  3.22  19.4     1     0     3     1 6         
#> 2  33.9     4  71.1    65  4.22  1.84  19.9     1     1     4     1 4         
#> 3  19.2     8 400     175  3.08  3.84  17.0     0     0     3     2 8

21.3.4 Solutions

SOLUTION 1: Ungroup When Done

# Ungroup explicitly
result <- mtcars %>%
  group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg)) %>%
  ungroup()  # Remove grouping

# Now operations are ungrouped
result %>%
  mutate(rank = row_number())  # Ranks across all rows
#> # A tibble: 3 × 3
#>     cyl avg_mpg  rank
#>   <dbl>   <dbl> <int>
#> 1     4    26.7     1
#> 2     6    19.7     2
#> 3     8    15.1     3

SOLUTION 2: Check Grouping Status

# Always check if data is grouped
check_grouping <- function(data) {
  if (is_grouped_df(data)) {
    message("Data is grouped by: ", paste(group_vars(data), collapse = ", "))
  } else {
    message("Data is not grouped")
  }
  invisible(data)
}

mtcars %>%
  group_by(cyl) %>%
  check_grouping() %>%
  summarize(avg_mpg = mean(mpg)) %>%
  check_grouping()
#> Data is grouped by: cyl
#> Data is not grouped

SOLUTION 3: Use .groups Argument

# Control grouping in summarize
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(
    avg_mpg = mean(mpg),
    .groups = "drop"  # Ungroup completely
  )
#> # A tibble: 8 × 3
#>     cyl  gear avg_mpg
#>   <dbl> <dbl>   <dbl>
#> 1     4     3    21.5
#> 2     4     4    26.9
#> 3     4     5    28.2
#> 4     6     3    19.8
#> 5     6     4    19.8
#> 6     6     5    19.7
#> 7     8     3    15.0
#> 8     8     5    15.4

# Keep some grouping
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(
    avg_mpg = mean(mpg),
    .groups = "drop_last"  # Keep cyl grouping
  ) %>%
  group_vars()
#> [1] "cyl"

# Keep all groups
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(
    avg_mpg = mean(mpg),
    .groups = "keep"
  ) %>%
  group_vars()
#> [1] "cyl"  "gear"

21.4 summarize() with Groups

💡 Key Insight: summarize() Reduces Groups

# Single grouping variable
mtcars %>%
  group_by(cyl) %>%
  summarize(
    n = n(),
    avg_mpg = mean(mpg),
    sd_mpg = sd(mpg)
  )
#> # A tibble: 3 × 4
#>     cyl     n avg_mpg sd_mpg
#>   <dbl> <int>   <dbl>  <dbl>
#> 1     4    11    26.7   4.51
#> 2     6     7    19.7   1.45
#> 3     8    14    15.1   2.56

# Multiple grouping variables
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(
    n = n(),
    avg_mpg = mean(mpg),
    .groups = "drop"
  )
#> # A tibble: 8 × 4
#>     cyl  gear     n avg_mpg
#>   <dbl> <dbl> <int>   <dbl>
#> 1     4     3     1    21.5
#> 2     4     4     8    26.9
#> 3     4     5     2    28.2
#> 4     6     3     2    19.8
#> 5     6     4     4    19.8
#> 6     6     5     1    19.7
#> 7     8     3    12    15.0
#> 8     8     5     2    15.4

# Useful summary functions
mtcars %>%
  group_by(cyl) %>%
  summarize(
    count = n(),                    # Number of observations
    n_distinct = n_distinct(gear),  # Unique values
    mean = mean(mpg),
    median = median(mpg),
    sd = sd(mpg),
    min = min(mpg),
    max = max(mpg),
    first = first(mpg),             # First value
    last = last(mpg),               # Last value
    nth = nth(mpg, 2)               # Nth value
  )
#> # A tibble: 3 × 11
#>     cyl count n_distinct  mean median    sd   min   max first  last   nth
#>   <dbl> <int>      <int> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     4    11          3  26.7   26    4.51  21.4  33.9  22.8  21.4  24.4
#> 2     6     7          3  19.7   19.7  1.45  17.8  21.4  21    19.7  21  
#> 3     8    14          2  15.1   15.2  2.56  10.4  19.2  18.7  15    14.3

# Multiple values from same column
mtcars %>%
  group_by(cyl) %>%
  summarize(
    mpg_stats = list(summary(mpg))  # List column
  )
#> # A tibble: 3 × 2
#>     cyl mpg_stats     
#>   <dbl> <list>        
#> 1     4 <smmryDfl [6]>
#> 2     6 <smmryDfl [6]>
#> 3     8 <smmryDfl [6]>

21.5 Error #2: must be length 1, not ...

⭐⭐ INTERMEDIATE 🔢 TYPE

21.5.1 The Error

mtcars %>%
  group_by(cyl) %>%
  summarize(
    top_mpg = head(mpg, 3)  # Returns multiple values!
  )
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#>   always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups:   cyl [3]
#>     cyl top_mpg
#>   <dbl>   <dbl>
#> 1     4    22.8
#> 2     4    24.4
#> 3     4    22.8
#> 4     6    21  
#> 5     6    21  
#> 6     6    21.4
#> 7     8    18.7
#> 8     8    14.3
#> 9     8    16.4

🔴 ERROR

Error in summarize():
! `top_mpg` must be size 1, not 3.

21.5.2 What It Means

summarize() expects functions that return a single value per group, not vectors.

21.5.3 Common Causes

# Functions returning multiple values
mtcars %>%
  group_by(cyl) %>%
  summarize(all_mpg = mpg)  # All values, not one
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#>   always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 32 × 2
#> # Groups:   cyl [3]
#>      cyl all_mpg
#>    <dbl>   <dbl>
#>  1     4    22.8
#>  2     4    24.4
#>  3     4    22.8
#>  4     4    32.4
#>  5     4    30.4
#>  6     4    33.9
#>  7     4    21.5
#>  8     4    27.3
#>  9     4    26  
#> 10     4    30.4
#> # ℹ 22 more rows

mtcars %>%
  group_by(cyl) %>%
  summarize(quantiles = quantile(mpg))  # Returns 5 values
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#>   always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 15 × 2
#> # Groups:   cyl [3]
#>      cyl quantiles
#>    <dbl>     <dbl>
#>  1     4      21.4
#>  2     4      22.8
#>  3     4      26  
#>  4     4      30.4
#>  5     4      33.9
#>  6     6      17.8
#>  7     6      18.6
#>  8     6      19.7
#>  9     6      21  
#> 10     6      21.4
#> 11     8      10.4
#> 12     8      14.4
#> 13     8      15.2
#> 14     8      16.2
#> 15     8      19.2

mtcars %>%
  group_by(cyl) %>%
  summarize(range = range(mpg))  # Returns 2 values
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#>   always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 6 × 2
#> # Groups:   cyl [3]
#>     cyl range
#>   <dbl> <dbl>
#> 1     4  21.4
#> 2     4  33.9
#> 3     6  17.8
#> 4     6  21.4
#> 5     8  10.4
#> 6     8  19.2

21.5.4 Solutions

SOLUTION 1: Use Single-Value Functions

# Functions that return one value
mtcars %>%
  group_by(cyl) %>%
  summarize(
    mean_mpg = mean(mpg),
    max_mpg = max(mpg),
    min_mpg = min(mpg),
    n_obs = n()
  )
#> # A tibble: 3 × 5
#>     cyl mean_mpg max_mpg min_mpg n_obs
#>   <dbl>    <dbl>   <dbl>   <dbl> <int>
#> 1     4     26.7    33.9    21.4    11
#> 2     6     19.7    21.4    17.8     7
#> 3     8     15.1    19.2    10.4    14

# For ranges, get separately
mtcars %>%
  group_by(cyl) %>%
  summarize(
    min_mpg = min(mpg),
    max_mpg = max(mpg)
  )
#> # A tibble: 3 × 3
#>     cyl min_mpg max_mpg
#>   <dbl>   <dbl>   <dbl>
#> 1     4    21.4    33.9
#> 2     6    17.8    21.4
#> 3     8    10.4    19.2

SOLUTION 2: Use reframe() for Multiple Rows

# reframe() allows multiple rows per group (dplyr >= 1.1.0)
if (packageVersion("dplyr") >= "1.1.0") {
  mtcars %>%
    group_by(cyl) %>%
    reframe(
      top_3_mpg = head(sort(mpg, decreasing = TRUE), 3)
    )
}
#> # A tibble: 9 × 2
#>     cyl top_3_mpg
#>   <dbl>     <dbl>
#> 1     4      33.9
#> 2     4      32.4
#> 3     4      30.4
#> 4     6      21.4
#> 5     6      21  
#> 6     6      21  
#> 7     8      19.2
#> 8     8      18.7
#> 9     8      17.3

# For older dplyr, use summarize with list
mtcars %>%
  group_by(cyl) %>%
  summarize(
    top_3_mpg = list(head(sort(mpg, decreasing = TRUE), 3))
  ) %>%
  tidyr::unnest(top_3_mpg)
#> # A tibble: 9 × 2
#>     cyl top_3_mpg
#>   <dbl>     <dbl>
#> 1     4      33.9
#> 2     4      32.4
#> 3     4      30.4
#> 4     6      21.4
#> 5     6      21  
#> 6     6      21  
#> 7     8      19.2
#> 8     8      18.7
#> 9     8      17.3

SOLUTION 3: Use slice() Instead

# Get top N rows per group with slice
mtcars %>%
  group_by(cyl) %>%
  slice_max(mpg, n = 3) %>%
  select(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)

# Or with arrange and slice
mtcars %>%
  group_by(cyl) %>%
  arrange(desc(mpg)) %>%
  slice(1:3) %>%
  select(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)

21.6 mutate() with Groups

💡 Key Insight: mutate() Keeps All Rows

# mutate() operates within groups but keeps all rows
mtcars %>%
  group_by(cyl) %>%
  mutate(
    group_mean = mean(mpg),
    deviation = mpg - group_mean
  ) %>%
  select(cyl, mpg, group_mean, deviation) %>%
  arrange(cyl, mpg)
#> Error in select(., cyl, mpg, group_mean, deviation): unused arguments (cyl, mpg, group_mean, deviation)

# Ranking within groups
mtcars %>%
  group_by(cyl) %>%
  mutate(
    mpg_rank = rank(desc(mpg)),
    mpg_dense_rank = dense_rank(desc(mpg)),
    mpg_row_number = row_number(desc(mpg))
  ) %>%
  select(cyl, mpg, mpg_rank, mpg_dense_rank, mpg_row_number) %>%
  arrange(cyl, desc(mpg))
#> Error in select(., cyl, mpg, mpg_rank, mpg_dense_rank, mpg_row_number): unused arguments (cyl, mpg, mpg_rank, mpg_dense_rank, mpg_row_number)

# Cumulative operations within groups
mtcars %>%
  group_by(cyl) %>%
  arrange(mpg) %>%
  mutate(
    cumsum_mpg = cumsum(mpg),
    cummax_mpg = cummax(mpg)
  ) %>%
  select(cyl, mpg, cumsum_mpg, cummax_mpg)
#> Error in select(., cyl, mpg, cumsum_mpg, cummax_mpg): unused arguments (cyl, mpg, cumsum_mpg, cummax_mpg)

# Lead and lag within groups
mtcars %>%
  group_by(cyl) %>%
  arrange(mpg) %>%
  mutate(
    prev_mpg = lag(mpg),
    next_mpg = lead(mpg),
    diff_from_prev = mpg - lag(mpg)
  ) %>%
  select(cyl, mpg, prev_mpg, next_mpg, diff_from_prev)
#> Error in select(., cyl, mpg, prev_mpg, next_mpg, diff_from_prev): unused arguments (cyl, mpg, prev_mpg, next_mpg, diff_from_prev)

21.7 filter() with Groups

💡 Key Insight: filter() with Groups

# Filter based on group statistics
mtcars %>%
  group_by(cyl) %>%
  filter(mpg > mean(mpg)) %>%
  select(cyl, mpg) %>%
  arrange(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)

# Keep only groups meeting criteria
mtcars %>%
  group_by(cyl) %>%
  filter(n() >= 10) %>%  # Only groups with 10+ observations
  select(cyl) %>%
  distinct()
#> Error in select(., cyl): unused argument (cyl)

# Top N per group
mtcars %>%
  group_by(cyl) %>%
  filter(rank(desc(mpg)) <= 2) %>%
  select(cyl, mpg) %>%
  arrange(cyl, desc(mpg))
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)

# Filter by group condition
mtcars %>%
  group_by(cyl) %>%
  filter(mean(mpg) > 20) %>%  # Only groups with avg mpg > 20
  select(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)

21.8 Multiple Grouping Variables

💡 Key Insight: Multiple Groups

# Group by multiple variables
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(
    n = n(),
    avg_mpg = mean(mpg),
    .groups = "drop"
  )
#> # A tibble: 8 × 4
#>     cyl  gear     n avg_mpg
#>   <dbl> <dbl> <int>   <dbl>
#> 1     4     3     1    21.5
#> 2     4     4     8    26.9
#> 3     4     5     2    28.2
#> 4     6     3     2    19.8
#> 5     6     4     4    19.8
#> 6     6     5     1    19.7
#> 7     8     3    12    15.0
#> 8     8     5     2    15.4

# Order matters for operations
mtcars %>%
  group_by(cyl, gear) %>%
  mutate(
    rank_in_cyl_gear = row_number(desc(mpg))
  ) %>%
  select(cyl, gear, mpg, rank_in_cyl_gear) %>%
  arrange(cyl, gear, desc(mpg))
#> Error in select(., cyl, gear, mpg, rank_in_cyl_gear): unused arguments (cyl, gear, mpg, rank_in_cyl_gear)

# Count combinations
mtcars %>%
  count(cyl, gear)
#>   cyl gear  n
#> 1   4    3  1
#> 2   4    4  8
#> 3   4    5  2
#> 4   6    3  2
#> 5   6    4  4
#> 6   6    5  1
#> 7   8    3 12
#> 8   8    5  2

# Or with group_by + summarize
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(n = n(), .groups = "drop")
#> # A tibble: 8 × 3
#>     cyl  gear     n
#>   <dbl> <dbl> <int>
#> 1     4     3     1
#> 2     4     4     8
#> 3     4     5     2
#> 4     6     3     2
#> 5     6     4     4
#> 6     6     5     1
#> 7     8     3    12
#> 8     8     5     2

21.9 Error #3: Unexpected Grouping After summarize()

⭐⭐ INTERMEDIATE 🧠 LOGIC

21.9.1 The Problem

# Summarize with multiple groups
result <- mtcars %>%
  group_by(cyl, gear) %>%
  summarize(avg_mpg = mean(mpg))  # Message about grouping
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.

# Still grouped!
is_grouped_df(result)
#> [1] TRUE
group_vars(result)
#> [1] "cyl"

# Next operation uses this grouping
result %>%
  mutate(rank = row_number())  # Within cyl groups!
#> # A tibble: 8 × 4
#> # Groups:   cyl [3]
#>     cyl  gear avg_mpg  rank
#>   <dbl> <dbl>   <dbl> <int>
#> 1     4     3    21.5     1
#> 2     4     4    26.9     2
#> 3     4     5    28.2     3
#> 4     6     3    19.8     1
#> 5     6     4    19.8     2
#> 6     6     5    19.7     3
#> 7     8     3    15.0     1
#> 8     8     5    15.4     2

21.9.2 What Happened

By default, summarize() drops the last grouping variable but keeps others.

21.9.3 Solutions

SOLUTION 1: Use .groups = “drop”

# Ungroup completely
result <- mtcars %>%
  group_by(cyl, gear) %>%
  summarize(
    avg_mpg = mean(mpg),
    .groups = "drop"
  )

is_grouped_df(result)
#> [1] FALSE

SOLUTION 2: Explicit ungroup()

result <- mtcars %>%
  group_by(cyl, gear) %>%
  summarize(avg_mpg = mean(mpg)) %>%
  ungroup()
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.

is_grouped_df(result)
#> [1] FALSE

SOLUTION 3: Understand .groups Options

# .groups = "drop_last" (default)
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(avg_mpg = mean(mpg), .groups = "drop_last") %>%
  group_vars()  # Still has cyl
#> [1] "cyl"

# .groups = "keep"
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(avg_mpg = mean(mpg), .groups = "keep") %>%
  group_vars()  # Has both cyl and gear
#> [1] "cyl"  "gear"

# .groups = "drop"
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(avg_mpg = mean(mpg), .groups = "drop") %>%
  group_vars()  # No groups
#> character(0)

# .groups = "rowwise"
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(avg_mpg = mean(mpg), .groups = "rowwise") %>%
  class()
#> [1] "rowwise_df" "tbl_df"     "tbl"        "data.frame"

21.10 Window Functions

🎯 Best Practice: Window Functions with Groups

# Ranking functions
mtcars %>%
  group_by(cyl) %>%
  mutate(
    rank = rank(desc(mpg)),           # Ties get average rank
    dense_rank = dense_rank(desc(mpg)), # Ties get same rank, no gaps
    row_number = row_number(desc(mpg)), # Ties broken arbitrarily
    percent_rank = percent_rank(desc(mpg)), # Percentile
    cume_dist = cume_dist(desc(mpg))   # Cumulative distribution
  ) %>%
  select(cyl, mpg, rank, dense_rank, row_number, percent_rank, cume_dist) %>%
  arrange(cyl, desc(mpg))
#> Error in select(., cyl, mpg, rank, dense_rank, row_number, percent_rank, : unused arguments (cyl, mpg, rank, dense_rank, row_number, percent_rank, cume_dist)

# Lead and lag
mtcars %>%
  group_by(cyl) %>%
  arrange(mpg) %>%
  mutate(
    prev_value = lag(mpg, 1),      # Previous value
    next_value = lead(mpg, 1),     # Next value
    prev_2 = lag(mpg, 2),          # 2 rows back
    change = mpg - lag(mpg, 1)     # Change from previous
  ) %>%
  select(cyl, mpg, prev_value, next_value, change)
#> Error in select(., cyl, mpg, prev_value, next_value, change): unused arguments (cyl, mpg, prev_value, next_value, change)

# Cumulative functions
mtcars %>%
  group_by(cyl) %>%
  arrange(mpg) %>%
  mutate(
    cumsum = cumsum(mpg),
    cummean = cummean(mpg),
    cummin = cummin(mpg),
    cummax = cummax(mpg)
  ) %>%
  select(cyl, mpg, cumsum, cummean, cummin, cummax)
#> Error in select(., cyl, mpg, cumsum, cummean, cummin, cummax): unused arguments (cyl, mpg, cumsum, cummean, cummin, cummax)

# Offset functions
mtcars %>%
  group_by(cyl) %>%
  arrange(mpg) %>%
  mutate(
    first_val = first(mpg),
    last_val = last(mpg),
    nth_val = nth(mpg, 3)
  ) %>%
  select(cyl, mpg, first_val, last_val, nth_val)
#> Error in select(., cyl, mpg, first_val, last_val, nth_val): unused arguments (cyl, mpg, first_val, last_val, nth_val)

21.11 count() and add_count()

💡 Key Insight: Counting Shortcuts

# count() is shortcut for group_by + summarize + ungroup
mtcars %>%
  count(cyl)
#>   cyl  n
#> 1   4 11
#> 2   6  7
#> 3   8 14

# Equivalent to:
mtcars %>%
  group_by(cyl) %>%
  summarize(n = n()) %>%
  ungroup()
#> # A tibble: 3 × 2
#>     cyl     n
#>   <dbl> <int>
#> 1     4    11
#> 2     6     7
#> 3     8    14

# Multiple variables
mtcars %>%
  count(cyl, gear)
#>   cyl gear  n
#> 1   4    3  1
#> 2   4    4  8
#> 3   4    5  2
#> 4   6    3  2
#> 5   6    4  4
#> 6   6    5  1
#> 7   8    3 12
#> 8   8    5  2

# With sorting
mtcars %>%
  count(cyl, sort = TRUE)
#>   cyl  n
#> 1   8 14
#> 2   4 11
#> 3   6  7

# With weights
mtcars %>%
  count(cyl, wt = mpg)  # Sum of mpg instead of count
#>   cyl     n
#> 1   4 293.3
#> 2   6 138.2
#> 3   8 211.4

# add_count() keeps all rows
mtcars %>%
  add_count(cyl) %>%
  select(cyl, n, everything())
#> Error in select(., cyl, n, everything()): unused arguments (cyl, n, everything())

# Name the count column
mtcars %>%
  add_count(cyl, name = "n_per_cyl") %>%
  select(cyl, n_per_cyl, mpg)
#> Error in select(., cyl, n_per_cyl, mpg): unused arguments (cyl, n_per_cyl, mpg)

21.12 across() for Multiple Columns

🎯 Best Practice: across() for Multiple Columns

# Summarize multiple columns
mtcars %>%
  group_by(cyl) %>%
  summarize(
    across(c(mpg, hp, wt), mean),
    .groups = "drop"
  )
#> # A tibble: 3 × 4
#>     cyl   mpg    hp    wt
#>   <dbl> <dbl> <dbl> <dbl>
#> 1     4  26.7  82.6  2.29
#> 2     6  19.7 122.   3.12
#> 3     8  15.1 209.   4.00

# With named functions
mtcars %>%
  group_by(cyl) %>%
  summarize(
    across(c(mpg, hp, wt), 
           list(mean = mean, sd = sd),
           .names = "{.col}_{.fn}"),
    .groups = "drop"
  )
#> # A tibble: 3 × 7
#>     cyl mpg_mean mpg_sd hp_mean hp_sd wt_mean wt_sd
#>   <dbl>    <dbl>  <dbl>   <dbl> <dbl>   <dbl> <dbl>
#> 1     4     26.7   4.51    82.6  20.9    2.29 0.570
#> 2     6     19.7   1.45   122.   24.3    3.12 0.356
#> 3     8     15.1   2.56   209.   51.0    4.00 0.759

# Using selection helpers
mtcars %>%
  group_by(cyl) %>%
  summarize(
    across(where(is.numeric), mean),
    .groups = "drop"
  )
#> # A tibble: 3 × 11
#>     cyl   mpg  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     4  26.7  105.  82.6  4.07  2.29  19.1 0.909 0.727  4.09  1.55
#> 2     6  19.7  183. 122.   3.59  3.12  18.0 0.571 0.429  3.86  3.43
#> 3     8  15.1  353. 209.   3.23  4.00  16.8 0     0.143  3.29  3.5

# In mutate
mtcars %>%
  group_by(cyl) %>%
  mutate(
    across(c(mpg, hp), 
           ~ . - mean(.),
           .names = "{.col}_centered")
  ) %>%
  select(cyl, mpg, mpg_centered, hp, hp_centered)
#> Error in select(., cyl, mpg, mpg_centered, hp, hp_centered): unused arguments (cyl, mpg, mpg_centered, hp, hp_centered)

# Multiple functions
mtcars %>%
  group_by(cyl) %>%
  summarize(
    across(c(mpg, hp), 
           list(min = min, max = max, mean = mean)),
    .groups = "drop"
  )
#> # A tibble: 3 × 7
#>     cyl mpg_min mpg_max mpg_mean hp_min hp_max hp_mean
#>   <dbl>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>   <dbl>
#> 1     4    21.4    33.9     26.7     52    113    82.6
#> 2     6    17.8    21.4     19.7    105    175   122. 
#> 3     8    10.4    19.2     15.1    150    335   209.

21.13 Common Pitfalls

⚠️ Common Grouping Pitfalls

# Pitfall 1: Forgetting to ungroup
bad <- mtcars %>%
  group_by(cyl) %>%
  mutate(mpg_scaled = mpg / mean(mpg))
# Still grouped!

good <- mtcars %>%
  group_by(cyl) %>%
  mutate(mpg_scaled = mpg / mean(mpg)) %>%
  ungroup()

# Pitfall 2: Summarizing already summarized data
mtcars %>%
  group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg)) %>%
  summarize(overall_avg = mean(avg_mpg))  # Wrong! Treats groups equally
#> # A tibble: 1 × 1
#>   overall_avg
#>         <dbl>
#> 1        20.5

# Correct: weight by group size
mtcars %>%
  group_by(cyl) %>%
  summarize(
    avg_mpg = mean(mpg),
    n = n(),
    .groups = "drop"
  ) %>%
  summarize(overall_avg = sum(avg_mpg * n) / sum(n))
#> # A tibble: 1 × 1
#>   overall_avg
#>         <dbl>
#> 1        20.1

# Pitfall 3: Group order affecting row_number
set.seed(123)
df <- tibble(
  group = rep(c("A", "B"), each = 3),
  value = sample(1:6)
)

# Different results!
df %>% group_by(group) %>% mutate(rn = row_number())
#> # A tibble: 6 × 3
#> # Groups:   group [2]
#>   group value    rn
#>   <chr> <int> <int>
#> 1 A         3     1
#> 2 A         6     2
#> 3 A         2     3
#> 4 B         4     1
#> 5 B         5     2
#> 6 B         1     3
df %>% arrange(value) %>% group_by(group) %>% mutate(rn = row_number())
#> # A tibble: 6 × 3
#> # Groups:   group [2]
#>   group value    rn
#>   <chr> <int> <int>
#> 1 B         1     1
#> 2 A         2     1
#> 3 A         3     2
#> 4 B         4     2
#> 5 B         5     3
#> 6 A         6     3

# Pitfall 4: Comparing grouped vs ungrouped
mtcars %>%
  group_by(cyl) %>%
  filter(mpg == max(mpg)) %>%  # Max within each cyl
  nrow()
#> [1] 3

mtcars %>%
  filter(mpg == max(mpg)) %>%  # Overall max
  nrow()
#> [1] 1

21.14 Summary

Key Takeaways:

  1. group_by() is invisible - Data looks same but behaves differently
  2. Always ungroup - Use ungroup() or .groups = "drop"
  3. summarize() reduces groups - Drops last group by default
  4. mutate() keeps all rows - Operations within groups
  5. Window functions - Work great with groups
  6. Check grouping status - Use group_vars() or is_grouped_df()
  7. across() for multiple columns - Apply same operation to many columns

Quick Reference:

Error Cause Fix
Unexpected grouped behavior Forgot data is grouped Check with group_vars(), use ungroup()
must be size 1 summarize() with vector Use single-value function or reframe()
Wrong statistics Operating on groups Ungroup or use different operation

Grouping Operations:

# Basic grouping
data %>% group_by(col)

# Multiple groups
data %>% group_by(col1, col2)

# Ungroup
data %>% ungroup()

# Check grouping
group_vars(data)
is_grouped_df(data)
n_groups(data)

# Summarize with groups
data %>%
  group_by(col) %>%
  summarize(stat = mean(value), .groups = "drop")

# Mutate with groups (keeps all rows)
data %>%
  group_by(col) %>%
  mutate(group_mean = mean(value))

# Filter with groups
data %>%
  group_by(col) %>%
  filter(value > mean(value))

# Count
data %>% count(col)
data %>% add_count(col)

# across for multiple columns
data %>%
  group_by(col) %>%
  summarize(across(where(is.numeric), mean))

Best Practices:

# ✅ Good
data %>% group_by(col) %>% 
  summarize(stat = mean(x), .groups = "drop")  # Explicit ungrouping
data %>% group_by(col) %>% ... %>% ungroup()   # Ungroup at end
Always check: group_vars(data)                  # Verify grouping

# ❌ Avoid
data %>% group_by(col) %>% summarize(...)      # No .groups argument
Leave data grouped                               # Causes confusion later
Assume data is/isn't grouped                     # Always check

21.15 Exercises

📝 Exercise 1: Group Statistics

Using mtcars: 1. Group by cyl 2. Calculate mean, median, sd of mpg per group 3. Add count and min/max mpg 4. Sort by mean mpg descending 5. Ensure result is ungrouped

📝 Exercise 2: Window Functions

Using mtcars: 1. Group by cyl 2. Add rank of mpg within each cyl 3. Add difference from group mean 4. Add percentile within group 5. Keep only top 2 per group

📝 Exercise 3: Group Detector

Write check_grouped_operations(data) that: 1. Checks if data is grouped 2. Shows what grouping variables exist 3. Warns about potential issues 4. Suggests when to ungroup

📝 Exercise 4: Grouped Summary Report

Write grouped_summary(data, group_vars, summary_vars) that: 1. Groups by specified columns 2. Summarizes multiple statistics for each summary variable 3. Includes count and percentage of total 4. Returns ungrouped, sorted result

21.16 Exercise Answers

Click to see answers

Exercise 1:

result <- mtcars %>%
  group_by(cyl) %>%
  summarize(
    count = n(),
    mean_mpg = mean(mpg),
    median_mpg = median(mpg),
    sd_mpg = sd(mpg),
    min_mpg = min(mpg),
    max_mpg = max(mpg),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_mpg))

result
#> # A tibble: 3 × 7
#>     cyl count mean_mpg median_mpg sd_mpg min_mpg max_mpg
#>   <dbl> <int>    <dbl>      <dbl>  <dbl>   <dbl>   <dbl>
#> 1     4    11     26.7       26     4.51    21.4    33.9
#> 2     6     7     19.7       19.7   1.45    17.8    21.4
#> 3     8    14     15.1       15.2   2.56    10.4    19.2

# Verify it's ungrouped
is_grouped_df(result)
#> [1] FALSE

Exercise 2:

result <- mtcars %>%
  group_by(cyl) %>%
  mutate(
    mpg_rank = rank(desc(mpg)),
    deviation_from_mean = mpg - mean(mpg),
    percentile = percent_rank(mpg)
  ) %>%
  filter(mpg_rank <= 2) %>%
  select(cyl, mpg, mpg_rank, deviation_from_mean, percentile) %>%
  arrange(cyl, mpg_rank) %>%
  ungroup()
#> Error in select(., cyl, mpg, mpg_rank, deviation_from_mean, percentile): unused arguments (cyl, mpg, mpg_rank, deviation_from_mean, percentile)

result
#> # A tibble: 3 × 7
#>     cyl count mean_mpg median_mpg sd_mpg min_mpg max_mpg
#>   <dbl> <int>    <dbl>      <dbl>  <dbl>   <dbl>   <dbl>
#> 1     4    11     26.7       26     4.51    21.4    33.9
#> 2     6     7     19.7       19.7   1.45    17.8    21.4
#> 3     8    14     15.1       15.2   2.56    10.4    19.2

Exercise 3:

check_grouped_operations <- function(data, warn_threshold = 3) {
  cat("=== Grouping Analysis ===\n\n")
  
  # Check if grouped
  if (!is_grouped_df(data)) {
    cat("✓ Data is NOT grouped\n")
    return(invisible(data))
  }
  
  # Get grouping info
  grp_vars <- group_vars(data)
  n_grps <- n_groups(data)
  
  cat("⚠ Data IS grouped\n")
  cat("Grouping variables:", paste(grp_vars, collapse = ", "), "\n")
  cat("Number of groups:", n_grps, "\n\n")
  
  # Group sizes
  grp_sizes <- data %>%
    summarize(n = n(), .groups = "drop") %>%
    pull(n)
  
  cat("Group size summary:\n")
  cat("  Min:", min(grp_sizes), "\n")
  cat("  Max:", max(grp_sizes), "\n")
  cat("  Mean:", round(mean(grp_sizes), 1), "\n\n")
  
  # Warnings
  if (length(grp_vars) > warn_threshold) {
    warning("Many grouping variables (", length(grp_vars), 
            ") may lead to many small groups")
  }
  
  if (any(grp_sizes == 1)) {
    warning("Some groups have only 1 observation")
  }
  
  # Suggestions
  cat("Suggestions:\n")
  cat("  - Use ungroup() when done with grouped operations\n")
  cat("  - Or use .groups = 'drop' in summarize()\n")
  cat("  - Check grouping with group_vars() before operations\n")
  
  invisible(data)
}

# Test
mtcars %>%
  group_by(cyl) %>%
  check_grouped_operations()
#> === Grouping Analysis ===
#> 
#> ⚠ Data IS grouped
#> Grouping variables: cyl 
#> Number of groups: 3 
#> 
#> Group size summary:
#>   Min: 7 
#>   Max: 14 
#>   Mean: 10.7 
#> 
#> Suggestions:
#>   - Use ungroup() when done with grouped operations
#>   - Or use .groups = 'drop' in summarize()
#>   - Check grouping with group_vars() before operations

mtcars %>%
  group_by(cyl, gear, carb) %>%
  check_grouped_operations()
#> === Grouping Analysis ===
#> 
#> ⚠ Data IS grouped
#> Grouping variables: cyl, gear, carb 
#> Number of groups: 12 
#> 
#> Group size summary:
#>   Min: 1 
#>   Max: 5 
#>   Mean: 2.7
#> Warning in check_grouped_operations(.): Some groups have only 1 observation
#> Suggestions:
#>   - Use ungroup() when done with grouped operations
#>   - Or use .groups = 'drop' in summarize()
#>   - Check grouping with group_vars() before operations

Exercise 4:

grouped_summary <- function(data, group_vars, summary_vars) {
  # Get total count for percentages
  total_n <- nrow(data)
  
  # Create summary
  result <- data %>%
    group_by(across(all_of(group_vars))) %>%
    summarize(
      count = n(),
      pct_of_total = n() / total_n * 100,
      across(
        all_of(summary_vars),
        list(
          mean = ~mean(., na.rm = TRUE),
          median = ~median(., na.rm = TRUE),
          sd = ~sd(., na.rm = TRUE),
          min = ~min(., na.rm = TRUE),
          max = ~max(., na.rm = TRUE)
        ),
        .names = "{.col}_{.fn}"
      ),
      .groups = "drop"
    ) %>%
    arrange(desc(count))
  
  result
}

# Test
grouped_summary(
  mtcars,
  group_vars = c("cyl"),
  summary_vars = c("mpg", "hp")
)
#> # A tibble: 3 × 13
#>     cyl count pct_of_total mpg_mean mpg_median mpg_sd mpg_min mpg_max hp_mean
#>   <dbl> <int>        <dbl>    <dbl>      <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
#> 1     8    14         43.8     15.1       15.2   2.56    10.4    19.2   209. 
#> 2     4    11         34.4     26.7       26     4.51    21.4    33.9    82.6
#> 3     6     7         21.9     19.7       19.7   1.45    17.8    21.4   122. 
#> # ℹ 4 more variables: hp_median <dbl>, hp_sd <dbl>, hp_min <dbl>, hp_max <dbl>

grouped_summary(
  mtcars,
  group_vars = c("cyl", "gear"),
  summary_vars = c("mpg", "hp", "wt")
)
#> # A tibble: 8 × 19
#>     cyl  gear count pct_of_total mpg_mean mpg_median mpg_sd mpg_min mpg_max
#>   <dbl> <dbl> <int>        <dbl>    <dbl>      <dbl>  <dbl>   <dbl>   <dbl>
#> 1     8     3    12        37.5      15.0       15.2  2.77     10.4    19.2
#> 2     4     4     8        25        26.9       25.8  4.81     21.4    33.9
#> 3     6     4     4        12.5      19.8       20.1  1.55     17.8    21  
#> 4     4     5     2         6.25     28.2       28.2  3.11     26      30.4
#> 5     6     3     2         6.25     19.8       19.8  2.33     18.1    21.4
#> 6     8     5     2         6.25     15.4       15.4  0.566    15      15.8
#> 7     4     3     1         3.12     21.5       21.5 NA        21.5    21.5
#> 8     6     5     1         3.12     19.7       19.7 NA        19.7    19.7
#> # ℹ 10 more variables: hp_mean <dbl>, hp_median <dbl>, hp_sd <dbl>,
#> #   hp_min <dbl>, hp_max <dbl>, wt_mean <dbl>, wt_median <dbl>, wt_sd <dbl>,
#> #   wt_min <dbl>, wt_max <dbl>

# With iris
grouped_summary(
  iris,
  group_vars = "Species",
  summary_vars = c("Sepal.Length", "Sepal.Width", "Petal.Length")
)
#> # A tibble: 3 × 18
#>   Species    count pct_of_total Sepal.Length_mean Sepal.Length_median
#>   <fct>      <int>        <dbl>             <dbl>               <dbl>
#> 1 setosa        50         33.3              5.01                 5  
#> 2 versicolor    50         33.3              5.94                 5.9
#> 3 virginica     50         33.3              6.59                 6.5
#> # ℹ 13 more variables: Sepal.Length_sd <dbl>, Sepal.Length_min <dbl>,
#> #   Sepal.Length_max <dbl>, Sepal.Width_mean <dbl>, Sepal.Width_median <dbl>,
#> #   Sepal.Width_sd <dbl>, Sepal.Width_min <dbl>, Sepal.Width_max <dbl>,
#> #   Petal.Length_mean <dbl>, Petal.Length_median <dbl>, Petal.Length_sd <dbl>,
#> #   Petal.Length_min <dbl>, Petal.Length_max <dbl>