1.5 group_by() combined with other functions | R for data science: tidyverse and beyond

1.5 `group_by()` combined with other functions

虽然 group_by() 与 summarize() 结合使用最为常见，但分组也可以和 mutate()、filter() 和 arrange() 结合使用，以完成非常便捷的操作。

当 group_by() 和 mutate() 函数结合使用时，摘要函数 (summary functions，如 mean()、median() 等) 将会自动以分组为基础进行计算；一些非摘要函数也会受到分组的影响，如偏移函数 lead()、lag()，排秩函数 min_rank()、row_number()，以及累积函数 cumsum() 等。而普通的算术运算符 +、-，逻辑比较运算符 <、==，对数运算 log()，以及整除运算 %/% 和余数运算 %% 等将无视分组。

arrange() 默认无视分组，设置 .by_group = TRUE 可以避免这一点（此时会先按分组变量排序）。

# mutate() before and after grouping: a summary function such as mean() is
# evaluated once per group when the data is grouped.
# `df` is a small ungrouped example tibble; `gf` is the same data grouped by
# the `group` column (groups "a", "b", "c", three rows each).
df <- tibble(
  x = 1:9,
  group = rep(c("a", "b", "c"), each = 3)
)

gf <- df %>% group_by(group)

# Ungrouped: mean(x) is the overall mean, repeated on every row.
df %>% mutate(mean(x))
#> # A tibble: 9 x 3
#>       x group `mean(x)`
#>   <int> <chr>     <dbl>
#> 1     1 a             5
#> 2     2 a             5
#> 3     3 a             5
#> 4     4 b             5
#> 5     5 b             5
#> 6     6 b             5
#> # ... with 3 more rows
# Grouped: mean(x) is computed separately within each group.
gf %>% mutate(mean(x))
#> # A tibble: 9 x 3
#> # Groups:   group [3]
#>       x group `mean(x)`
#>   <int> <chr>     <dbl>
#> 1     1 a             2
#> 2     2 a             2
#> 3     3 a             2
#> 4     4 b             5
#> 5     5 b             5
#> 6     6 b             5
#> # ... with 3 more rows

# Arithmetic operators +, -, *, /, ^ are not affected by group_by().
# Ungrouped: 2 is added to every value of x.
df %>% mutate(y = x + 2)
#> # A tibble: 9 x 3
#>       x group     y
#>   <int> <chr> <dbl>
#> 1     1 a         3
#> 2     2 a         4
#> 3     3 a         5
#> 4     4 b         6
#> 5     5 b         7
#> 6     6 b         8
#> # ... with 3 more rows
# Grouped: the computed values are the same as in the ungrouped case.
gf %>% mutate(z = x + 2)
#> # A tibble: 9 x 3
#> # Groups:   group [3]
#>       x group     z
#>   <int> <chr> <dbl>
#> 1     1 a         3
#> 2     2 a         4
#> 3     3 a         5
#> 4     4 b         6
#> 5     5 b         7
#> 6     6 b         8
#> # ... with 3 more rows

# The offset functions lead() and lag() respect the groups created by
# group_by(): they only return values from within the same group.
# Ungrouped: offsets are taken across the whole column, ignoring `group`
# (e.g. row 4, the first "b" row, gets lag_x = 3 from the last "a" row).
df %>% mutate(lag_x = lag(x), lead_x = lead(x))
#> # A tibble: 9 x 4
#>       x group lag_x lead_x
#>   <int> <chr> <int>  <int>
#> 1     1 a        NA      2
#> 2     2 a         1      3
#> 3     3 a         2      4
#> 4     4 b         3      5
#> 5     5 b         4      6
#> 6     6 b         5      7
#> # ... with 3 more rows
# Grouped: offsets stop at group boundaries, so the first lag_x and the last
# lead_x of each group are NA.
gf %>% mutate(lag_x = lag(x), lead_x = lead(x))
#> # A tibble: 9 x 4
#> # Groups:   group [3]
#>       x group lag_x lead_x
#>   <int> <chr> <int>  <int>
#> 1     1 a        NA      2
#> 2     2 a         1      3
#> 3     3 a         2     NA
#> 4     4 b        NA      5
#> 5     5 b         4      6
#> 6     6 b         5     NA
#> # ... with 3 more rows

# The cumulative and rolling aggregate functions cumsum(), cumprod(), cummin(), cummax(), and cummean() calculate values within each group.
# Ungrouped: the running total accumulates across all rows.
df %>% mutate(cumsum(x))
#> # A tibble: 9 x 3
#>       x group `cumsum(x)`
#>   <int> <chr>       <int>
#> 1     1 a               1
#> 2     2 a               3
#> 3     3 a               6
#> 4     4 b              10
#> 5     5 b              15
#> 6     6 b              21
#> # ... with 3 more rows
# Grouped: the running total restarts at the first row of each group.
gf %>% mutate(cumsum(x))
#> # A tibble: 9 x 3
#> # Groups:   group [3]
#>       x group `cumsum(x)`
#>   <int> <chr>       <int>
#> 1     1 a               1
#> 2     2 a               3
#> 3     3 a               6
#> 4     4 b               4
#> 5     5 b               9
#> 6     6 b              15
#> # ... with 3 more rows

# Logical comparisons <, <=, >, >=, !=, and == are not affected by group_by().
# Ungrouped: every value of x is compared with 0.5 element by element.
df %>% mutate(x > 0.5)
#> # A tibble: 9 x 3
#>       x group `x > 0.5`
#>   <int> <chr> <lgl>    
#> 1     1 a     TRUE     
#> 2     2 a     TRUE     
#> 3     3 a     TRUE     
#> 4     4 b     TRUE     
#> 5     5 b     TRUE     
#> 6     6 b     TRUE     
#> # ... with 3 more rows
# Grouped: the comparison results are the same as in the ungrouped case.
gf %>% mutate(x > 0.5)
#> # A tibble: 9 x 3
#> # Groups:   group [3]
#>       x group `x > 0.5`
#>   <int> <chr> <lgl>    
#> 1     1 a     TRUE     
#> 2     2 a     TRUE     
#> 3     3 a     TRUE     
#> 4     4 b     TRUE     
#> 5     5 b     TRUE     
#> 6     6 b     TRUE     
#> # ... with 3 more rows

# Ranking functions like min_rank() work within each group when used with group_by().
# Ungrouped: x is ranked across the whole column.
df %>% mutate(min_rank(x))
#> # A tibble: 9 x 3
#>       x group `min_rank(x)`
#>   <int> <chr>         <int>
#> 1     1 a                 1
#> 2     2 a                 2
#> 3     3 a                 3
#> 4     4 b                 4
#> 5     5 b                 5
#> 6     6 b                 6
#> # ... with 3 more rows
# Grouped: ranks restart from 1 inside every group.
gf %>% mutate(min_rank(x))
#> # A tibble: 9 x 3
#> # Groups:   group [3]
#>       x group `min_rank(x)`
#>   <int> <chr>         <int>
#> 1     1 a                 1
#> 2     2 a                 2
#> 3     3 a                 3
#> 4     4 b                 1
#> 5     5 b                 2
#> 6     6 b                 3
#> # ... with 3 more rows

# filter() works in a similar way: the filtering expression is evaluated
# per group when the data is grouped.
# Ungrouped: keeps the single row with the smallest x overall.
df %>% filter(min_rank(x) == 1)
#> # A tibble: 1 x 2
#>       x group
#>   <int> <chr>
#> 1     1 a
# Grouped: keeps the row with the smallest x within each group.
gf %>% filter(min_rank(x) == 1)
#> # A tibble: 3 x 2
#> # Groups:   group [3]
#>       x group
#>   <int> <chr>
#> 1     1 a    
#> 2     4 b    
#> 3     7 c

# arrange() ignores groups when sorting values.
# `df` is redefined here with random x values. NOTE: no seed is set in this
# snippet, so the numbers printed below will differ from run to run.
df <- tibble(
  x = runif(9),
  group = rep(c("a", "b", "c"), each = 3)
) 

# Even though the data is grouped, rows are ordered by x alone, so rows from
# different groups are interleaved in the result.
df %>% 
  group_by(group) %>%
  arrange(x)
#> # A tibble: 9 x 2
#> # Groups:   group [3]
#>        x group
#>    <dbl> <chr>
#> 1 0.0562 c    
#> 2 0.337  a    
#> 3 0.514  b    
#> 4 0.523  b    
#> 5 0.525  a    
#> 6 0.557  b    
#> # ... with 3 more rows

# With .by_group = TRUE, arrange() sorts by the grouping variable first and
# then by x, so each group's rows stay together and are ordered within the group.
df %>% 
  group_by(group) %>% 
  arrange(x, .by_group = TRUE)
#> # A tibble: 9 x 2
#> # Groups:   group [3]
#>       x group
#>   <dbl> <chr>
#> 1 0.337 a    
#> 2 0.525 a    
#> 3 0.785 a    
#> 4 0.514 b    
#> 5 0.523 b    
#> 6 0.557 b    
#> # ... with 3 more rows