1.5 group_by()
combined with other functions
虽然 group_by() 与 summarize() 结合使用是最常见的,但分组也可以和 mutate()、filter() 和 arrange() 结合使用,以完成非常便捷的操作。
当 group_by() 和 mutate() 函数结合使用时,摘要函数(summary functions,如 mean()、median() 等)将会自动以分组为基础;一些非摘要函数也会受到分组的影响,如偏移函数 lead()、lag(),排秩函数 min_rank()、row_number(),以及累积函数 cumsum() 等。而普通的算术运算符 +、-,逻辑比较运算符 <、==,对数运算 log(),以及整除运算 %/% 和余数运算 %% 等将无视分组。
arrange() 默认无视分组,设置 .by_group = TRUE 可以避免这一点(此时会先按分组变量排序,再在组内排序)。
# mutate() before and after grouping.
# On the ungrouped tibble, mean(x) is the mean of the whole column; on the
# grouped tibble it is computed separately within each group.
df <- tibble(
  x = 1:9,
  group = rep(c("a", "b", "c"), each = 3)
)
gf <- group_by(df, group)
mutate(df, mean(x))
#> # A tibble: 9 x 3
#> x group `mean(x)`
#> <int> <chr> <dbl>
#> 1 1 a 5
#> 2 2 a 5
#> 3 3 a 5
#> 4 4 b 5
#> 5 5 b 5
#> 6 6 b 5
#> # ... with 3 more rows
mutate(gf, mean(x))
#> # A tibble: 9 x 3
#> # Groups: group [3]
#> x group `mean(x)`
#> <int> <chr> <dbl>
#> 1 1 a 2
#> 2 2 a 2
#> 3 3 a 2
#> 4 4 b 5
#> 5 5 b 5
#> 6 6 b 5
#> # ... with 3 more rows
# Arithmetic operators (+, -, *, /, ^) are not affected by group_by():
# the grouped and ungrouped results below contain the same values.
mutate(df, y = x + 2)
#> # A tibble: 9 x 3
#> x group y
#> <int> <chr> <dbl>
#> 1 1 a 3
#> 2 2 a 4
#> 3 3 a 5
#> 4 4 b 6
#> 5 5 b 7
#> 6 6 b 8
#> # ... with 3 more rows
mutate(gf, z = x + 2)
#> # A tibble: 9 x 3
#> # Groups: group [3]
#> x group z
#> <int> <chr> <dbl>
#> 1 1 a 3
#> 2 2 a 4
#> 3 3 a 5
#> 4 4 b 6
#> 5 5 b 7
#> 6 6 b 8
#> # ... with 3 more rows
# The offset functions lead() and lag() respect the grouping set by
# group_by(): they only return values from within the same group, so in the
# grouped output lag_x is NA on the first row of each group and lead_x is NA
# on the last row of each group.
mutate(df, lag_x = lag(x), lead_x = lead(x))
#> # A tibble: 9 x 4
#> x group lag_x lead_x
#> <int> <chr> <int> <int>
#> 1 1 a NA 2
#> 2 2 a 1 3
#> 3 3 a 2 4
#> 4 4 b 3 5
#> 5 5 b 4 6
#> 6 6 b 5 7
#> # ... with 3 more rows
mutate(gf, lag_x = lag(x), lead_x = lead(x))
#> # A tibble: 9 x 4
#> # Groups: group [3]
#> x group lag_x lead_x
#> <int> <chr> <int> <int>
#> 1 1 a NA 2
#> 2 2 a 1 3
#> 3 3 a 2 NA
#> 4 4 b NA 5
#> 5 5 b 4 6
#> 6 6 b 5 NA
#> # ... with 3 more rows
# The cumulative and rolling aggregate functions cumsum(), cumprod(),
# cummin(), cummax(), and cummean() calculate values within each group:
# in the grouped output the running sum starts over at the first row of
# group "b".
mutate(df, cumsum(x))
#> # A tibble: 9 x 3
#> x group `cumsum(x)`
#> <int> <chr> <int>
#> 1 1 a 1
#> 2 2 a 3
#> 3 3 a 6
#> 4 4 b 10
#> 5 5 b 15
#> 6 6 b 21
#> # ... with 3 more rows
mutate(gf, cumsum(x))
#> # A tibble: 9 x 3
#> # Groups: group [3]
#> x group `cumsum(x)`
#> <int> <chr> <int>
#> 1 1 a 1
#> 2 2 a 3
#> 3 3 a 6
#> 4 4 b 4
#> 5 5 b 9
#> 6 6 b 15
#> # ... with 3 more rows
# Logical comparisons (<, <=, >, >=, != and ==) are not affected by
# group_by(): the grouped and ungrouped results below are the same.
mutate(df, x > 0.5)
#> # A tibble: 9 x 3
#> x group `x > 0.5`
#> <int> <chr> <lgl>
#> 1 1 a TRUE
#> 2 2 a TRUE
#> 3 3 a TRUE
#> 4 4 b TRUE
#> 5 5 b TRUE
#> 6 6 b TRUE
#> # ... with 3 more rows
mutate(gf, x > 0.5)
#> # A tibble: 9 x 3
#> # Groups: group [3]
#> x group `x > 0.5`
#> <int> <chr> <lgl>
#> 1 1 a TRUE
#> 2 2 a TRUE
#> 3 3 a TRUE
#> 4 4 b TRUE
#> 5 5 b TRUE
#> 6 6 b TRUE
#> # ... with 3 more rows
# Ranking functions like min_rank() work within each group when used with
# group_by(): in the grouped output the ranks restart at 1 in group "b".
mutate(df, min_rank(x))
#> # A tibble: 9 x 3
#> x group `min_rank(x)`
#> <int> <chr> <int>
#> 1 1 a 1
#> 2 2 a 2
#> 3 3 a 3
#> 4 4 b 4
#> 5 5 b 5
#> 6 6 b 6
#> # ... with 3 more rows
mutate(gf, min_rank(x))
#> # A tibble: 9 x 3
#> # Groups: group [3]
#> x group `min_rank(x)`
#> <int> <chr> <int>
#> 1 1 a 1
#> 2 2 a 2
#> 3 3 a 3
#> 4 4 b 1
#> 5 5 b 2
#> 6 6 b 3
#> # ... with 3 more rows
# filter() works in a similar way: on the grouped tibble the condition is
# evaluated within each group, so the lowest-ranked row of every group is
# kept instead of a single row overall.
filter(df, min_rank(x) == 1)
#> # A tibble: 1 x 2
#> x group
#> <int> <chr>
#> 1 1 a
filter(gf, min_rank(x) == 1)
#> # A tibble: 3 x 2
#> # Groups: group [3]
#> x group
#> <int> <chr>
#> 1 1 a
#> 2 4 b
#> 3 7 c
# arrange() ignores groups when sorting values: the rows below are ordered
# by x alone, so the groups end up interleaved.
# NOTE(review): no seed is set in this snippet, so runif() will presumably
# produce different values (and a different row order) on every run.
df <- tibble(
  x = runif(9),
  group = rep(c("a", "b", "c"), each = 3)
)
df %>%
  group_by(group) %>%
  arrange(x)
#> # A tibble: 9 x 2
#> # Groups: group [3]
#> x group
#> <dbl> <chr>
#> 1 0.0562 c
#> 2 0.337 a
#> 3 0.514 b
#> 4 0.523 b
#> 5 0.525 a
#> 6 0.557 b
#> # ... with 3 more rows
# With .by_group = TRUE, arrange() sorts by the grouping variable first and
# then by x, so the rows of each group stay together.
df %>%
  group_by(group) %>%
  arrange(x, .by_group = TRUE)
#> # A tibble: 9 x 2
#> # Groups: group [3]
#> x group
#> <dbl> <chr>
#> 1 0.337 a
#> 2 0.525 a
#> 3 0.785 a
#> 4 0.514 b
#> 5 0.523 b
#> 6 0.557 b
#> # ... with 3 more rows