# 第 22 章 列方向和行方向

## 22.1 体验新版本

New dplyr - 8 things to know:

1. Built in tidyselect
2. relocate()
3. Superpowered summarise()
4. colwise using across()
5. cur_data(), cur_group() and cur_column()
6. new rowwise() grammar
7. easy modeling inside dataframes
8. nest_by()
library(dplyr, warn.conflicts = FALSE)
library(tidyr)

## 22.2 简单回顾

mutate()
select()
filter()
group_by()
summarise()
arrange()
rename()
left_join()

## 22.3 summarise()更强大了

• 长度为 1 的向量，比如，min(x), n(), or sum(is.na(y))
• 长度为 n 的向量，比如，quantile()
• 数据框
df <- tibble(
grp = rep(c("a", "b"), each = 5),
x = c(rnorm(5, -0.25, 1), rnorm(5, 0, 1.5)),
y = c(rnorm(5, 0.25, 1), rnorm(5, 0, 0.5))
)
df
## # A tibble: 10 x 3
##    grp        x       y
##    <chr>  <dbl>   <dbl>
##  1 a     -0.665 -0.387
##  2 a     -0.270 -0.839
##  3 a      0.791  0.0371
##  4 a     -1.38  -0.144
##  5 a      0.903  0.148
##  6 b      1.55   0.143
##  7 b      1.10   0.0986
##  8 b     -0.400 -1.11
##  9 b     -2.47  -0.670
## 10 b     -0.374 -0.440
df %>%
group_by(grp) %>%
summarise(rng = mean(x))
## # A tibble: 2 x 2
##   grp      rng
##   <chr>  <dbl>
## 1 a     -0.124
## 2 b     -0.117

df %>%
group_by(grp) %>%
summarise(rng = range(x))
## # A tibble: 4 x 2
## # Groups:   grp [2]
##   grp      rng
##   <chr>  <dbl>
## 1 a     -1.38
## 2 a      0.903
## 3 b     -2.47
## 4 b      1.55

df %>%
group_by(grp) %>%
summarise(
rng = quantile(x, probs = c(0.05, 0.5, 0.95))
)
## # A tibble: 6 x 2
## # Groups:   grp [2]
##   grp      rng
##   <chr>  <dbl>
## 1 a     -1.23
## 2 a     -0.270
## 3 a      0.881
## 4 b     -2.05
## 5 b     -0.374
## 6 b      1.46
df %>%
group_by(grp) %>%
summarise(
x = quantile(x, c(0.25, 0.5, 0.75)),
q = c(0.25, 0.5, 0.75)
)
## # A tibble: 6 x 3
## # Groups:   grp [2]
##   grp        x     q
##   <chr>  <dbl> <dbl>
## 1 a     -0.665  0.25
## 2 a     -0.270  0.5
## 3 a      0.791  0.75
## 4 b     -0.400  0.25
## 5 b     -0.374  0.5
## 6 b      1.10   0.75

summarise()可以输出数据框，比如

my_quantile <- function(x, probs) {
tibble(x = quantile(x, probs), probs = probs)
}
mtcars %>%
group_by(cyl) %>%
summarise(my_quantile(disp, c(0.25, 0.75)))
## # A tibble: 6 x 3
## # Groups:   cyl [3]
##     cyl     x probs
##   <dbl> <dbl> <dbl>
## 1     4  78.8  0.25
## 2     4 121.   0.75
## 3     6 160    0.25
## 4     6 196.   0.75
## 5     8 302.   0.25
## 6     8 390    0.75

dplyr 1.0 之前是需要group_modify()来实现数据框进，数据框出

mtcars %>%
group_by(cyl) %>%
group_modify(
~ broom::tidy(lm(mpg ~ wt, data = .))
)
## # A tibble: 6 x 6
## # Groups:   cyl [3]
##     cyl term      estimate std.error statistic  p.value
##   <dbl> <chr>        <dbl>     <dbl>     <dbl>    <dbl>
## 1     4 (Interce~    39.6      4.35       9.10  7.77e-6
## 2     4 wt           -5.65     1.85      -3.05  1.37e-2
## 3     6 (Interce~    28.4      4.18       6.79  1.05e-3
## 4     6 wt           -2.78     1.33      -2.08  9.18e-2
## 5     8 (Interce~    23.9      3.01       7.94  4.05e-6
## 6     8 wt           -2.19     0.739     -2.97  1.18e-2

dplyr 1.0 之后，有了新的方案

mtcars %>%
group_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt))
)
## # A tibble: 6 x 6
## # Groups:   cyl [3]
##     cyl term      estimate std.error statistic  p.value
##   <dbl> <chr>        <dbl>     <dbl>     <dbl>    <dbl>
## 1     4 (Interce~    39.6      4.35       9.10  7.77e-6
## 2     4 wt           -5.65     1.85      -3.05  1.37e-2
## 3     6 (Interce~    28.4      4.18       6.79  1.05e-3
## 4     6 wt           -2.78     1.33      -2.08  9.18e-2
## 5     8 (Interce~    23.9      3.01       7.94  4.05e-6
## 6     8 wt           -2.19     0.739     -2.97  1.18e-2

## 22.4 summarise()后的分组信息是去是留？

group_by()summarise()配合使用的时候，summarise()默认会抵消掉最近一次的分组信息，比如下面按照cylvs分组，但summarise()后，就只剩下cyl的分组信息了。

mtcars %>%
group_by(cyl, vs) %>%
summarise(cyl_n = n())
## # A tibble: 5 x 3
## # Groups:   cyl [3]
##     cyl    vs cyl_n
##   <dbl> <dbl> <int>
## 1     4     0     1
## 2     4     1    10
## 3     6     0     3
## 4     6     1     4
## 5     8     0    14
mtcars %>%
group_by(cyl, vs) %>%
summarise(cyl_n = n()) %>%
group_vars()
## [1] "cyl"

mtcars %>%
group_by(cyl, vs) %>%
summarise(cyl_n = n(), .groups = "keep") %>%
group_vars()
## [1] "cyl" "vs"

• 丢弃所有的分组信息
mtcars %>%
group_by(cyl, vs) %>%
summarise(cyl_n = n(), .groups = "drop") %>%
group_vars()
## character(0)
• 变成行方向分组，即，每行是一个分组
mtcars %>%
group_by(cyl, vs) %>%
summarise(cyl_n = n(), .groups = "rowwise") %>%
group_vars()
## [1] "cyl" "vs"

## 22.5 选择某列

• 通过位置索引进行选取
df %>% select(1, 3)
## # A tibble: 10 x 2
##    grp         y
##    <chr>   <dbl>
##  1 a     -0.387
##  2 a     -0.839
##  3 a      0.0371
##  4 a     -0.144
##  5 a      0.148
##  6 b      0.143
##  7 b      0.0986
##  8 b     -1.11
##  9 b     -0.670
## 10 b     -0.440
df %>% select(2:3)
## # A tibble: 10 x 2
##         x       y
##     <dbl>   <dbl>
##  1 -0.665 -0.387
##  2 -0.270 -0.839
##  3  0.791  0.0371
##  4 -1.38  -0.144
##  5  0.903  0.148
##  6  1.55   0.143
##  7  1.10   0.0986
##  8 -0.400 -1.11
##  9 -2.47  -0.670
## 10 -0.374 -0.440
• 通过列名
df %>% select(grp, x, y)
## # A tibble: 10 x 3
##    grp        x       y
##    <chr>  <dbl>   <dbl>
##  1 a     -0.665 -0.387
##  2 a     -0.270 -0.839
##  3 a      0.791  0.0371
##  4 a     -1.38  -0.144
##  5 a      0.903  0.148
##  6 b      1.55   0.143
##  7 b      1.10   0.0986
##  8 b     -0.400 -1.11
##  9 b     -2.47  -0.670
## 10 b     -0.374 -0.440
df %>% select(x:y)
## # A tibble: 10 x 2
##         x       y
##     <dbl>   <dbl>
##  1 -0.665 -0.387
##  2 -0.270 -0.839
##  3  0.791  0.0371
##  4 -1.38  -0.144
##  5  0.903  0.148
##  6  1.55   0.143
##  7  1.10   0.0986
##  8 -0.400 -1.11
##  9 -2.47  -0.670
## 10 -0.374 -0.440
• 通过函数选取
df %>% select(starts_with("x"))
## # A tibble: 10 x 1
##         x
##     <dbl>
##  1 -0.665
##  2 -0.270
##  3  0.791
##  4 -1.38
##  5  0.903
##  6  1.55
##  7  1.10
##  8 -0.400
##  9 -2.47
## 10 -0.374
df %>% select(ends_with("p"))
## # A tibble: 10 x 1
##    grp
##    <chr>
##  1 a
##  2 a
##  3 a
##  4 a
##  5 a
##  6 b
##  7 b
##  8 b
##  9 b
## 10 b
df %>% select(contains("x"))
## # A tibble: 10 x 1
##         x
##     <dbl>
##  1 -0.665
##  2 -0.270
##  3  0.791
##  4 -1.38
##  5  0.903
##  6  1.55
##  7  1.10
##  8 -0.400
##  9 -2.47
## 10 -0.374
df %>% select(matches("x"))
## # A tibble: 10 x 1
##         x
##     <dbl>
##  1 -0.665
##  2 -0.270
##  3  0.791
##  4 -1.38
##  5  0.903
##  6  1.55
##  7  1.10
##  8 -0.400
##  9 -2.47
## 10 -0.374
• 通过类型
df %>% select(where(is.character))
## # A tibble: 10 x 1
##    grp
##    <chr>
##  1 a
##  2 a
##  3 a
##  4 a
##  5 a
##  6 b
##  7 b
##  8 b
##  9 b
## 10 b
df %>% select(where(is.numeric))
## # A tibble: 10 x 2
##         x       y
##     <dbl>   <dbl>
##  1 -0.665 -0.387
##  2 -0.270 -0.839
##  3  0.791  0.0371
##  4 -1.38  -0.144
##  5  0.903  0.148
##  6  1.55   0.143
##  7  1.10   0.0986
##  8 -0.400 -1.11
##  9 -2.47  -0.670
## 10 -0.374 -0.440
• 通过各种组合
df %>% select(!where(is.character))
## # A tibble: 10 x 2
##         x       y
##     <dbl>   <dbl>
##  1 -0.665 -0.387
##  2 -0.270 -0.839
##  3  0.791  0.0371
##  4 -1.38  -0.144
##  5  0.903  0.148
##  6  1.55   0.143
##  7  1.10   0.0986
##  8 -0.400 -1.11
##  9 -2.47  -0.670
## 10 -0.374 -0.440
df %>% select(where(is.numeric) & starts_with("x"))
## # A tibble: 10 x 1
##         x
##     <dbl>
##  1 -0.665
##  2 -0.270
##  3  0.791
##  4 -1.38
##  5  0.903
##  6  1.55
##  7  1.10
##  8 -0.400
##  9 -2.47
## 10 -0.374
df %>% select(starts_with("g") | ends_with("y"))
## # A tibble: 10 x 2
##    grp         y
##    <chr>   <dbl>
##  1 a     -0.387
##  2 a     -0.839
##  3 a      0.0371
##  4 a     -0.144
##  5 a      0.148
##  6 b      0.143
##  7 b      0.0986
##  8 b     -1.11
##  9 b     -0.670
## 10 b     -0.440
# 注意any_of和all_of的区别

vars <- c("x", "y", "z")
df %>% select(all_of(vars))
df %>% select(any_of(vars))

## 22.6 重命名某列

df %>% rename(group = grp)
## # A tibble: 10 x 3
##    group      x       y
##    <chr>  <dbl>   <dbl>
##  1 a     -0.665 -0.387
##  2 a     -0.270 -0.839
##  3 a      0.791  0.0371
##  4 a     -1.38  -0.144
##  5 a      0.903  0.148
##  6 b      1.55   0.143
##  7 b      1.10   0.0986
##  8 b     -0.400 -1.11
##  9 b     -2.47  -0.670
## 10 b     -0.374 -0.440
df %>% rename_with(toupper)
## # A tibble: 10 x 3
##    GRP        X       Y
##    <chr>  <dbl>   <dbl>
##  1 a     -0.665 -0.387
##  2 a     -0.270 -0.839
##  3 a      0.791  0.0371
##  4 a     -1.38  -0.144
##  5 a      0.903  0.148
##  6 b      1.55   0.143
##  7 b      1.10   0.0986
##  8 b     -0.400 -1.11
##  9 b     -2.47  -0.670
## 10 b     -0.374 -0.440
df %>% rename_with(toupper, is.numeric)
## # A tibble: 10 x 3
##    grp        X       Y
##    <chr>  <dbl>   <dbl>
##  1 a     -0.665 -0.387
##  2 a     -0.270 -0.839
##  3 a      0.791  0.0371
##  4 a     -1.38  -0.144
##  5 a      0.903  0.148
##  6 b      1.55   0.143
##  7 b      1.10   0.0986
##  8 b     -0.400 -1.11
##  9 b     -2.47  -0.670
## 10 b     -0.374 -0.440
df %>% rename_with(toupper, starts_with("x"))
## # A tibble: 10 x 3
##    grp        X       y
##    <chr>  <dbl>   <dbl>
##  1 a     -0.665 -0.387
##  2 a     -0.270 -0.839
##  3 a      0.791  0.0371
##  4 a     -1.38  -0.144
##  5 a      0.903  0.148
##  6 b      1.55   0.143
##  7 b      1.10   0.0986
##  8 b     -0.400 -1.11
##  9 b     -2.47  -0.670
## 10 b     -0.374 -0.440

## 22.7 调整列的位置

df %>% arrange(desc(abs(x)))
## # A tibble: 10 x 3
##    grp        x       y
##    <chr>  <dbl>   <dbl>
##  1 b     -2.47  -0.670
##  2 b      1.55   0.143
##  3 a     -1.38  -0.144
##  4 b      1.10   0.0986
##  5 a      0.903  0.148
##  6 a      0.791  0.0371
##  7 a     -0.665 -0.387
##  8 b     -0.400 -1.11
##  9 b     -0.374 -0.440
## 10 a     -0.270 -0.839

df %>% select(x, grp, y)
## # A tibble: 10 x 3
##         x grp         y
##     <dbl> <chr>   <dbl>
##  1 -0.665 a     -0.387
##  2 -0.270 a     -0.839
##  3  0.791 a      0.0371
##  4 -1.38  a     -0.144
##  5  0.903 a      0.148
##  6  1.55  b      0.143
##  7  1.10  b      0.0986
##  8 -0.400 b     -1.11
##  9 -2.47  b     -0.670
## 10 -0.374 b     -0.440

df %>% relocate(grp, .after = y)
## # A tibble: 10 x 3
##         x       y grp
##     <dbl>   <dbl> <chr>
##  1 -0.665 -0.387  a
##  2 -0.270 -0.839  a
##  3  0.791  0.0371 a
##  4 -1.38  -0.144  a
##  5  0.903  0.148  a
##  6  1.55   0.143  b
##  7  1.10   0.0986 b
##  8 -0.400 -1.11   b
##  9 -2.47  -0.670  b
## 10 -0.374 -0.440  b
df %>% relocate(x, .before = grp)
## # A tibble: 10 x 3
##         x grp         y
##     <dbl> <chr>   <dbl>
##  1 -0.665 a     -0.387
##  2 -0.270 a     -0.839
##  3  0.791 a      0.0371
##  4 -1.38  a     -0.144
##  5  0.903 a      0.148
##  6  1.55  b      0.143
##  7  1.10  b      0.0986
##  8 -0.400 b     -1.11
##  9 -2.47  b     -0.670
## 10 -0.374 b     -0.440

df %>% relocate(grp, .after = last_col())
## # A tibble: 10 x 3
##         x       y grp
##     <dbl>   <dbl> <chr>
##  1 -0.665 -0.387  a
##  2 -0.270 -0.839  a
##  3  0.791  0.0371 a
##  4 -1.38  -0.144  a
##  5  0.903  0.148  a
##  6  1.55   0.143  b
##  7  1.10   0.0986 b
##  8 -0.400 -1.11   b
##  9 -2.47  -0.670  b
## 10 -0.374 -0.440  b

## 22.8 强大的across函数

iris <- iris %>% as_tibble()
iris
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 140 more rows, and 1 more variable:
## #   Species <fct>
iris %>%
group_by(Species) %>%
summarise(
mean_Sepal_Length = mean(Sepal.Length),
mean_Sepal_Width = mean(Sepal.Width),
mean_Petal_Length = mean(Petal.Length),
mean_Petal_Width = mean(Petal.Width)
)
## # A tibble: 3 x 5
##   Species mean_Sepal_Leng~ mean_Sepal_Width
##   <fct>              <dbl>            <dbl>
## 1 setosa              5.01             3.43
## 2 versic~             5.94             2.77
## 3 virgin~             6.59             2.97
## # ... with 2 more variables: mean_Petal_Length <dbl>,
## #   mean_Petal_Width <dbl>

dplyr 1.0之后，使用across()函数异常简练

iris %>%
group_by(Species) %>%
summarise(
across(everything(), mean)
)
## # A tibble: 3 x 5
##   Species Sepal.Length Sepal.Width Petal.Length
##   <fct>          <dbl>       <dbl>        <dbl>
## 1 setosa          5.01        3.43         1.46
## 2 versic~         5.94        2.77         4.26
## 3 virgin~         6.59        2.97         5.55
## # ... with 1 more variable: Petal.Width <dbl>

iris %>%
group_by(Species) %>%
summarise(
across(is.numeric, mean)
)
## # A tibble: 3 x 5
##   Species Sepal.Length Sepal.Width Petal.Length
##   <fct>          <dbl>       <dbl>        <dbl>
## 1 setosa          5.01        3.43         1.46
## 2 versic~         5.94        2.77         4.26
## 3 virgin~         6.59        2.97         5.55
## # ... with 1 more variable: Petal.Width <dbl>

across(.cols = , .fns = )
• 第一个参数.cols，选取我们要需要的若干列，选取多列的语法与select()的语法一致
• 第二个参数.fns，我们要执行的函数（或者多个函数），函数的语法有三种形式可选：
• A function, e.g. mean.
• A purrr-style lambda, e.g. ~ mean(.x, na.rm = TRUE)
• A list of functions/lambdas, e.g. list(mean = mean, n_miss = ~ sum(is.na(.x))

std <- function(x) {
(x - mean(x)) / sd(x)
}

iris %>%
group_by(Species) %>%
summarise(
across(starts_with("Sepal"), std)
)
## # A tibble: 150 x 3
## # Groups:   Species [3]
##    Species Sepal.Length Sepal.Width
##    <fct>          <dbl>       <dbl>
##  1 setosa        0.267       0.190
##  2 setosa       -0.301      -1.13
##  3 setosa       -0.868      -0.601
##  4 setosa       -1.15       -0.865
##  5 setosa       -0.0170      0.454
##  6 setosa        1.12        1.25
##  7 setosa       -1.15       -0.0739
##  8 setosa       -0.0170     -0.0739
##  9 setosa       -1.72       -1.39
## 10 setosa       -0.301      -0.865
## # ... with 140 more rows
# purrr style
iris %>%
group_by(Species) %>%
summarise(
across(starts_with("Sepal"), ~ (.x - mean(.x)) / sd(.x))
)
## # A tibble: 150 x 3
## # Groups:   Species [3]
##    Species Sepal.Length Sepal.Width
##    <fct>          <dbl>       <dbl>
##  1 setosa        0.267       0.190
##  2 setosa       -0.301      -1.13
##  3 setosa       -0.868      -0.601
##  4 setosa       -1.15       -0.865
##  5 setosa       -0.0170      0.454
##  6 setosa        1.12        1.25
##  7 setosa       -1.15       -0.0739
##  8 setosa       -0.0170     -0.0739
##  9 setosa       -1.72       -1.39
## 10 setosa       -0.301      -0.865
## # ... with 140 more rows
iris %>%
group_by(Species) %>%
summarise(
across(starts_with("Petal"), list(min = min, max = max))
# across(starts_with("Petal"), list(min = min, max = max), .names = "{fn}_{col}")
)
## # A tibble: 3 x 5
##   Species Petal.Length_min Petal.Length_max
##   <fct>              <dbl>            <dbl>
## 1 setosa               1                1.9
## 2 versic~              3                5.1
## 3 virgin~              4.5              6.9
## # ... with 2 more variables: Petal.Width_min <dbl>,
## #   Petal.Width_max <dbl>
iris %>%
group_by(Species) %>%
summarise(
across(starts_with("Sepal"), mean),
Area = mean(Petal.Length * Petal.Width),
across(c(Petal.Width), min),
n = n()
)
## # A tibble: 3 x 6
##   Species Sepal.Length Sepal.Width   Area Petal.Width
##   <fct>          <dbl>       <dbl>  <dbl>       <dbl>
## 1 setosa          5.01        3.43  0.366         0.1
## 2 versic~         5.94        2.77  5.72          1
## 3 virgin~         6.59        2.97 11.3           1.4
## # ... with 1 more variable: n <int>

iris %>% mutate(across(is.numeric, mean))
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1         5.84        3.06         3.76        1.20
##  2         5.84        3.06         3.76        1.20
##  3         5.84        3.06         3.76        1.20
##  4         5.84        3.06         3.76        1.20
##  5         5.84        3.06         3.76        1.20
##  6         5.84        3.06         3.76        1.20
##  7         5.84        3.06         3.76        1.20
##  8         5.84        3.06         3.76        1.20
##  9         5.84        3.06         3.76        1.20
## 10         5.84        3.06         3.76        1.20
## # ... with 140 more rows, and 1 more variable:
## #   Species <fct>
iris %>% mutate(across(starts_with("Sepal"), mean))
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1         5.84        3.06          1.4         0.2
##  2         5.84        3.06          1.4         0.2
##  3         5.84        3.06          1.3         0.2
##  4         5.84        3.06          1.5         0.2
##  5         5.84        3.06          1.4         0.2
##  6         5.84        3.06          1.7         0.4
##  7         5.84        3.06          1.4         0.3
##  8         5.84        3.06          1.5         0.2
##  9         5.84        3.06          1.4         0.2
## 10         5.84        3.06          1.5         0.1
## # ... with 140 more rows, and 1 more variable:
## #   Species <fct>
iris %>% mutate(across(is.numeric, std)) # std function has defined before
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1       -0.898      1.02          -1.34       -1.31
##  2       -1.14      -0.132         -1.34       -1.31
##  3       -1.38       0.327         -1.39       -1.31
##  4       -1.50       0.0979        -1.28       -1.31
##  5       -1.02       1.25          -1.34       -1.31
##  6       -0.535      1.93          -1.17       -1.05
##  7       -1.50       0.786         -1.34       -1.18
##  8       -1.02       0.786         -1.28       -1.31
##  9       -1.74      -0.361         -1.34       -1.31
## 10       -1.14       0.0979        -1.28       -1.44
## # ... with 140 more rows, and 1 more variable:
## #   Species <fct>
iris %>% mutate(
across(is.numeric, ~ .x / 2),
across(is.factor, stringr::str_to_upper)
)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1         2.55        1.75         0.7         0.1
##  2         2.45        1.5          0.7         0.1
##  3         2.35        1.6          0.65        0.1
##  4         2.3         1.55         0.75        0.1
##  5         2.5         1.8          0.7         0.1
##  6         2.7         1.95         0.85        0.2
##  7         2.3         1.7          0.7         0.15
##  8         2.5         1.7          0.75        0.1
##  9         2.2         1.45         0.7         0.1
## 10         2.45        1.55         0.75        0.05
## # ... with 140 more rows, and 1 more variable:
## #   Species <chr>

## 22.9 “current” group or “current” variable

• n(), 返回当前分组的多少行
• cur_data(), 返回当前分组的数据内容（不包含分组变量）
• cur_group(), 返回当前分组的分组变量（一行一列的数据框）
• across(cur_column()), 返回当前列的列名

df <- tibble(
g = sample(rep(letters[1:3], 1:3)),
x = runif(6),
y = runif(6)
)
df
## # A tibble: 6 x 3
##   g          x     y
##   <chr>  <dbl> <dbl>
## 1 b     0.603  0.415
## 2 b     0.443  0.936
## 3 c     0.0727 0.301
## 4 c     0.749  0.888
## 5 c     0.591  0.273
## 6 a     0.278  0.259
df %>%
group_by(g) %>%
summarise(
n = n()
)
## # A tibble: 3 x 2
##   g         n
##   <chr> <int>
## 1 a         1
## 2 b         2
## 3 c         3
df %>%
group_by(g) %>%
summarise(
data = list(cur_group())
)
## # A tibble: 3 x 2
##   g     data
##   <chr> <list>
## 1 a     <tibble [1 x 1]>
## 2 b     <tibble [1 x 1]>
## 3 c     <tibble [1 x 1]>
df %>%
group_by(g) %>%
summarise(
data = list(cur_data())
)
## # A tibble: 3 x 2
##   g     data
##   <chr> <list>
## 1 a     <tibble [1 x 2]>
## 2 b     <tibble [2 x 2]>
## 3 c     <tibble [3 x 2]>
mtcars %>%
group_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt, data = cur_data()))
)
## # A tibble: 6 x 6
## # Groups:   cyl [3]
##     cyl term      estimate std.error statistic  p.value
##   <dbl> <chr>        <dbl>     <dbl>     <dbl>    <dbl>
## 1     4 (Interce~    39.6      4.35       9.10  7.77e-6
## 2     4 wt           -5.65     1.85      -3.05  1.37e-2
## 3     6 (Interce~    28.4      4.18       6.79  1.05e-3
## 4     6 wt           -2.78     1.33      -2.08  9.18e-2
## 5     8 (Interce~    23.9      3.01       7.94  4.05e-6
## 6     8 wt           -2.19     0.739     -2.97  1.18e-2
df %>%
group_by(g) %>%
mutate(across(everything(), ~ paste(cur_column(), round(.x, 2))))
## # A tibble: 6 x 3
## # Groups:   g [3]
##   g     x      y
##   <chr> <chr>  <chr>
## 1 b     x 0.6  y 0.42
## 2 b     x 0.44 y 0.94
## 3 c     x 0.07 y 0.3
## 4 c     x 0.75 y 0.89
## 5 c     x 0.59 y 0.27
## 6 a     x 0.28 y 0.26
wt <- c(x = 0.2, y = 0.8)

df %>%
mutate(
across(c(x, y), ~ .x * wt[cur_column()])
)
## # A tibble: 6 x 3
##   g          x     y
##   <chr>  <dbl> <dbl>
## 1 b     0.121  0.332
## 2 b     0.0885 0.749
## 3 c     0.0145 0.241
## 4 c     0.150  0.711
## 5 c     0.118  0.219
## 6 a     0.0555 0.207

## 22.10 行方向操作

tidyverse遵循的tidy原则，一列表示一个变量，一行表示一次观察。 这种数据的存储格式，对ggplot2很方便，但对行方向的操作或者运算不同友好。比如

### 22.10.1 行方向上的统计

df <- tibble(id = letters[1:6], w = 10:15, x = 20:25, y = 30:35, z = 40:45)
df
## # A tibble: 6 x 5
##   id        w     x     y     z
##   <chr> <int> <int> <int> <int>
## 1 a        10    20    30    40
## 2 b        11    21    31    41
## 3 c        12    22    32    42
## 4 d        13    23    33    43
## 5 e        14    24    34    44
## 6 f        15    25    35    45

df %>% mutate(avg = mean(c(w, x, y, z)))
## # A tibble: 6 x 6
##   id        w     x     y     z   avg
##   <chr> <int> <int> <int> <int> <dbl>
## 1 a        10    20    30    40  27.5
## 2 b        11    21    31    41  27.5
## 3 c        12    22    32    42  27.5
## 4 d        13    23    33    43  27.5
## 5 e        14    24    34    44  27.5
## 6 f        15    25    35    45  27.5

• 按照tidy的方法
df %>%
pivot_longer(
cols = -id,
names_to = "variable",
values_to = "value"
) %>%
group_by(id) %>%
summarize(
r_mean = mean(value)
)
## # A tibble: 6 x 2
##   id    r_mean
##   <chr>  <dbl>
## 1 a         25
## 2 b         26
## 3 c         27
## 4 d         28
## 5 e         29
## 6 f         30

• 按照Jenny Bryan的方案，使用purrr宏包的pmap_dbl函数
library(purrr)
df %>%
mutate(r_mean = pmap_dbl(select_if(., is.numeric), lift_vd(mean)))
## # A tibble: 6 x 6
##   id        w     x     y     z r_mean
##   <chr> <int> <int> <int> <int>  <dbl>
## 1 a        10    20    30    40     25
## 2 b        11    21    31    41     26
## 3 c        12    22    32    42     27
## 4 d        13    23    33    43     28
## 5 e        14    24    34    44     29
## 6 f        15    25    35    45     30

• rowwise()
df %>%
rowwise() %>%
mutate(avg = mean(c(w, x, y, z)))
## # A tibble: 6 x 6
## # Rowwise:
##   id        w     x     y     z   avg
##   <chr> <int> <int> <int> <int> <dbl>
## 1 a        10    20    30    40    25
## 2 b        11    21    31    41    26
## 3 c        12    22    32    42    27
## 4 d        13    23    33    43    28
## 5 e        14    24    34    44    29
## 6 f        15    25    35    45    30

• rowwise() + c_across()，现在dplyr 1.0终于给出了一个很好的解决方案
df %>%
rowwise() %>%
mutate(
avg = mean(c_across(w:z))
)
## # A tibble: 6 x 6
## # Rowwise:
##   id        w     x     y     z   avg
##   <chr> <int> <int> <int> <int> <dbl>
## 1 a        10    20    30    40    25
## 2 b        11    21    31    41    26
## 3 c        12    22    32    42    27
## 4 d        13    23    33    43    28
## 5 e        14    24    34    44    29
## 6 f        15    25    35    45    30

df %>%
rowwise(id) %>%
mutate(total = mean(c_across(w:z)))
## # A tibble: 6 x 6
## # Rowwise:  id
##   id        w     x     y     z total
##   <chr> <int> <int> <int> <int> <dbl>
## 1 a        10    20    30    40    25
## 2 b        11    21    31    41    26
## 3 c        12    22    32    42    27
## 4 d        13    23    33    43    28
## 5 e        14    24    34    44    29
## 6 f        15    25    35    45    30
df %>%
rowwise(id) %>%
mutate(mean = mean(c_across(is.numeric)))
## # A tibble: 6 x 6
## # Rowwise:  id
##   id        w     x     y     z  mean
##   <chr> <int> <int> <int> <int> <dbl>
## 1 a        10    20    30    40    25
## 2 b        11    21    31    41    26
## 3 c        12    22    32    42    27
## 4 d        13    23    33    43    28
## 5 e        14    24    34    44    29
## 6 f        15    25    35    45    30
df %>%
rowwise(id) %>%
summarise(
m = mean(c_across(is.numeric))
)
## # A tibble: 6 x 2
## # Groups:   id [6]
##   id        m
##   <chr> <dbl>
## 1 a        25
## 2 b        26
## 3 c        27
## 4 d        28
## 5 e        29
## 6 f        30

### 22.10.2 行方向处理与列表列是天然一对

rowwise()不仅仅用于计算行方向均值这样的简单统计，而是当处理列表列时，方才显示出rowwise()purrr::map一样的强大。那么，什么是列表列列表列指的是数据框的一列是一个列表， 比如

tb <- tibble(
x = list(1, 2:3, 4:6)
)

tb %>% mutate(l = purrr::map_int(x, length))
## # A tibble: 3 x 2
##   x             l
##   <list>    <int>
## 1 <dbl [1]>     1
## 2 <int [2]>     2
## 3 <int [3]>     3

tb %>%
rowwise() %>%
mutate(l = length(x))
## # A tibble: 3 x 2
## # Rowwise:
##   x             l
##   <list>    <int>
## 1 <dbl [1]>     1
## 2 <int [2]>     2
## 3 <int [3]>     3

### 22.10.3 行方向上的建模

mtcars <- mtcars %>% as_tibble()
mtcars
## # A tibble: 32 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0
##  2  21       6  160    110  3.9   2.88  17.0     0
##  3  22.8     4  108     93  3.85  2.32  18.6     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0
##  6  18.1     6  225    105  2.76  3.46  20.2     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0
##  8  24.4     4  147.    62  3.69  3.19  20       1
##  9  22.8     4  141.    95  3.92  3.15  22.9     1
## 10  19.2     6  168.   123  3.92  3.44  18.3     1
## # ... with 22 more rows, and 3 more variables:
## #   am <dbl>, gear <dbl>, carb <dbl>

mtcars %>%
group_by(cyl) %>%
nest()
## # A tibble: 3 x 2
## # Groups:   cyl [3]
##     cyl data
##   <dbl> <list>
## 1     6 <tibble [7 x 10]>
## 2     4 <tibble [11 x 10]>
## 3     8 <tibble [14 x 10]>

#### 22.10.3.1 列方向的做法

mtcars %>%
group_by(cyl) %>%
nest() %>%
mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
unnest(result)
## # A tibble: 6 x 8
## # Groups:   cyl [3]
##     cyl data  model term  estimate std.error statistic
##   <dbl> <lis> <lis> <chr>    <dbl>     <dbl>     <dbl>
## 1     6 <tib~ <lm>  (Int~    28.4      4.18       6.79
## 2     6 <tib~ <lm>  wt       -2.78     1.33      -2.08
## 3     4 <tib~ <lm>  (Int~    39.6      4.35       9.10
## 4     4 <tib~ <lm>  wt       -5.65     1.85      -3.05
## 5     8 <tib~ <lm>  (Int~    23.9      3.01       7.94
## 6     8 <tib~ <lm>  wt       -2.19     0.739     -2.97
## # ... with 1 more variable: p.value <dbl>

purrr::map实现列表元素一个一个的依次迭代，从数据框的角度来看（数据框是列表的一种特殊形式），因此实质上就是一行一行的处理。所以，尽管purrr很强大，但需要一定学习成本，从解决问题的路径上也比较周折。

#### 22.10.3.2 行方向的做法

mtcars %>%
nest_by(cyl) %>%
mutate(model = list(lm(mpg ~ wt, data = data))) %>%
summarise(broom::tidy(model))
## # A tibble: 6 x 6
## # Groups:   cyl [3]
##     cyl term      estimate std.error statistic  p.value
##   <dbl> <chr>        <dbl>     <dbl>     <dbl>    <dbl>
## 1     4 (Interce~    39.6      4.35       9.10  7.77e-6
## 2     4 wt           -5.65     1.85      -3.05  1.37e-2
## 3     6 (Interce~    28.4      4.18       6.79  1.05e-3
## 4     6 wt           -2.78     1.33      -2.08  9.18e-2
## 5     8 (Interce~    23.9      3.01       7.94  4.05e-6
## 6     8 wt           -2.19     0.739     -2.97  1.18e-2
# or
mtcars %>%
nest_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt, data = data))
)
## # A tibble: 6 x 6
## # Groups:   cyl [3]
##     cyl term      estimate std.error statistic  p.value
##   <dbl> <chr>        <dbl>     <dbl>     <dbl>    <dbl>
## 1     4 (Interce~    39.6      4.35       9.10  7.77e-6
## 2     4 wt           -5.65     1.85      -3.05  1.37e-2
## 3     6 (Interce~    28.4      4.18       6.79  1.05e-3
## 4     6 wt           -2.78     1.33      -2.08  9.18e-2
## 5     8 (Interce~    23.9      3.01       7.94  4.05e-6
## 6     8 wt           -2.19     0.739     -2.97  1.18e-2

mtcars %>%
group_nest(cyl) %>%
mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
tidyr::unnest(result)

mtcars %>%
group_by(cyl) %>%
group_modify(
~ broom::tidy(lm(mpg ~ wt, data = .))
)

mtcars %>%
nest_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt, data = data))
)

mtcars %>%
group_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt, data = cur_data()))
)

# or
mtcars %>%
group_by(cyl) %>%
summarise(broom::tidy(lm(mpg ~ wt)))