# 第 21 章 tidyverse进阶

library(tidyverse)

## 21.1 scoped 函数

• 对数据框所有列操作，可以用_all
• 对数据框指定的几列操作，可以用_at
• 对数据框符合条件的几列进行操作，可以用_if
Operate _all _at _if
select() select_all() select_at() select_if()
mutate() mutate_all() mutate_at() mutate_if()
rename() rename_all() rename_at() rename_if()
arrange() arrange_all() arrange_at() arrange_if()
filter() filter_all() filter_at() filter_if()
distinct() distinct_all() distinct_at() distinct_if()
group_by() group_by_all() group_by_at() group_by_if()
summarise() summarise_all() summarise_at() summarise_if()
map() map_all() map_at() map_if()
modify() modify_all() modify_at() modify_if()

### 21.1.1 mutate_if

iris <- iris %>% as_tibble()

df_iris <- iris %>% head(5)
df_iris %>% mutate_if(is.double, as.integer)
## # A tibble: 5 x 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <int>       <int>        <int>       <int>
## 1            5           3            1           0
## 2            4           3            1           0
## 3            4           3            1           0
## 4            4           3            1           0
## 5            5           3            1           0
## # ... with 1 more variable: Species <fct>

df_iris %>% mutate_if(is.numeric, list(scale, log))
## # A tibble: 5 x 13
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3            1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5           3.6          1.4         0.2
## # ... with 9 more variables: Species <fct>,
## #   Sepal.Length_fn1[,1] <dbl>,
## #   Sepal.Width_fn1[,1] <dbl>,
## #   Petal.Length_fn1[,1] <dbl>,
## #   Petal.Width_fn1[,1] <dbl>, Sepal.Length_fn2 <dbl>,
## #   Sepal.Width_fn2 <dbl>, Petal.Length_fn2 <dbl>,
## #   Petal.Width_fn2 <dbl>

df_iris %>% mutate_if(is.numeric, list(~ scale(.), ~ log(.)))
## # A tibble: 5 x 13
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3            1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5           3.6          1.4         0.2
## # ... with 9 more variables: Species <fct>,
## #   Sepal.Length_scale[,1] <dbl>,
## #   Sepal.Width_scale[,1] <dbl>,
## #   Petal.Length_scale[,1] <dbl>,
## #   Petal.Width_scale[,1] <dbl>,
## #   Sepal.Length_log <dbl>, Sepal.Width_log <dbl>,
## #   Petal.Length_log <dbl>, Petal.Width_log <dbl>

### 21.1.2 select_if()

df <- tibble::tibble(
x = letters[1:3],
y = c(1:3),
z = c(0, 0, 0)
)
df
## # A tibble: 3 x 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
df %>% select_if(is.numeric)
## # A tibble: 3 x 2
##       y     z
##   <int> <dbl>
## 1     1     0
## 2     2     0
## 3     3     0
df %>% select_if(~ n_distinct(.) > 2)
## # A tibble: 3 x 2
##   x         y
##   <chr> <int>
## 1 a         1
## 2 b         2
## 3 c         3

select_if 多个条件的情况

df %>% select_if(
list(~ (is.numeric(.) | is.character(.)))
)
## # A tibble: 3 x 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
df %>% select_if(
~ (is.numeric(.) | is.character(.))
)
## # A tibble: 3 x 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
to_keep <- function(x) is.numeric(x) | is.character(x)
df %>% select_if(to_keep)
## # A tibble: 3 x 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
df %>% select_if(
list(~ (is.numeric(.) && sum(.) > 2))
)
## # A tibble: 3 x 1
##       y
##   <int>
## 1     1
## 2     2
## 3     3
df %>% select_if(
list(~ (is.numeric(.) && mean(.) > 1))
)
## # A tibble: 3 x 1
##       y
##   <int>
## 1     1
## 2     2
## 3     3

to_want <- function(x) is.numeric(x) && sum(x) > 3

df %>% select_if(to_want)
## # A tibble: 3 x 1
##       y
##   <int>
## 1     1
## 2     2
## 3     3

## 21.2 summarise_if

msleep <- ggplot2::msleep
msleep %>%
dplyr::group_by(vore) %>%
dplyr::summarise_all(~ mean(., na.rm = TRUE))
## # A tibble: 5 x 11
##   vore   name genus order conservation sleep_total
##   <chr> <dbl> <dbl> <dbl>        <dbl>       <dbl>
## 1 carni    NA    NA    NA           NA       10.4
## 2 herbi    NA    NA    NA           NA        9.51
## 3 inse~    NA    NA    NA           NA       14.9
## 4 omni     NA    NA    NA           NA       10.9
## 5 <NA>     NA    NA    NA           NA       10.2
## # ... with 5 more variables: sleep_rem <dbl>,
## #   sleep_cycle <dbl>, awake <dbl>, brainwt <dbl>,
## #   bodywt <dbl>
msleep <- ggplot2::msleep
msleep %>%
dplyr::group_by(vore) %>%
# summarise_if(is.numeric, ~mean(., na.rm = TRUE))
dplyr::summarise_if(is.numeric, mean, na.rm = TRUE)
## # A tibble: 5 x 7
##   vore  sleep_total sleep_rem sleep_cycle awake brainwt
##   <chr>       <dbl>     <dbl>       <dbl> <dbl>   <dbl>
## 1 carni       10.4       2.29       0.373 13.6  0.0793
## 2 herbi        9.51      1.37       0.418 14.5  0.622
## 3 inse~       14.9       3.52       0.161  9.06 0.0216
## 4 omni        10.9       1.96       0.592 13.1  0.146
## 5 <NA>        10.2       1.88       0.183 13.8  0.00763
## # ... with 1 more variable: bodywt <dbl>

## 21.3 filter_if()

msleep <- ggplot2::msleep
msleep %>%
dplyr::select(name, sleep_total) %>%
dplyr::filter(sleep_total > 18)
## # A tibble: 4 x 2
##   name                 sleep_total
##   <chr>                      <dbl>
## 1 Big brown bat               19.7
## 2 Thick-tailed opposum        19.4
## 3 Little brown bat            19.9
## 4 Giant armadillo             18.1
msleep %>%
dplyr::select(name, sleep_total) %>%
dplyr::filter(between(sleep_total, 16, 18))
## # A tibble: 4 x 2
##   name                   sleep_total
##   <chr>                        <dbl>
## 1 Owl monkey                    17
## 3 North American Opossum        18
## 4 Arctic ground squirrel        16.6
msleep %>%
dplyr::select(name, sleep_total) %>%
# filter(near(sleep_total, 17,  tol=sd(sleep_total)))
dplyr::filter(near(sleep_total, mean(sleep_total), tol = 0.5 * sd(sleep_total)))
## # A tibble: 35 x 2
##    name                      sleep_total
##    <chr>                           <dbl>
##  1 Cheetah                          12.1
##  2 Northern fur seal                 8.7
##  3 Dog                              10.1
##  4 Guinea pig                        9.4
##  5 Grivet                           10
##  6 Chinchilla                       12.5
##  7 Star-nosed mole                  10.3
##  8 African giant pouched rat         8.3
##  9 Lesser short-tailed shrew         9.1
## 10 European hedgehog                10.1
## # ... with 25 more rows

mtcars是 R内置数据集，记录了32种不同品牌的轿车的的11个属性

mtcars <- mtcars %>% as_tibble()
mtcars
## # A tibble: 32 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0
##  2  21       6  160    110  3.9   2.88  17.0     0
##  3  22.8     4  108     93  3.85  2.32  18.6     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0
##  6  18.1     6  225    105  2.76  3.46  20.2     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0
##  8  24.4     4  147.    62  3.69  3.19  20       1
##  9  22.8     4  141.    95  3.92  3.15  22.9     1
## 10  19.2     6  168.   123  3.92  3.44  18.3     1
## # ... with 22 more rows, and 3 more variables:
## #   am <dbl>, gear <dbl>, carb <dbl>

filter_if()配合all_vars(), any_vars()函数，可以完成很酷的工作. 比如，要求一行中所有变量的值都大于150

mtcars %>% filter_all(all_vars(. > 150))
## # A tibble: 0 x 11
## # ... with 11 variables: mpg <dbl>, cyl <dbl>,
## #   disp <dbl>, hp <dbl>, drat <dbl>, wt <dbl>,
## #   qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>,
## #   carb <dbl>

# Or the union:
mtcars %>% filter_all(any_vars(. > 150))
## # A tibble: 21 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0
##  2  21       6  160    110  3.9   2.88  17.0     0
##  3  21.4     6  258    110  3.08  3.22  19.4     1
##  4  18.7     8  360    175  3.15  3.44  17.0     0
##  5  18.1     6  225    105  2.76  3.46  20.2     1
##  6  14.3     8  360    245  3.21  3.57  15.8     0
##  7  19.2     6  168.   123  3.92  3.44  18.3     1
##  8  17.8     6  168.   123  3.92  3.44  18.9     1
##  9  16.4     8  276.   180  3.07  4.07  17.4     0
## 10  17.3     8  276.   180  3.07  3.73  17.6     0
## # ... with 11 more rows, and 3 more variables:
## #   am <dbl>, gear <dbl>, carb <dbl>
# You can vary the selection of columns on which to apply the predicate.
# filter_at() takes a vars() specification:
mtcars %>% filter_at(vars(starts_with("d")), any_vars((. %% 2) == 0))
## # A tibble: 13 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6   160   110  3.9   2.62  16.5     0
##  2  21       6   160   110  3.9   2.88  17.0     0
##  3  22.8     4   108    93  3.85  2.32  18.6     1
##  4  21.4     6   258   110  3.08  3.22  19.4     1
##  5  18.7     8   360   175  3.15  3.44  17.0     0
##  6  14.3     8   360   245  3.21  3.57  15.8     0
##  7  10.4     8   472   205  2.93  5.25  18.0     0
##  8  10.4     8   460   215  3     5.42  17.8     0
##  9  14.7     8   440   230  3.23  5.34  17.4     0
## 10  15.5     8   318   150  2.76  3.52  16.9     0
## 11  15.2     8   304   150  3.15  3.44  17.3     0
## 12  13.3     8   350   245  3.73  3.84  15.4     0
## 13  19.2     8   400   175  3.08  3.84  17.0     0
## # ... with 3 more variables: am <dbl>, gear <dbl>,
## #   carb <dbl>

filter_if(.tbl, .predicate, .vars_predicate) 相对复杂点，我这里多说几句。

filter_if() 有三个参数：

• .tbl, 数据框
• .predicate, 应用在列上的函数，一般作为列的选择条件
• .vars_predicate, 应用在一行上的函数，通过 all_vars(), any_vars()返回值决定是否选取该行。
# And filter_if() selects variables with a predicate function:
# filter_if(.tbl, .predicate, .vars_predicate)
# mtcars %>% map_df(~ all(floor(.) == .) )
# mtcars %>% select_if( ~ all(floor(.) == .) )

mtcars %>% filter_if(~ all(floor(.) == .), all_vars(. != 0))
## # A tibble: 7 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  22.8     4 108      93  3.85  2.32  18.6     1     1
## 2  32.4     4  78.7    66  4.08  2.2   19.5     1     1
## 3  30.4     4  75.7    52  4.93  1.62  18.5     1     1
## 4  33.9     4  71.1    65  4.22  1.84  19.9     1     1
## 5  27.3     4  79      66  4.08  1.94  18.9     1     1
## 6  30.4     4  95.1   113  3.77  1.51  16.9     1     1
## 7  21.4     4 121     109  4.11  2.78  18.6     1     1
## # ... with 2 more variables: gear <dbl>, carb <dbl>

## 21.4 group_by

group_by() 用的很多，所以要多讲讲

mtcars %>% dplyr::group_by(cyl)
## # A tibble: 32 x 11
## # Groups:   cyl [3]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0
##  2  21       6  160    110  3.9   2.88  17.0     0
##  3  22.8     4  108     93  3.85  2.32  18.6     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0
##  6  18.1     6  225    105  2.76  3.46  20.2     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0
##  8  24.4     4  147.    62  3.69  3.19  20       1
##  9  22.8     4  141.    95  3.92  3.15  22.9     1
## 10  19.2     6  168.   123  3.92  3.44  18.3     1
## # ... with 22 more rows, and 3 more variables:
## #   am <dbl>, gear <dbl>, carb <dbl>
mtcars %>% group_by_at(vars(cyl))
## # A tibble: 32 x 11
## # Groups:   cyl [3]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0
##  2  21       6  160    110  3.9   2.88  17.0     0
##  3  22.8     4  108     93  3.85  2.32  18.6     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0
##  6  18.1     6  225    105  2.76  3.46  20.2     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0
##  8  24.4     4  147.    62  3.69  3.19  20       1
##  9  22.8     4  141.    95  3.92  3.15  22.9     1
## 10  19.2     6  168.   123  3.92  3.44  18.3     1
## # ... with 22 more rows, and 3 more variables:
## #   am <dbl>, gear <dbl>, carb <dbl>
# Group a data frame by all variables:
mtcars %>% group_by_all()
## # A tibble: 32 x 11
## # Groups:   mpg, cyl, disp, hp, drat, wt, qsec, vs,
## #   am, gear, carb [32]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0
##  2  21       6  160    110  3.9   2.88  17.0     0
##  3  22.8     4  108     93  3.85  2.32  18.6     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0
##  6  18.1     6  225    105  2.76  3.46  20.2     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0
##  8  24.4     4  147.    62  3.69  3.19  20       1
##  9  22.8     4  141.    95  3.92  3.15  22.9     1
## 10  19.2     6  168.   123  3.92  3.44  18.3     1
## # ... with 22 more rows, and 3 more variables:
## #   am <dbl>, gear <dbl>, carb <dbl>
# Group by variables selected with a predicate:
iris %>% group_by_if(is.factor)
## # A tibble: 150 x 5
## # Groups:   Species [3]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 140 more rows, and 1 more variable:
## #   Species <fct>

### 21.4.1 group_split(), group_map(), group_modify()

iris %>%
dplyr::group_by(Species) %>%
dplyr::group_split()
## <list_of<
##   tbl_df<
##     Sepal.Length: double
##     Sepal.Width : double
##     Petal.Length: double
##     Petal.Width : double
##     Species     : factor<12d60>
##   >
## >[3]>
## [[1]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>
##
## [[2]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          7           3.2          4.7         1.4
##  2          6.4         3.2          4.5         1.5
##  3          6.9         3.1          4.9         1.5
##  4          5.5         2.3          4           1.3
##  5          6.5         2.8          4.6         1.5
##  6          5.7         2.8          4.5         1.3
##  7          6.3         3.3          4.7         1.6
##  8          4.9         2.4          3.3         1
##  9          6.6         2.9          4.6         1.3
## 10          5.2         2.7          3.9         1.4
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>
##
## [[3]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          6.3         3.3          6           2.5
##  2          5.8         2.7          5.1         1.9
##  3          7.1         3            5.9         2.1
##  4          6.3         2.9          5.6         1.8
##  5          6.5         3            5.8         2.2
##  6          7.6         3            6.6         2.1
##  7          4.9         2.5          4.5         1.7
##  8          7.3         2.9          6.3         1.8
##  9          6.7         2.5          5.8         1.8
## 10          7.2         3.6          6.1         2.5
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>

iris %>%
dplyr::group_split(Species)
## <list_of<
##   tbl_df<
##     Sepal.Length: double
##     Sepal.Width : double
##     Petal.Length: double
##     Petal.Width : double
##     Species     : factor<12d60>
##   >
## >[3]>
## [[1]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>
##
## [[2]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          7           3.2          4.7         1.4
##  2          6.4         3.2          4.5         1.5
##  3          6.9         3.1          4.9         1.5
##  4          5.5         2.3          4           1.3
##  5          6.5         2.8          4.6         1.5
##  6          5.7         2.8          4.5         1.3
##  7          6.3         3.3          4.7         1.6
##  8          4.9         2.4          3.3         1
##  9          6.6         2.9          4.6         1.3
## 10          5.2         2.7          3.9         1.4
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>
##
## [[3]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          6.3         3.3          6           2.5
##  2          5.8         2.7          5.1         1.9
##  3          7.1         3            5.9         2.1
##  4          6.3         2.9          5.6         1.8
##  5          6.5         3            5.8         2.2
##  6          7.6         3            6.6         2.1
##  7          4.9         2.5          4.5         1.7
##  8          7.3         2.9          6.3         1.8
##  9          6.7         2.5          5.8         1.8
## 10          7.2         3.6          6.1         2.5
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>

iris %>%
dplyr::group_split(Species)
## <list_of<
##   tbl_df<
##     Sepal.Length: double
##     Sepal.Width : double
##     Petal.Length: double
##     Petal.Width : double
##     Species     : factor<12d60>
##   >
## >[3]>
## [[1]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>
##
## [[2]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          7           3.2          4.7         1.4
##  2          6.4         3.2          4.5         1.5
##  3          6.9         3.1          4.9         1.5
##  4          5.5         2.3          4           1.3
##  5          6.5         2.8          4.6         1.5
##  6          5.7         2.8          4.5         1.3
##  7          6.3         3.3          4.7         1.6
##  8          4.9         2.4          3.3         1
##  9          6.6         2.9          4.6         1.3
## 10          5.2         2.7          3.9         1.4
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>
##
## [[3]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          6.3         3.3          6           2.5
##  2          5.8         2.7          5.1         1.9
##  3          7.1         3            5.9         2.1
##  4          6.3         2.9          5.6         1.8
##  5          6.5         3            5.8         2.2
##  6          7.6         3            6.6         2.1
##  7          4.9         2.5          4.5         1.7
##  8          7.3         2.9          6.3         1.8
##  9          6.7         2.5          5.8         1.8
## 10          7.2         3.6          6.1         2.5
## # ... with 40 more rows, and 1 more variable:
## #   Species <fct>

iris %>%
dplyr::group_split(Species) %>%
purrr::map(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## [[1]]
## # A tibble: 2 x 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)     0.803    0.344       2.34  0.0238
## 2 Sepal.Length    0.132    0.0685      1.92  0.0607
##
## [[2]]
## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.185    0.514      0.360 7.20e- 1
## 2 Sepal.Length    0.686    0.0863     7.95  2.59e-10
##
## [[3]]
## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.610    0.417       1.46 1.50e- 1
## 2 Sepal.Length    0.750    0.0630     11.9  6.30e-16
iris %>%
dplyr::group_split(Species) %>%
purrr::map_df(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## # A tibble: 6 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.803    0.344      2.34  2.38e- 2
## 2 Sepal.Length    0.132    0.0685     1.92  6.07e- 2
## 3 (Intercept)     0.185    0.514      0.360 7.20e- 1
## 4 Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 5 (Intercept)     0.610    0.417      1.46  1.50e- 1
## 6 Sepal.Length    0.750    0.0630    11.9   6.30e-16

## The result of .f should be a data frame(.f 必须返回数据框)
## group_map() return a list of tibble(返回元素均为df的一个列表list(df1,df2,df3))
iris %>%
dplyr::group_by(Species) %>%
dplyr::group_map(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## [[1]]
## # A tibble: 2 x 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)     0.803    0.344       2.34  0.0238
## 2 Sepal.Length    0.132    0.0685      1.92  0.0607
##
## [[2]]
## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.185    0.514      0.360 7.20e- 1
## 2 Sepal.Length    0.686    0.0863     7.95  2.59e-10
##
## [[3]]
## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.610    0.417       1.46 1.50e- 1
## 2 Sepal.Length    0.750    0.0630     11.9  6.30e-16

iris %>%
dplyr::group_by(Species) %>%
dplyr::group_map(
~ lm(Petal.Length ~ Sepal.Length, data = .x)
)
## [[1]]
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
##
## Coefficients:
##  (Intercept)  Sepal.Length
##        0.803         0.132
##
##
## [[2]]
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
##
## Coefficients:
##  (Intercept)  Sepal.Length
##        0.185         0.686
##
##
## [[3]]
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
##
## Coefficients:
##  (Intercept)  Sepal.Length
##         0.61          0.75

group_modify() 才是真正意义上的“数据框进、数据框出”。

iris %>%
dplyr::group_by(Species) %>%
dplyr::group_modify(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## # A tibble: 6 x 6
## # Groups:   Species [3]
##   Species  term   estimate std.error statistic  p.value
##   <fct>    <chr>     <dbl>     <dbl>     <dbl>    <dbl>
## 1 setosa   (Inte~    0.803    0.344      2.34  2.38e- 2
## 2 setosa   Sepal~    0.132    0.0685     1.92  6.07e- 2
## 3 versico~ (Inte~    0.185    0.514      0.360 7.20e- 1
## 4 versico~ Sepal~    0.686    0.0863     7.95  2.59e-10
## 5 virgini~ (Inte~    0.610    0.417      1.46  1.50e- 1
## 6 virgini~ Sepal~    0.750    0.0630    11.9   6.30e-16

map() 列表进、列表出 df %>%
group_split() %>%
map()
list
map_df() 列表进、数据框出 df %>%
group_split() %>%
map_df()
df
group_map() 数据框进、列表出 df %>%
group_by() %>%
group_map()

group_modify() 数据框进、数据框出 df %>%
group_by() %>%
group_modify()

walk 列表进 df %>%
group_split() %>%
walk()
side effects
group_walk() 数据框进 df %>%
group_by() %>%
group_walk()
side effects

nobel_winners %>%
dplyr::group_split(category) %>%
purrr::map(
~ ggplot(data = .x, aes(x = prize_age)) +
geom_density() +
ggtitle(.x$category) ) nobel_winners %>% dplyr::group_by(category) %>% dplyr::group_map( ~ ggplot(data = .x, aes(x = prize_age)) + geom_density() + ggtitle(.y) ) nobel_winners %>% dplyr::group_by(category) %>% dplyr::group_walk( ~ ggsave( paste0(.y, ".png"), ggplot(data = .x, aes(x = prize_age)) + geom_density() + ggtitle(.y), device = "png", path = temp ) ) %>% invisible() ### 21.4.2 其他group函数 group_nest(), group_data(), group_keys(), group_rows() ## 21.5 列名清理 数据框的列名，不要用有空格和中文。 如果拿到的原始数据中列比较多，手动修改麻烦，可以使用janitor::clean_names()函数 library(readxl) library(janitor) # install.packages("janitor") roster_raw <- read_excel(here::here("demo_data", "dirty_data.xlsx")) glimpse(roster_raw) ## Rows: 13 ## Columns: 11 ##$ First Name        <chr> "Jason", "Jason", "Ali...
## $Last Name <chr> "Bourne", "Bourne", "K... ##$ Employee Status   <chr> "Teacher", "Teacher", ...
## $Subject <chr> "PE", "Drafting", "Mus... ##$ Hire Date         <dbl> 39690, 39690, 37118, 2...
## $% Allocated <dbl> 0.75, 0.25, 1.00, 1.00... ##$ Full time?        <chr> "Yes", "Yes", "Yes", "...
## $do not edit! ---> <lgl> NA, NA, NA, NA, NA, NA... ##$ Certification...9   <chr> "Physical ed", "Physic...
## $Certification...10 <chr> "Theater", "Theater", ... ##$ Certification...11  <lgl> NA, NA, NA, NA, NA, NA...
roster <- roster_raw %>%
janitor::clean_names()

glimpse(roster)
## Rows: 13
## Columns: 11
## $first_name <chr> "Jason", "Jason", "Alici... ##$ last_name         <chr> "Bourne", "Bourne", "Key...
## $employee_status <chr> "Teacher", "Teacher", "T... ##$ subject           <chr> "PE", "Drafting", "Music...
## $hire_date <dbl> 39690, 39690, 37118, 275... ##$ percent_allocated <dbl> 0.75, 0.25, 1.00, 1.00, ...
## $full_time <chr> "Yes", "Yes", "Yes", "Ye... ##$ do_not_edit       <lgl> NA, NA, NA, NA, NA, NA, ...
## $certification_9 <chr> "Physical ed", "Physical... ##$ certification_10  <chr> "Theater", "Theater", "V...
## $certification_11 <lgl> NA, NA, NA, NA, NA, NA, ... ## 21.6 缺失值检查与处理 ### 21.6.1 purrr & dplyr 技巧 library(purrr) airquality <- as_tibble(airquality) airquality %>% purrr::map(~ sum(is.na(.))) ##$Ozone
## [1] 37
##
## $Solar.R ## [1] 7 ## ##$Wind
## [1] 0
##
## $Temp ## [1] 0 ## ##$Month
## [1] 0
##
## \$Day
## [1] 0
airquality %>%
purrr::map_df(~ sum(is.na(.)))
## # A tibble: 1 x 6
##   Ozone Solar.R  Wind  Temp Month   Day
##   <int>   <int> <int> <int> <int> <int>
## 1    37       7     0     0     0     0
airquality %>%
dplyr::summarise_at(2:3, ~ sum(is.na(.)))
## # A tibble: 1 x 2
##   Solar.R  Wind
##     <int> <int>
## 1       7     0

### 21.6.2 缺失值替换

airquality %>%
mutate_all(funs(replace(., is.na(.), 0)))
## # A tibble: 153 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ... with 143 more rows
airquality %>%
mutate_all(replace_na, replace = 0)
## # A tibble: 153 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ... with 143 more rows
airquality %>%
mutate_if(is.numeric, replace_na, replace = 0)
## # A tibble: 153 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ... with 143 more rows
airquality %>%
mutate_all(as.numeric) %>%
mutate_all(~ coalesce(., 0))
## # A tibble: 153 x 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ... with 143 more rows
tibble(
y = c(1, 2, NA, NA, 5),
z = c(NA, NA, 3, 4, 5)
) %>%
mutate_all(~ coalesce(., 0))
## # A tibble: 5 x 2
##       y     z
##   <dbl> <dbl>
## 1     1     0
## 2     2     0
## 3     0     3
## 4     0     4
## 5     5     5

## 21.7 标准化

df_mtcars
## # A tibble: 32 x 12
##    rowname   mpg cyl    disp    hp  drat    wt  qsec
##    <chr>   <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 1        21   6      160    110  3.9   2.62  16.5
##  2 2        21   6      160    110  3.9   2.88  17.0
##  3 3        22.8 4      108     93  3.85  2.32  18.6
##  4 4        21.4 6      258    110  3.08  3.22  19.4
##  5 5        18.7 8      360    175  3.15  3.44  17.0
##  6 6        18.1 6      225    105  2.76  3.46  20.2
##  7 7        14.3 8      360    245  3.21  3.57  15.8
##  8 8        24.4 4      147.    62  3.69  3.19  20
##  9 9        22.8 4      141.    95  3.92  3.15  22.9
## 10 10       19.2 6      168.   123  3.92  3.44  18.3
## # ... with 22 more rows, and 4 more variables:
## #   vs <fct>, am <fct>, gear <fct>, carb <fct>
df_mtcars %>% select_if(funs(is.numeric))
## # A tibble: 32 x 6
##      mpg  disp    hp  drat    wt  qsec
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21    160    110  3.9   2.62  16.5
##  2  21    160    110  3.9   2.88  17.0
##  3  22.8  108     93  3.85  2.32  18.6
##  4  21.4  258    110  3.08  3.22  19.4
##  5  18.7  360    175  3.15  3.44  17.0
##  6  18.1  225    105  2.76  3.46  20.2
##  7  14.3  360    245  3.21  3.57  15.8
##  8  24.4  147.    62  3.69  3.19  20
##  9  22.8  141.    95  3.92  3.15  22.9
## 10  19.2  168.   123  3.92  3.44  18.3
## # ... with 22 more rows
# way 1
df_mtcars %>%
mutate_at(vars(mpg, disp), ~ scale(., center = T, scale = T))
## # A tibble: 32 x 12
##    rowname mpg[,1] cyl   disp[,1]    hp  drat    wt
##    <chr>     <dbl> <fct>    <dbl> <dbl> <dbl> <dbl>
##  1 1         0.151 6      -0.571    110  3.9   2.62
##  2 2         0.151 6      -0.571    110  3.9   2.88
##  3 3         0.450 4      -0.990     93  3.85  2.32
##  4 4         0.217 6       0.220    110  3.08  3.22
##  5 5        -0.231 8       1.04     175  3.15  3.44
##  6 6        -0.330 6      -0.0462   105  2.76  3.46
##  7 7        -0.961 8       1.04     245  3.21  3.57
##  8 8         0.715 4      -0.678     62  3.69  3.19
##  9 9         0.450 4      -0.726     95  3.92  3.15
## 10 10       -0.148 6      -0.509    123  3.92  3.44
## # ... with 22 more rows, and 5 more variables:
## #   qsec <dbl>, vs <fct>, am <fct>, gear <fct>,
## #   carb <fct>
# way 2
df_mtcars %>%
mutate_at(vars(mpg, disp), funs((. - mean(.)) / sd(.)))
## # A tibble: 32 x 12
##    rowname    mpg cyl      disp    hp  drat    wt  qsec
##    <chr>    <dbl> <fct>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 1        0.151 6     -0.571    110  3.9   2.62  16.5
##  2 2        0.151 6     -0.571    110  3.9   2.88  17.0
##  3 3        0.450 4     -0.990     93  3.85  2.32  18.6
##  4 4        0.217 6      0.220    110  3.08  3.22  19.4
##  5 5       -0.231 8      1.04     175  3.15  3.44  17.0
##  6 6       -0.330 6     -0.0462   105  2.76  3.46  20.2
##  7 7       -0.961 8      1.04     245  3.21  3.57  15.8
##  8 8        0.715 4     -0.678     62  3.69  3.19  20
##  9 9        0.450 4     -0.726     95  3.92  3.15  22.9
## 10 10      -0.148 6     -0.509    123  3.92  3.44  18.3
## # ... with 22 more rows, and 4 more variables:
## #   vs <fct>, am <fct>, gear <fct>, carb <fct>
# way 3
func <- function(x) (x - min(x)) / (max(x) - min(x))
df_mtcars %>%
mutate_at(vars(mpg, disp), ~ func(.))
## # A tibble: 32 x 12
##    rowname   mpg cyl     disp    hp  drat    wt  qsec
##    <chr>   <dbl> <fct>  <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 1       0.451 6     0.222    110  3.9   2.62  16.5
##  2 2       0.451 6     0.222    110  3.9   2.88  17.0
##  3 3       0.528 4     0.0920    93  3.85  2.32  18.6
##  4 4       0.468 6     0.466    110  3.08  3.22  19.4
##  5 5       0.353 8     0.721    175  3.15  3.44  17.0
##  6 6       0.328 6     0.384    105  2.76  3.46  20.2
##  7 7       0.166 8     0.721    245  3.21  3.57  15.8
##  8 8       0.596 4     0.189     62  3.69  3.19  20
##  9 9       0.528 4     0.174     95  3.92  3.15  22.9
## 10 10      0.374 6     0.241    123  3.92  3.44  18.3
## # ... with 22 more rows, and 4 more variables:
## #   vs <fct>, am <fct>, gear <fct>, carb <fct>

func <- function(x) (x - min(x)) / (max(x) - min(x))

df_mtcars %>% mutate_all(~ func(.))
## Error: Problem with mutate() input rowname.
## x 二进列运算符中有非数值参数
## i Input rowname is (structure(function (..., .x = ..1, .y = ..2, . = ..1) ....
• 但这里数据中还有其他类型（fct, chr），所以这里 mutate_all() 会报错。
• 这种情形，用mutate_if()
func <- function(x) (x - min(x)) / (max(x) - min(x))

df_mtcars %>% mutate_if(is.numeric, ~ func(.))
## # A tibble: 32 x 12
##    rowname   mpg cyl     disp     hp  drat    wt  qsec
##    <chr>   <dbl> <fct>  <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 1       0.451 6     0.222  0.205  0.525 0.283 0.233
##  2 2       0.451 6     0.222  0.205  0.525 0.348 0.3
##  3 3       0.528 4     0.0920 0.145  0.502 0.206 0.489
##  4 4       0.468 6     0.466  0.205  0.147 0.435 0.588
##  5 5       0.353 8     0.721  0.435  0.180 0.493 0.3
##  6 6       0.328 6     0.384  0.187  0     0.498 0.681
##  7 7       0.166 8     0.721  0.682  0.207 0.526 0.160
##  8 8       0.596 4     0.189  0.0353 0.429 0.429 0.655
##  9 9       0.528 4     0.174  0.152  0.535 0.419 1
## 10 10      0.374 6     0.241  0.251  0.535 0.493 0.452
## # ... with 22 more rows, and 4 more variables:
## #   vs <fct>, am <fct>, gear <fct>, carb <fct>
funs <- list(
centered = mean, # Function object
scaled = ~ . - mean(.) / sd(.) # Purrr-style lambda
)

iris %>%
mutate_if(is.numeric, funs)
## # A tibble: 150 x 13
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 140 more rows, and 9 more variables:
## #   Species <fct>, Sepal.Length_centered <dbl>,
## #   Sepal.Width_centered <dbl>,
## #   Petal.Length_centered <dbl>,
## #   Petal.Width_centered <dbl>,
## #   Sepal.Length_scaled <dbl>,
## #   Sepal.Width_scaled <dbl>,
## #   Petal.Length_scaled <dbl>,
## #   Petal.Width_scaled <dbl>

## 21.8 across函数

• colwise: group_by() %>% summarise/mutate + across()
• rowwise: rowwise()/nest_by() %>% summarise/mutate + c_across()

iris %>%
dplyr::group_by(Species) %>%
dplyr::summarise(
across(starts_with("Sepal"), mean),
Area = mean(Petal.Length * Petal.Width),
across(starts_with("Petal"), min)
)

### 21.8.1 across函数替代scope函数

df %>% mutate_if(is.numeric, mean, na.rm = TRUE)
# ->
df %>% mutate(across(is.numeric, mean, na.rm = TRUE))

df %>% mutate_at(vars(x, starts_with("y")), mean, na.rm = TRUE)
# ->
df %>% mutate(across(c(x, starts_with("y")), mean, na.rm = TRUE))

df %>% mutate_all(mean, na.rm = TRUE)
# ->
df %>% mutate(across(everything(), mean, na.rm = TRUE))

### 21.8.2 更方便的colwise操作

# multiple
df <- tibble(x = 1:3, y = 3:5, z = 5:7)
mult <- list(x = 1, y = 10, z = 100)

df %>% mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]]))

# weights
df <- tibble(x = 1:3, y = 3:5, z = 5:7)
df
weights <- list(x = 0.2, y = 0.3, z = 0.5)

df %>% dplyr::mutate(
across(all_of(names(weights)),
list(wt = ~ .x * weights[[cur_column()]]),
.names = "{col}.{fn}"
)
)

# cutoffs
df <- tibble(x = 1:3, y = 3:5, z = 5:7)
df

cutoffs <- list(x = 2, y = 3, z = 7)

df %>% dplyr::mutate(
across(all_of(names(cutoffs)), ~ if_else(.x > cutoffs[[cur_column()]], 1, 0))
)