第 37 章 tidyverse进阶

让我们继续聊聊,相见恨晚的tidyverse

37.1 scoped 函数

在第 12 章介绍了dplyr的一些函数(mutate(), select()等等),事实上,这些函数加上后缀 _all, _at, _if,形成三组变体函数,可以方便对特定的子集进行操作。比如

  • 对数据框所有列操作,可以用_all
  • 对数据框指定的几列操作,可以用_at
  • 对数据框符合条件的几列进行操作,可以用_if

下面选取其中几个函数加以说明

37.1.1 mutate_if

iris <- iris %>% as_tibble() 

df_iris <- iris %>% head(5)
df_iris %>% mutate_if(is.double, as.integer)
## # A tibble: 5 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <int>       <int>        <int>       <int> <fct>  
## 1            5           3            1           0 setosa 
## 2            4           3            1           0 setosa 
## 3            4           3            1           0 setosa 
## 4            4           3            1           0 setosa 
## 5            5           3            1           0 setosa

可以一次性增加多列

df_iris %>% mutate_if(is.numeric, list(scale, log))
## # A tibble: 5 × 13
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Length_fn1[,1]
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>                  <dbl>
## 1          5.1         3.5          1.4         0.2 setosa                 1.16 
## 2          4.9         3            1.4         0.2 setosa                 0.193
## 3          4.7         3.2          1.3         0.2 setosa                -0.772
## 4          4.6         3.1          1.5         0.2 setosa                -1.25 
## 5          5           3.6          1.4         0.2 setosa                 0.675
## # ℹ 7 more variables: Sepal.Width_fn1 <dbl[,1]>, Petal.Length_fn1 <dbl[,1]>,
## #   Petal.Width_fn1 <dbl[,1]>, Sepal.Length_fn2 <dbl>, Sepal.Width_fn2 <dbl>,
## #   Petal.Length_fn2 <dbl>, Petal.Width_fn2 <dbl>

也可以把函数放在list()中,用 Purrr-style lambda 形式写出

df_iris %>% mutate_if(is.numeric, list(~ scale(.), ~ log(.)))
## # A tibble: 5 × 13
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa 
## 4          4.6         3.1          1.5         0.2 setosa 
## 5          5           3.6          1.4         0.2 setosa 
## # ℹ 8 more variables: Sepal.Length_scale <dbl[,1]>,
## #   Sepal.Width_scale <dbl[,1]>, Petal.Length_scale <dbl[,1]>,
## #   Petal.Width_scale <dbl[,1]>, Sepal.Length_log <dbl>, Sepal.Width_log <dbl>,
## #   Petal.Length_log <dbl>, Petal.Width_log <dbl>

37.1.2 select_if()

df <- tibble::tibble(
  x = letters[1:3],
  y = c(1:3),
  z = c(0, 0, 0)
)
df
## # A tibble: 3 × 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
df %>% select_if(is.numeric)
## # A tibble: 3 × 2
##       y     z
##   <int> <dbl>
## 1     1     0
## 2     2     0
## 3     3     0
df %>% select_if(~ n_distinct(.) > 2)
## # A tibble: 3 × 2
##   x         y
##   <chr> <int>
## 1 a         1
## 2 b         2
## 3 c         3

select_if 多个条件的情况

## # A tibble: 3 × 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
## # A tibble: 3 × 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
to_keep <- function(x) is.numeric(x) | is.character(x)
df %>% select_if(to_keep)
## # A tibble: 3 × 3
##   x         y     z
##   <chr> <int> <dbl>
## 1 a         1     0
## 2 b         2     0
## 3 c         3     0
df %>% select_if(
  list(~ (is.numeric(.) && sum(.) > 2))
)
## # A tibble: 3 × 1
##       y
##   <int>
## 1     1
## 2     2
## 3     3
df %>% select_if(
  list(~ (is.numeric(.) && mean(.) > 1))
)
## # A tibble: 3 × 1
##       y
##   <int>
## 1     1
## 2     2
## 3     3

我们也可以写成函数的形式

to_want <- function(x) is.numeric(x) && sum(x) > 3

df %>% select_if(to_want)
## # A tibble: 3 × 1
##       y
##   <int>
## 1     1
## 2     2
## 3     3

37.2 summarise_if

msleep <- ggplot2::msleep
msleep %>%
  dplyr::group_by(vore) %>%
  dplyr::summarise_all(~ mean(., na.rm = TRUE))
## # A tibble: 5 × 11
##   vore     name genus order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <dbl> <dbl> <dbl>        <dbl>       <dbl>     <dbl>       <dbl> <dbl>
## 1 carni      NA    NA    NA           NA       10.4       2.29       0.373 13.6 
## 2 herbi      NA    NA    NA           NA        9.51      1.37       0.418 14.5 
## 3 insecti    NA    NA    NA           NA       14.9       3.52       0.161  9.06
## 4 omni       NA    NA    NA           NA       10.9       1.96       0.592 13.1 
## 5 <NA>       NA    NA    NA           NA       10.2       1.88       0.183 13.8 
## # ℹ 2 more variables: brainwt <dbl>, bodywt <dbl>
msleep <- ggplot2::msleep
msleep %>%
  dplyr::group_by(vore) %>%
  # summarise_if(is.numeric, ~mean(., na.rm = TRUE))
  dplyr::summarise_if(is.numeric, mean, na.rm = TRUE)
## # A tibble: 5 × 7
##   vore    sleep_total sleep_rem sleep_cycle awake brainwt  bodywt
##   <chr>         <dbl>     <dbl>       <dbl> <dbl>   <dbl>   <dbl>
## 1 carni         10.4       2.29       0.373 13.6  0.0793   90.8  
## 2 herbi          9.51      1.37       0.418 14.5  0.622   367.   
## 3 insecti       14.9       3.52       0.161  9.06 0.0216   12.9  
## 4 omni          10.9       1.96       0.592 13.1  0.146    12.7  
## 5 <NA>          10.2       1.88       0.183 13.8  0.00763   0.858

37.3 filter_if()

事实上,filter已经很强大了,有了scoped函数,就如虎添翼了

msleep <- ggplot2::msleep
msleep %>%
  dplyr::select(name, sleep_total) %>%
  dplyr::filter(sleep_total > 18)
## # A tibble: 4 × 2
##   name                 sleep_total
##   <chr>                      <dbl>
## 1 Big brown bat               19.7
## 2 Thick-tailed opposum        19.4
## 3 Little brown bat            19.9
## 4 Giant armadillo             18.1
msleep %>%
  dplyr::select(name, sleep_total) %>%
  dplyr::filter(between(sleep_total, 16, 18))
## # A tibble: 4 × 2
##   name                   sleep_total
##   <chr>                        <dbl>
## 1 Owl monkey                    17  
## 2 Long-nosed armadillo          17.4
## 3 North American Opossum        18  
## 4 Arctic ground squirrel        16.6
msleep %>%
  dplyr::select(name, sleep_total) %>%
  # filter(near(sleep_total, 17,  tol=sd(sleep_total)))
  dplyr::filter(near(sleep_total, mean(sleep_total), tol = 0.5 * sd(sleep_total)))
## # A tibble: 35 × 2
##    name                      sleep_total
##    <chr>                           <dbl>
##  1 Cheetah                          12.1
##  2 Northern fur seal                 8.7
##  3 Dog                              10.1
##  4 Guinea pig                        9.4
##  5 Grivet                           10  
##  6 Chinchilla                       12.5
##  7 Star-nosed mole                  10.3
##  8 African giant pouched rat         8.3
##  9 Lesser short-tailed shrew         9.1
## 10 European hedgehog                10.1
## # ℹ 25 more rows

mtcars是 R内置数据集,记录了32种不同品牌的轿车的的11个属性

mtcars <- mtcars %>% as_tibble()
mtcars
## # A tibble: 32 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # ℹ 22 more rows

filter_if()配合all_vars(), any_vars()函数,可以完成很酷的工作. 比如,要求一行中所有变量的值都大于150

mtcars %>% filter_all(all_vars(. > 150))
## # A tibble: 0 × 11
## # ℹ 11 variables: mpg <dbl>, cyl <dbl>, disp <dbl>, hp <dbl>, drat <dbl>,
## #   wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>, carb <dbl>

比如,要求一行中至少有一个变量的值都大于150

# Or the union:
mtcars %>% filter_all(any_vars(. > 150))
## # A tibble: 21 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  4  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  5  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  6  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  7  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
##  8  17.8     6  168.   123  3.92  3.44  18.9     1     0     4     4
##  9  16.4     8  276.   180  3.07  4.07  17.4     0     0     3     3
## 10  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3
## # ℹ 11 more rows
# You can vary the selection of columns on which to apply the predicate.
# filter_at() takes a vars() specification:
mtcars %>% filter_at(vars(starts_with("d")), any_vars((. %% 2) == 0))
## # A tibble: 13 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
##  2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
##  6  14.3     8   360   245  3.21  3.57  15.8     0     0     3     4
##  7  10.4     8   472   205  2.93  5.25  18.0     0     0     3     4
##  8  10.4     8   460   215  3     5.42  17.8     0     0     3     4
##  9  14.7     8   440   230  3.23  5.34  17.4     0     0     3     4
## 10  15.5     8   318   150  2.76  3.52  16.9     0     0     3     2
## 11  15.2     8   304   150  3.15  3.44  17.3     0     0     3     2
## 12  13.3     8   350   245  3.73  3.84  15.4     0     0     3     4
## 13  19.2     8   400   175  3.08  3.84  17.0     0     0     3     2

filter_if(.tbl, .predicate, .vars_predicate) 相对复杂点,我这里多说几句。

filter_if() 有三个参数:

  • .tbl, 数据框
  • .predicate, 应用在列上的函数,一般作为列的选择条件
  • .vars_predicate, 应用在一行上的函数,通过 all_vars(), any_vars()返回值决定是否选取该行。
# And filter_if() selects variables with a predicate function:
# filter_if(.tbl, .predicate, .vars_predicate)
# mtcars %>% map_df(~ all(floor(.) == .) )
# mtcars %>% select_if( ~ all(floor(.) == .) )

mtcars %>% filter_if(~ all(floor(.) == .), all_vars(. != 0))
## # A tibble: 7 × 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  22.8     4 108      93  3.85  2.32  18.6     1     1     4     1
## 2  32.4     4  78.7    66  4.08  2.2   19.5     1     1     4     1
## 3  30.4     4  75.7    52  4.93  1.62  18.5     1     1     4     2
## 4  33.9     4  71.1    65  4.22  1.84  19.9     1     1     4     1
## 5  27.3     4  79      66  4.08  1.94  18.9     1     1     4     1
## 6  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2
## 7  21.4     4 121     109  4.11  2.78  18.6     1     1     4     2

所以这里是,先通过.predicate = ~ all(floor(.) == .) 选取变量值为整数的列,然后再看选取的这些列的行方向,如果每一行的值.vars_predicate = all_vars(. != 0) ,都不为0,就保留下来,否则过滤掉。

简单点说,这段代码的意思,数值全部为整数的列,不能同时为0

37.4 group_by

group_by() 用的很多,所以要多讲讲

mtcars %>% dplyr::group_by(cyl)
## # A tibble: 32 × 11
## # Groups:   cyl [3]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # ℹ 22 more rows
mtcars %>% group_by_at(vars(cyl))
## # A tibble: 32 × 11
## # Groups:   cyl [3]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # ℹ 22 more rows
# Group a data frame by all variables:
mtcars %>% group_by_all()
## # A tibble: 32 × 11
## # Groups:   mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb [32]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # ℹ 22 more rows
# Group by variables selected with a predicate:
iris %>% group_by_if(is.factor)
## # A tibble: 150 × 5
## # Groups:   Species [3]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ℹ 140 more rows

37.4.1 group_split(), group_map(), group_modify()

iris %>%
  dplyr::group_by(Species) %>%
  dplyr::group_split()
## <list_of<
##   tbl_df<
##     Sepal.Length: double
##     Sepal.Width : double
##     Petal.Length: double
##     Petal.Width : double
##     Species     : factor<fb977>
##   >
## >[3]>
## [[1]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ℹ 40 more rows
## 
## [[2]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>     
##  1          7           3.2          4.7         1.4 versicolor
##  2          6.4         3.2          4.5         1.5 versicolor
##  3          6.9         3.1          4.9         1.5 versicolor
##  4          5.5         2.3          4           1.3 versicolor
##  5          6.5         2.8          4.6         1.5 versicolor
##  6          5.7         2.8          4.5         1.3 versicolor
##  7          6.3         3.3          4.7         1.6 versicolor
##  8          4.9         2.4          3.3         1   versicolor
##  9          6.6         2.9          4.6         1.3 versicolor
## 10          5.2         2.7          3.9         1.4 versicolor
## # ℹ 40 more rows
## 
## [[3]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
##  1          6.3         3.3          6           2.5 virginica
##  2          5.8         2.7          5.1         1.9 virginica
##  3          7.1         3            5.9         2.1 virginica
##  4          6.3         2.9          5.6         1.8 virginica
##  5          6.5         3            5.8         2.2 virginica
##  6          7.6         3            6.6         2.1 virginica
##  7          4.9         2.5          4.5         1.7 virginica
##  8          7.3         2.9          6.3         1.8 virginica
##  9          6.7         2.5          5.8         1.8 virginica
## 10          7.2         3.6          6.1         2.5 virginica
## # ℹ 40 more rows

简单点写,就是

iris %>%
  dplyr::group_split(Species)
## <list_of<
##   tbl_df<
##     Sepal.Length: double
##     Sepal.Width : double
##     Petal.Length: double
##     Petal.Width : double
##     Species     : factor<fb977>
##   >
## >[3]>
## [[1]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ℹ 40 more rows
## 
## [[2]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>     
##  1          7           3.2          4.7         1.4 versicolor
##  2          6.4         3.2          4.5         1.5 versicolor
##  3          6.9         3.1          4.9         1.5 versicolor
##  4          5.5         2.3          4           1.3 versicolor
##  5          6.5         2.8          4.6         1.5 versicolor
##  6          5.7         2.8          4.5         1.3 versicolor
##  7          6.3         3.3          4.7         1.6 versicolor
##  8          4.9         2.4          3.3         1   versicolor
##  9          6.6         2.9          4.6         1.3 versicolor
## 10          5.2         2.7          3.9         1.4 versicolor
## # ℹ 40 more rows
## 
## [[3]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
##  1          6.3         3.3          6           2.5 virginica
##  2          5.8         2.7          5.1         1.9 virginica
##  3          7.1         3            5.9         2.1 virginica
##  4          6.3         2.9          5.6         1.8 virginica
##  5          6.5         3            5.8         2.2 virginica
##  6          7.6         3            6.6         2.1 virginica
##  7          4.9         2.5          4.5         1.7 virginica
##  8          7.3         2.9          6.3         1.8 virginica
##  9          6.7         2.5          5.8         1.8 virginica
## 10          7.2         3.6          6.1         2.5 virginica
## # ℹ 40 more rows

如果使用group_split(), 注意分组后,返回的是列表

iris %>%
  dplyr::group_split(Species)
## <list_of<
##   tbl_df<
##     Sepal.Length: double
##     Sepal.Width : double
##     Petal.Length: double
##     Petal.Width : double
##     Species     : factor<fb977>
##   >
## >[3]>
## [[1]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ℹ 40 more rows
## 
## [[2]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>     
##  1          7           3.2          4.7         1.4 versicolor
##  2          6.4         3.2          4.5         1.5 versicolor
##  3          6.9         3.1          4.9         1.5 versicolor
##  4          5.5         2.3          4           1.3 versicolor
##  5          6.5         2.8          4.6         1.5 versicolor
##  6          5.7         2.8          4.5         1.3 versicolor
##  7          6.3         3.3          4.7         1.6 versicolor
##  8          4.9         2.4          3.3         1   versicolor
##  9          6.6         2.9          4.6         1.3 versicolor
## 10          5.2         2.7          3.9         1.4 versicolor
## # ℹ 40 more rows
## 
## [[3]]
## # A tibble: 50 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
##  1          6.3         3.3          6           2.5 virginica
##  2          5.8         2.7          5.1         1.9 virginica
##  3          7.1         3            5.9         2.1 virginica
##  4          6.3         2.9          5.6         1.8 virginica
##  5          6.5         3            5.8         2.2 virginica
##  6          7.6         3            6.6         2.1 virginica
##  7          4.9         2.5          4.5         1.7 virginica
##  8          7.3         2.9          6.3         1.8 virginica
##  9          6.7         2.5          5.8         1.8 virginica
## 10          7.2         3.6          6.1         2.5 virginica
## # ℹ 40 more rows

既然是列表,当然想到用前面讲到的purrr::map()家族

iris %>%
  dplyr::group_split(Species) %>%
  purrr::map(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## [[1]]
## # A tibble: 2 × 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)     0.803    0.344       2.34  0.0238
## 2 Sepal.Length    0.132    0.0685      1.92  0.0607
## 
## [[2]]
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.185    0.514      0.360 7.20e- 1
## 2 Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 
## [[3]]
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.610    0.417       1.46 1.50e- 1
## 2 Sepal.Length    0.750    0.0630     11.9  6.30e-16
iris %>%
  dplyr::group_split(Species) %>%
  purrr::map_df(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## # A tibble: 6 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.803    0.344      2.34  2.38e- 2
## 2 Sepal.Length    0.132    0.0685     1.92  6.07e- 2
## 3 (Intercept)     0.185    0.514      0.360 7.20e- 1
## 4 Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 5 (Intercept)     0.610    0.417      1.46  1.50e- 1
## 6 Sepal.Length    0.750    0.0630    11.9   6.30e-16

上面这个代码,数据框分割成list, 处理完后再合并成数据框,难道不觉得折腾么? 为什么直接点? tidyverse不会让我们失望的,先看看group_map()

iris %>%
  dplyr::group_by(Species) %>%
  dplyr::group_map(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## [[1]]
## # A tibble: 2 × 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)     0.803    0.344       2.34  0.0238
## 2 Sepal.Length    0.132    0.0685      1.92  0.0607
## 
## [[2]]
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.185    0.514      0.360 7.20e- 1
## 2 Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 
## [[3]]
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     0.610    0.417       1.46 1.50e- 1
## 2 Sepal.Length    0.750    0.0630     11.9  6.30e-16
  • group_map()要求 The result of .f should be a data frame(.f 必须返回数据框)
  • group_map() return a list of tibble(返回元素均为df的一个列表list(df1,df2,df3))

数据框进来,然后分组,依次处理成一个个数据框,最后以列表形式(a list of tibble)输出。

事实上,group_map()是返回list形式,也就是说,可以是返回任何形式,(a list of tibble)是其中特殊形式。 可以看看下面这个

iris %>%
  dplyr::group_by(Species) %>%
  dplyr::group_map(
    ~ lm(Petal.Length ~ Sepal.Length, data = .x)
  )
## [[1]]
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
## 
## Coefficients:
##  (Intercept)  Sepal.Length  
##       0.8031        0.1316  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
## 
## Coefficients:
##  (Intercept)  Sepal.Length  
##       0.1851        0.6865  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
## 
## Coefficients:
##  (Intercept)  Sepal.Length  
##       0.6105        0.7501

group_modify() 才是真正意义上的”数据框进、数据框出”。

iris %>%
  dplyr::group_by(Species) %>%
  dplyr::group_modify(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
## # A tibble: 6 × 6
## # Groups:   Species [3]
##   Species    term         estimate std.error statistic  p.value
##   <fct>      <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 setosa     (Intercept)     0.803    0.344      2.34  2.38e- 2
## 2 setosa     Sepal.Length    0.132    0.0685     1.92  6.07e- 2
## 3 versicolor (Intercept)     0.185    0.514      0.360 7.20e- 1
## 4 versicolor Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 5 virginica  (Intercept)     0.610    0.417      1.46  1.50e- 1
## 6 virginica  Sepal.Length    0.750    0.0630    11.9   6.30e-16

为了大家方便查阅和记忆,我总结下表

函数 说明 常用组合 返回值 要求
map() 列表进、列表出 df %>%
group_split() %>%
map()
list
map_df() 列表进、数据框出 df %>%
group_split() %>%
map_df()
df
group_map() 数据框进、列表出 df %>%
group_by() %>%
group_map()
返回list(list1, list2, …)
特例list(df1, df2, …)
group_modify() 数据框进、数据框出 df %>%
group_by() %>%
group_modify()
返回grouped tibble .f返回df
walk 列表进 df %>%
group_split() %>%
walk()
side effects
group_walk() 数据框进 df %>%
group_by() %>%
group_walk()
side effects

我常用的批量出图的语句

nobel_winners %>%
  dplyr::group_split(category) %>%
  purrr::map(
    ~ ggplot(data = .x, aes(x = prize_age)) +
      geom_density() +
      ggtitle(.x$category)
  )
nobel_winners %>%
  dplyr::group_by(category) %>%
  dplyr::group_map(
    ~ ggplot(data = .x, aes(x = prize_age)) +
      geom_density() +
      ggtitle(.y)
  )
nobel_winners %>%
  dplyr::group_by(category) %>%
  dplyr::group_walk(
    ~ ggsave(
      paste0(.y, ".png"),
      ggplot(data = .x, aes(x = prize_age)) +
        geom_density() +
        ggtitle(.y),
      device = "png",
      path = temp
    )
  ) %>%
  invisible()

37.4.2 其他group函数

group_nest(), group_data(), group_keys(), group_rows()

37.5 列名清理

数据框的列名,不要用有空格和中文。 如果拿到的原始数据中列比较多,手动修改麻烦,可以使用janitor::clean_names()函数

library(readxl)
library(janitor) # install.packages("janitor")

roster_raw <- read_excel(here::here("demo_data", "dirty_data.xlsx"))

glimpse(roster_raw)
## Rows: 13
## Columns: 11
## $ `First Name`        <chr> "Jason", "Jason", "Alicia", "Ada", "Desus", "Chien…
## $ `Last Name`         <chr> "Bourne", "Bourne", "Keys", "Lovelace", "Nice", "W…
## $ `Employee Status`   <chr> "Teacher", "Teacher", "Teacher", "Teacher", "Admin…
## $ Subject             <chr> "PE", "Drafting", "Music", NA, "Dean", "Physics", …
## $ `Hire Date`         <dbl> 39690, 39690, 37118, 27515, 41431, 11037, 11037, N…
## $ `% Allocated`       <dbl> 0.75, 0.25, 1.00, 1.00, 1.00, 0.50, 0.50, NA, 0.50…
## $ `Full time?`        <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", N…
## $ `do not edit! --->` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ Certification...9   <chr> "Physical ed", "Physical ed", "Instr. music", "PEN…
## $ Certification...10  <chr> "Theater", "Theater", "Vocal music", "Computers", …
## $ Certification...11  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
roster <- roster_raw %>%
  janitor::clean_names()

glimpse(roster)
## Rows: 13
## Columns: 11
## $ first_name        <chr> "Jason", "Jason", "Alicia", "Ada", "Desus", "Chien-S…
## $ last_name         <chr> "Bourne", "Bourne", "Keys", "Lovelace", "Nice", "Wu"…
## $ employee_status   <chr> "Teacher", "Teacher", "Teacher", "Teacher", "Adminis…
## $ subject           <chr> "PE", "Drafting", "Music", NA, "Dean", "Physics", "C…
## $ hire_date         <dbl> 39690, 39690, 37118, 27515, 41431, 11037, 11037, NA,…
## $ percent_allocated <dbl> 0.75, 0.25, 1.00, 1.00, 1.00, 0.50, 0.50, NA, 0.50, …
## $ full_time         <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", NA,…
## $ do_not_edit       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ certification_9   <chr> "Physical ed", "Physical ed", "Instr. music", "PENDI…
## $ certification_10  <chr> "Theater", "Theater", "Vocal music", "Computers", NA…
## $ certification_11  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA

37.6 缺失值检查与处理

37.6.1 purrr & dplyr 技巧

library(purrr)
airquality <- as_tibble(airquality)

airquality %>% purrr::map(~ sum(is.na(.)))
## $Ozone
## [1] 37
## 
## $Solar.R
## [1] 7
## 
## $Wind
## [1] 0
## 
## $Temp
## [1] 0
## 
## $Month
## [1] 0
## 
## $Day
## [1] 0
airquality %>%
  purrr::map_df(~ sum(is.na(.)))
## # A tibble: 1 × 6
##   Ozone Solar.R  Wind  Temp Month   Day
##   <int>   <int> <int> <int> <int> <int>
## 1    37       7     0     0     0     0
airquality %>%
  dplyr::summarise_at(2:3, ~ sum(is.na(.)))
## # A tibble: 1 × 2
##   Solar.R  Wind
##     <int> <int>
## 1       7     0

37.6.2 缺失值替换

airquality %>%
  mutate_all(funs(replace(., is.na(.), 0)))
## # A tibble: 153 × 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ℹ 143 more rows
airquality %>%
  mutate_all(replace_na, replace = 0)
## # A tibble: 153 × 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ℹ 143 more rows
airquality %>%
  mutate_if(is.numeric, replace_na, replace = 0)
## # A tibble: 153 × 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <int>   <int> <dbl> <int> <int> <int>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ℹ 143 more rows
airquality %>%
  mutate_all(as.numeric) %>%
  mutate_all(~ coalesce(., 0))
## # A tibble: 153 × 6
##    Ozone Solar.R  Wind  Temp Month   Day
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    41     190   7.4    67     5     1
##  2    36     118   8      72     5     2
##  3    12     149  12.6    74     5     3
##  4    18     313  11.5    62     5     4
##  5     0       0  14.3    56     5     5
##  6    28       0  14.9    66     5     6
##  7    23     299   8.6    65     5     7
##  8    19      99  13.8    59     5     8
##  9     8      19  20.1    61     5     9
## 10     0     194   8.6    69     5    10
## # ℹ 143 more rows
tibble(
  y = c(1, 2, NA, NA, 5),
  z = c(NA, NA, 3, 4, 5)
) %>%
  mutate_all(~ coalesce(., 0))
## # A tibble: 5 × 2
##       y     z
##   <dbl> <dbl>
## 1     1     0
## 2     2     0
## 3     0     3
## 4     0     4
## 5     5     5

37.7 标准化

数据变量,在标准化之前是有单位的,如mm,kg等,标准之后就没有量纲了,而是偏离均值的程度,一般用多少方差,几个方差来度量。 标准化的好处在于,不同量纲的变量可以比较分析。

df_mtcars
## # A tibble: 32 × 12
##    rowname   mpg cyl    disp    hp  drat    wt  qsec vs    am    gear  carb 
##    <chr>   <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
##  1 1        21   6      160    110  3.9   2.62  16.5 0     1     4     4    
##  2 2        21   6      160    110  3.9   2.88  17.0 0     1     4     4    
##  3 3        22.8 4      108     93  3.85  2.32  18.6 1     1     4     1    
##  4 4        21.4 6      258    110  3.08  3.22  19.4 1     0     3     1    
##  5 5        18.7 8      360    175  3.15  3.44  17.0 0     0     3     2    
##  6 6        18.1 6      225    105  2.76  3.46  20.2 1     0     3     1    
##  7 7        14.3 8      360    245  3.21  3.57  15.8 0     0     3     4    
##  8 8        24.4 4      147.    62  3.69  3.19  20   1     0     4     2    
##  9 9        22.8 4      141.    95  3.92  3.15  22.9 1     0     4     2    
## 10 10       19.2 6      168.   123  3.92  3.44  18.3 1     0     4     4    
## # ℹ 22 more rows
df_mtcars %>% select_if(funs(is.numeric))
## # A tibble: 32 × 6
##      mpg  disp    hp  drat    wt  qsec
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21    160    110  3.9   2.62  16.5
##  2  21    160    110  3.9   2.88  17.0
##  3  22.8  108     93  3.85  2.32  18.6
##  4  21.4  258    110  3.08  3.22  19.4
##  5  18.7  360    175  3.15  3.44  17.0
##  6  18.1  225    105  2.76  3.46  20.2
##  7  14.3  360    245  3.21  3.57  15.8
##  8  24.4  147.    62  3.69  3.19  20  
##  9  22.8  141.    95  3.92  3.15  22.9
## 10  19.2  168.   123  3.92  3.44  18.3
## # ℹ 22 more rows
# way 1
df_mtcars %>%
  mutate_at(vars(mpg, disp), ~ scale(., center = T, scale = T))
## # A tibble: 32 × 12
##    rowname mpg[,1] cyl   disp[,1]    hp  drat    wt  qsec vs    am    gear 
##    <chr>     <dbl> <fct>    <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
##  1 1         0.151 6      -0.571    110  3.9   2.62  16.5 0     1     4    
##  2 2         0.151 6      -0.571    110  3.9   2.88  17.0 0     1     4    
##  3 3         0.450 4      -0.990     93  3.85  2.32  18.6 1     1     4    
##  4 4         0.217 6       0.220    110  3.08  3.22  19.4 1     0     3    
##  5 5        -0.231 8       1.04     175  3.15  3.44  17.0 0     0     3    
##  6 6        -0.330 6      -0.0462   105  2.76  3.46  20.2 1     0     3    
##  7 7        -0.961 8       1.04     245  3.21  3.57  15.8 0     0     3    
##  8 8         0.715 4      -0.678     62  3.69  3.19  20   1     0     4    
##  9 9         0.450 4      -0.726     95  3.92  3.15  22.9 1     0     4    
## 10 10       -0.148 6      -0.509    123  3.92  3.44  18.3 1     0     4    
## # ℹ 22 more rows
## # ℹ 1 more variable: carb <fct>
# way 2
df_mtcars %>%
  mutate_at(vars(mpg, disp), funs((. - mean(.)) / sd(.)))
## # A tibble: 32 × 12
##    rowname    mpg cyl      disp    hp  drat    wt  qsec vs    am    gear  carb 
##    <chr>    <dbl> <fct>   <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
##  1 1        0.151 6     -0.571    110  3.9   2.62  16.5 0     1     4     4    
##  2 2        0.151 6     -0.571    110  3.9   2.88  17.0 0     1     4     4    
##  3 3        0.450 4     -0.990     93  3.85  2.32  18.6 1     1     4     1    
##  4 4        0.217 6      0.220    110  3.08  3.22  19.4 1     0     3     1    
##  5 5       -0.231 8      1.04     175  3.15  3.44  17.0 0     0     3     2    
##  6 6       -0.330 6     -0.0462   105  2.76  3.46  20.2 1     0     3     1    
##  7 7       -0.961 8      1.04     245  3.21  3.57  15.8 0     0     3     4    
##  8 8        0.715 4     -0.678     62  3.69  3.19  20   1     0     4     2    
##  9 9        0.450 4     -0.726     95  3.92  3.15  22.9 1     0     4     2    
## 10 10      -0.148 6     -0.509    123  3.92  3.44  18.3 1     0     4     4    
## # ℹ 22 more rows
# way 3
func <- function(x) (x - min(x)) / (max(x) - min(x))
df_mtcars %>%
  mutate_at(vars(mpg, disp), ~ func(.))
## # A tibble: 32 × 12
##    rowname   mpg cyl     disp    hp  drat    wt  qsec vs    am    gear  carb 
##    <chr>   <dbl> <fct>  <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
##  1 1       0.451 6     0.222    110  3.9   2.62  16.5 0     1     4     4    
##  2 2       0.451 6     0.222    110  3.9   2.88  17.0 0     1     4     4    
##  3 3       0.528 4     0.0920    93  3.85  2.32  18.6 1     1     4     1    
##  4 4       0.468 6     0.466    110  3.08  3.22  19.4 1     0     3     1    
##  5 5       0.353 8     0.721    175  3.15  3.44  17.0 0     0     3     2    
##  6 6       0.328 6     0.384    105  2.76  3.46  20.2 1     0     3     1    
##  7 7       0.166 8     0.721    245  3.21  3.57  15.8 0     0     3     4    
##  8 8       0.596 4     0.189     62  3.69  3.19  20   1     0     4     2    
##  9 9       0.528 4     0.174     95  3.92  3.15  22.9 1     0     4     2    
## 10 10      0.374 6     0.241    123  3.92  3.44  18.3 1     0     4     4    
## # ℹ 22 more rows

如果所有的列,都是数值型

func <- function(x) (x - min(x)) / (max(x) - min(x))

df_mtcars %>% mutate_all(~ func(.))
## Error in `mutate()`:
## ℹ In argument: `rowname = (structure(function (..., .x = ..1, .y = ..2,
##   . = ..1) ...`.
## Caused by error in `x - min(x)`:
## ! non-numeric argument to binary operator
  • 但这里数据中还有其他类型(fct, chr),所以这里 mutate_all() 会报错。
  • 这种情形,用mutate_if()
func <- function(x) (x - min(x)) / (max(x) - min(x))

df_mtcars %>% mutate_if(is.numeric, ~ func(.))
## # A tibble: 32 × 12
##    rowname   mpg cyl     disp     hp  drat    wt  qsec vs    am    gear  carb 
##    <chr>   <dbl> <fct>  <dbl>  <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
##  1 1       0.451 6     0.222  0.205  0.525 0.283 0.233 0     1     4     4    
##  2 2       0.451 6     0.222  0.205  0.525 0.348 0.3   0     1     4     4    
##  3 3       0.528 4     0.0920 0.145  0.502 0.206 0.489 1     1     4     1    
##  4 4       0.468 6     0.466  0.205  0.147 0.435 0.588 1     0     3     1    
##  5 5       0.353 8     0.721  0.435  0.180 0.493 0.3   0     0     3     2    
##  6 6       0.328 6     0.384  0.187  0     0.498 0.681 1     0     3     1    
##  7 7       0.166 8     0.721  0.682  0.207 0.526 0.160 0     0     3     4    
##  8 8       0.596 4     0.189  0.0353 0.429 0.429 0.655 1     0     4     2    
##  9 9       0.528 4     0.174  0.152  0.535 0.419 1     1     0     4     2    
## 10 10      0.374 6     0.241  0.251  0.535 0.493 0.452 1     0     4     4    
## # ℹ 22 more rows
funs <- list(
  centered = mean, # Function object
  scaled = ~ . - mean(.) / sd(.) # Purrr-style lambda
)

iris %>%
  mutate_if(is.numeric, funs)
## # A tibble: 150 × 13
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ℹ 140 more rows
## # ℹ 8 more variables: Sepal.Length_centered <dbl>, Sepal.Width_centered <dbl>,
## #   Petal.Length_centered <dbl>, Petal.Width_centered <dbl>,
## #   Sepal.Length_scaled <dbl>, Sepal.Width_scaled <dbl>,
## #   Petal.Length_scaled <dbl>, Petal.Width_scaled <dbl>

37.8 across函数

数据框中向量de方向,事实上可以看做有两个方向,横着看是row-vector,竖着看是col-vector。

  • colwise: group_by() %>% summarise/mutate + across()
  • rowwise: rowwise()/nest_by() %>% summarise/mutate + c_across()

比如

iris %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(
    across(starts_with("Sepal"), mean),
    Area = mean(Petal.Length * Petal.Width),
    across(starts_with("Petal"), min)
  )

37.8.1 across函数替代scope函数

强大的across()函数,替代以上scope函数(_if, _at, 和 _all函数), 同时slice_max(), slice_min(), slice_n() 将替代 top_n()函数。请参考阅读第?? 章。

df %>% mutate_if(is.numeric, mean, na.rm = TRUE)
# ->
df %>% mutate(across(is.numeric, mean, na.rm = TRUE))

df %>% mutate_at(vars(x, starts_with("y")), mean, na.rm = TRUE)
# ->
df %>% mutate(across(c(x, starts_with("y")), mean, na.rm = TRUE))

df %>% mutate_all(mean, na.rm = TRUE)
# ->
df %>% mutate(across(everything(), mean, na.rm = TRUE))

37.8.2 更方便的colwise操作

# multiple
df <- tibble(x = 1:3, y = 3:5, z = 5:7)
mult <- list(x = 1, y = 10, z = 100)

df %>% mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]]))



# weights
df <- tibble(x = 1:3, y = 3:5, z = 5:7)
df
weights <- list(x = 0.2, y = 0.3, z = 0.5)

df %>% dplyr::mutate(
  across(all_of(names(weights)),
    list(wt = ~ .x * weights[[cur_column()]]),
    .names = "{col}.{fn}"
  )
)



# cutoffs
df <- tibble(x = 1:3, y = 3:5, z = 5:7)
df

cutoffs <- list(x = 2, y = 3, z = 7)

df %>% dplyr::mutate(
  across(all_of(names(cutoffs)), ~ if_else(.x > cutoffs[[cur_column()]], 1, 0))
)