第 42 章 tidyverse中的across()之美3

有同学说across()函数只能在summarise()或者mutate()中使用,事实上能使用across()的函数还是挺多的。我们列举一些看看。

library(tidyverse)
library(palmerpenguins)

penguins <- palmerpenguins::penguins %>% drop_na()

42.1 用在mutate()

penguins %>%
  mutate(
    across(where(is.numeric), log),
    across(where(is.character), as.factor)
  )
## # A tibble: 333 × 8
##   species island    bill_length_mm bill_depth_mm
##   <fct>   <fct>              <dbl>         <dbl>
## 1 Adelie  Torgersen           3.67          2.93
## 2 Adelie  Torgersen           3.68          2.86
## 3 Adelie  Torgersen           3.70          2.89
## 4 Adelie  Torgersen           3.60          2.96
## 5 Adelie  Torgersen           3.67          3.03
## 6 Adelie  Torgersen           3.66          2.88
## # … with 327 more rows, and 4 more variables:
## #   flipper_length_mm <dbl>, body_mass_g <dbl>,
## #   sex <fct>, year <dbl>

42.2 用在summarise()

penguins %>% 
  group_by(species) %>%
  summarise(
    across(starts_with("bill_length_"), mean),
    Area = mean(bill_length_mm * bill_depth_mm),
    across(starts_with("bill_depth_"), min)
  )
## # A tibble: 3 × 4
##   species   bill_length_mm  Area bill_depth_mm
##   <fct>              <dbl> <dbl>         <dbl>
## 1 Adelie              38.8  712.          15.5
## 2 Chinstrap           48.8  900.          16.4
## 3 Gentoo              47.6  713.          13.1
penguins %>% 
  select(species, sex, bill_length_mm) %>% 
  summarise(
    mean = mean(bill_length_mm),
    across(-bill_length_mm)
  )
## # A tibble: 333 × 3
##    mean species sex   
##   <dbl> <fct>   <fct> 
## 1  44.0 Adelie  male  
## 2  44.0 Adelie  female
## 3  44.0 Adelie  female
## 4  44.0 Adelie  female
## 5  44.0 Adelie  male  
## 6  44.0 Adelie  female
## # … with 327 more rows

42.3 用在group_by()

penguins %>% 
  group_by(across(c(species, island, sex))) %>% 
  summarise(
   across(bill_length_mm, mean, na.rm = TRUE)
  )
## # A tibble: 10 × 4
## # Groups:   species, island [5]
##   species island    sex    bill_length_mm
##   <fct>   <fct>     <fct>           <dbl>
## 1 Adelie  Biscoe    female           37.4
## 2 Adelie  Biscoe    male             40.6
## 3 Adelie  Dream     female           36.9
## 4 Adelie  Dream     male             40.1
## 5 Adelie  Torgersen female           37.6
## 6 Adelie  Torgersen male             40.6
## # … with 4 more rows
penguins %>% 
  group_by(across(where(is.factor))) %>% 
  summarise(
    across(bill_length_mm, mean, na.rm = TRUE)
  )
## # A tibble: 10 × 4
## # Groups:   species, island [5]
##   species island    sex    bill_length_mm
##   <fct>   <fct>     <fct>           <dbl>
## 1 Adelie  Biscoe    female           37.4
## 2 Adelie  Biscoe    male             40.6
## 3 Adelie  Dream     female           36.9
## 4 Adelie  Dream     male             40.1
## 5 Adelie  Torgersen female           37.6
## 6 Adelie  Torgersen male             40.6
## # … with 4 more rows
sum_group_vars <- function(df, group_vars, sum_vars) {
  df %>% 
    group_by(across({{ group_vars }})) %>% 
    summarise(n = n(), 
              across({{ sum_vars }}, 
                     list(mean = mean, sd = sd))
              )
}

penguins %>% 
  sum_group_vars(
    c(species, year), c(bill_length_mm, bill_depth_mm)
  )
## # A tibble: 9 × 7
## # Groups:   species [3]
##   species  year     n bill_length_mm_… bill_length_mm_…
##   <fct>   <int> <int>            <dbl>            <dbl>
## 1 Adelie   2007    44             38.9             2.44
## 2 Adelie   2008    50             38.6             2.98
## 3 Adelie   2009    52             39.0             2.56
## 4 Chinst…  2007    26             48.7             3.47
## 5 Chinst…  2008    18             48.7             3.62
## 6 Chinst…  2009    24             49.1             3.10
## # … with 3 more rows, and 2 more variables:
## #   bill_depth_mm_mean <dbl>, bill_depth_mm_sd <dbl>

42.4 用在filter()

df <- tibble(
  a = letters[1:5],
  b = 1:5,
  c = 6:10,
  d = 11:15
)


df %>%
  dplyr::filter(
    across(where(is.numeric), .fns = ~ .x > 2)
  )
## # A tibble: 3 × 4
##   a         b     c     d
##   <chr> <int> <int> <int>
## 1 c         3     8    13
## 2 d         4     9    14
## 3 e         5    10    15
# 等价
df %>%
  dplyr::filter(
    if_all(where(is.numeric), .fns = ~ .x > 2)
  )
## # A tibble: 3 × 4
##   a         b     c     d
##   <chr> <int> <int> <int>
## 1 c         3     8    13
## 2 d         4     9    14
## 3 e         5    10    15

42.5 用在distinct()

penguins %>% 
  distinct(
    across(c(island, species))
  )
## # A tibble: 5 × 2
##   species   island   
##   <fct>     <fct>    
## 1 Adelie    Torgersen
## 2 Adelie    Biscoe   
## 3 Adelie    Dream    
## 4 Gentoo    Biscoe   
## 5 Chinstrap Dream

42.6 用在arrange()

penguins %>% 
  arrange(across(bill_length_mm))
## # A tibble: 333 × 8
##   species island    bill_length_mm bill_depth_mm
##   <fct>   <fct>              <dbl>         <dbl>
## 1 Adelie  Dream               32.1          15.5
## 2 Adelie  Dream               33.1          16.1
## 3 Adelie  Torgersen           33.5          19  
## 4 Adelie  Dream               34            17.1
## 5 Adelie  Torgersen           34.4          18.4
## 6 Adelie  Biscoe              34.5          18.1
## # … with 327 more rows, and 4 more variables:
## #   flipper_length_mm <int>, body_mass_g <int>,
## #   sex <fct>, year <int>
penguins %>% 
  arrange(across(ends_with("_mm")))
## # A tibble: 333 × 8
##   species island    bill_length_mm bill_depth_mm
##   <fct>   <fct>              <dbl>         <dbl>
## 1 Adelie  Dream               32.1          15.5
## 2 Adelie  Dream               33.1          16.1
## 3 Adelie  Torgersen           33.5          19  
## 4 Adelie  Dream               34            17.1
## 5 Adelie  Torgersen           34.4          18.4
## 6 Adelie  Biscoe              34.5          18.1
## # … with 327 more rows, and 4 more variables:
## #   flipper_length_mm <int>, body_mass_g <int>,
## #   sex <fct>, year <int>
f <- function(.data, order_by) {
  .data %>%
    arrange(across({{order_by}}))
}

penguins %>% 
  f(sex)
## # A tibble: 333 × 8
##   species island    bill_length_mm bill_depth_mm
##   <fct>   <fct>              <dbl>         <dbl>
## 1 Adelie  Torgersen           39.5          17.4
## 2 Adelie  Torgersen           40.3          18  
## 3 Adelie  Torgersen           36.7          19.3
## 4 Adelie  Torgersen           38.9          17.8
## 5 Adelie  Torgersen           41.1          17.6
## 6 Adelie  Torgersen           36.6          17.8
## # … with 327 more rows, and 4 more variables:
## #   flipper_length_mm <int>, body_mass_g <int>,
## #   sex <fct>, year <int>

42.7 用在count()

penguins %>% 
  count(across(sex))
## # A tibble: 2 × 2
##   sex        n
##   <fct>  <int>
## 1 female   165
## 2 male     168
penguins %>% 
  count(
    across(where(is.factor))
  )
## # A tibble: 10 × 4
##   species island    sex        n
##   <fct>   <fct>     <fct>  <int>
## 1 Adelie  Biscoe    female    22
## 2 Adelie  Biscoe    male      22
## 3 Adelie  Dream     female    27
## 4 Adelie  Dream     male      28
## 5 Adelie  Torgersen female    24
## 6 Adelie  Torgersen male      23
## # … with 4 more rows

用在自定义的函数里,挺方便

count_multiple <- function(df, ...) {
  df %>% 
    select(...) %>% 
    names() %>% 
    map( ~ count(df, across(all_of(.x)), sort = TRUE))
}

penguins %>%
  count_multiple(where(is.factor))
## [[1]]
## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      146
## 2 Gentoo      119
## 3 Chinstrap    68
## 
## [[2]]
## # A tibble: 3 × 2
##   island        n
##   <fct>     <int>
## 1 Biscoe      163
## 2 Dream       123
## 3 Torgersen    47
## 
## [[3]]
## # A tibble: 2 × 2
##   sex        n
##   <fct>  <int>
## 1 male     168
## 2 female   165

42.8 用在purrr::map()

我们想求行方向的均值,根据第 39 章介绍的技术

tibble(
  x = 1:3,
  y = 2:4
) %>% 
  rowwise() %>% 
  mutate(
    min = mean(c_across())
  )

根据第 21 章介绍函数式编程

tibble(
  x = 1:3,
  y = 2:4
) %>% 
  pmap_dfr(
    ~list(z = mean(c(...)))
  )

事实上,我们还可以这样写,

tibble(
  x = 1:3,
  y = 2:4
) %>% 
  mutate(
    z = pmap_dbl(across(), lift_vd(mean))
  )
## # A tibble: 3 × 3
##       x     y     z
##   <int> <int> <dbl>
## 1     1     2   1.5
## 2     2     3   2.5
## 3     3     4   3.5

或者利用mutate()数据框并入

tibble(
  x = 1:3,
  y = 2:4
) %>% 
  mutate(
    pmap_dfr(across(), ~list(z = mean(c(...))))
  ) 
## # A tibble: 3 × 3
##       x     y     z
##   <int> <int> <dbl>
## 1     1     2   1.5
## 2     2     3   2.5
## 3     3     4   3.5

再举一个例,我想求出数据框每一行的多个统计值,也可以用到数据框并入

df <- tibble(
  a = letters[1:5],
  b = 1:5,
  c = 6:10,
  d = 11:15
)

df %>% 
  mutate(
    pmap_dfr(across(b:d), ~lst(min = min(c(...)), 
                               max = max(c(...)), 
                               ratio = min/max
                               )
    )
  )
## # A tibble: 5 × 7
##   a         b     c     d   min   max  ratio
##   <chr> <int> <int> <int> <int> <int>  <dbl>
## 1 a         1     6    11     1    11 0.0909
## 2 b         2     7    12     2    12 0.167 
## 3 c         3     8    13     3    13 0.231 
## 4 d         4     9    14     4    14 0.286 
## 5 e         5    10    15     5    15 0.333

再比如例子,一行中,将最大值出现后的所有数值替换成0

df <- tibble(
  x = c(55, 23, 15, 10),
  y = c(42, NA, 90, 30),
  z = c(12, 17, 10, 12),
  w = c(NA, 45, NA, NA)
)
df
## # A tibble: 4 × 4
##       x     y     z     w
##   <dbl> <dbl> <dbl> <dbl>
## 1    55    42    12    NA
## 2    23    NA    17    45
## 3    15    90    10    NA
## 4    10    30    12    NA
df %>% mutate(
  pmap_dfr(
    across(everything()), 
    ~ `[<-`(c(...), seq_along(c(...)) > which.max(c(...)), 0))
)
## # A tibble: 4 × 4
##       x     y     z     w
##   <dbl> <dbl> <dbl> <dbl>
## 1    55     0     0     0
## 2    23    NA    17    45
## 3    15    90     0     0
## 4    10    30     0     0

也可以这样写

myfun <- function(x) {
  x[seq_along(x) > which.max(x)] <- 0
  return(x)
}

df %>% mutate(
  pmap_dfr(
    across(everything()), 
   ~ myfun(c(...))
  )
)
## # A tibble: 4 × 4
##       x     y     z     w
##   <dbl> <dbl> <dbl> <dbl>
## 1    55     0     0     0
## 2    23    NA    17    45
## 3    15    90     0     0
## 4    10    30     0     0

更多案例请看第 43 章。