第 42 章 tidyverse中的across()之美3
有同学说across()
函数只能在summarise()
或者mutate()
中使用,事实上能使用across()
的函数还是挺多的。我们列举一些看看。
42.1 用在mutate()
中
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm
## <fct> <fct> <dbl> <dbl>
## 1 Adelie Torgersen 3.67 2.93
## 2 Adelie Torgersen 3.68 2.86
## 3 Adelie Torgersen 3.70 2.89
## 4 Adelie Torgersen 3.60 2.96
## 5 Adelie Torgersen 3.67 3.03
## 6 Adelie Torgersen 3.66 2.88
## # … with 327 more rows, and 4 more variables:
## # flipper_length_mm <dbl>, body_mass_g <dbl>,
## # sex <fct>, year <dbl>
42.2 用在summarise()
中
penguins %>%
group_by(species) %>%
summarise(
across(starts_with("bill_length_"), mean),
Area = mean(bill_length_mm * bill_depth_mm),
across(starts_with("bill_depth_"), min)
)
## # A tibble: 3 × 4
## species bill_length_mm Area bill_depth_mm
## <fct> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 712. 15.5
## 2 Chinstrap 48.8 900. 16.4
## 3 Gentoo 47.6 713. 13.1
penguins %>%
select(species, sex, bill_length_mm) %>%
summarise(
mean = mean(bill_length_mm),
across(-bill_length_mm)
)
## # A tibble: 333 × 3
## mean species sex
## <dbl> <fct> <fct>
## 1 44.0 Adelie male
## 2 44.0 Adelie female
## 3 44.0 Adelie female
## 4 44.0 Adelie female
## 5 44.0 Adelie male
## 6 44.0 Adelie female
## # … with 327 more rows
42.3 用在group_by()
中
penguins %>%
group_by(across(c(species, island, sex))) %>%
summarise(
across(bill_length_mm, mean, na.rm = TRUE)
)
## # A tibble: 10 × 4
## # Groups: species, island [5]
## species island sex bill_length_mm
## <fct> <fct> <fct> <dbl>
## 1 Adelie Biscoe female 37.4
## 2 Adelie Biscoe male 40.6
## 3 Adelie Dream female 36.9
## 4 Adelie Dream male 40.1
## 5 Adelie Torgersen female 37.6
## 6 Adelie Torgersen male 40.6
## # … with 4 more rows
penguins %>%
group_by(across(where(is.factor))) %>%
summarise(
across(bill_length_mm, mean, na.rm = TRUE)
)
## # A tibble: 10 × 4
## # Groups: species, island [5]
## species island sex bill_length_mm
## <fct> <fct> <fct> <dbl>
## 1 Adelie Biscoe female 37.4
## 2 Adelie Biscoe male 40.6
## 3 Adelie Dream female 36.9
## 4 Adelie Dream male 40.1
## 5 Adelie Torgersen female 37.6
## 6 Adelie Torgersen male 40.6
## # … with 4 more rows
sum_group_vars <- function(df, group_vars, sum_vars) {
df %>%
group_by(across({{ group_vars }})) %>%
summarise(n = n(),
across({{ sum_vars }},
list(mean = mean, sd = sd))
)
}
penguins %>%
sum_group_vars(
c(species, year), c(bill_length_mm, bill_depth_mm)
)
## # A tibble: 9 × 7
## # Groups: species [3]
## species year n bill_length_mm_… bill_length_mm_…
## <fct> <int> <int> <dbl> <dbl>
## 1 Adelie 2007 44 38.9 2.44
## 2 Adelie 2008 50 38.6 2.98
## 3 Adelie 2009 52 39.0 2.56
## 4 Chinst… 2007 26 48.7 3.47
## 5 Chinst… 2008 18 48.7 3.62
## 6 Chinst… 2009 24 49.1 3.10
## # … with 3 more rows, and 2 more variables:
## # bill_depth_mm_mean <dbl>, bill_depth_mm_sd <dbl>
42.4 用在filter()
中
df <- tibble(
a = letters[1:5],
b = 1:5,
c = 6:10,
d = 11:15
)
df %>%
dplyr::filter(
across(where(is.numeric), .fns = ~ .x > 2)
)
## # A tibble: 3 × 4
## a b c d
## <chr> <int> <int> <int>
## 1 c 3 8 13
## 2 d 4 9 14
## 3 e 5 10 15
## # A tibble: 3 × 4
## a b c d
## <chr> <int> <int> <int>
## 1 c 3 8 13
## 2 d 4 9 14
## 3 e 5 10 15
42.5 用在distinct()
中
## # A tibble: 5 × 2
## species island
## <fct> <fct>
## 1 Adelie Torgersen
## 2 Adelie Biscoe
## 3 Adelie Dream
## 4 Gentoo Biscoe
## 5 Chinstrap Dream
42.6 用在arrange()
中
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm
## <fct> <fct> <dbl> <dbl>
## 1 Adelie Dream 32.1 15.5
## 2 Adelie Dream 33.1 16.1
## 3 Adelie Torgersen 33.5 19
## 4 Adelie Dream 34 17.1
## 5 Adelie Torgersen 34.4 18.4
## 6 Adelie Biscoe 34.5 18.1
## # … with 327 more rows, and 4 more variables:
## # flipper_length_mm <int>, body_mass_g <int>,
## # sex <fct>, year <int>
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm
## <fct> <fct> <dbl> <dbl>
## 1 Adelie Dream 32.1 15.5
## 2 Adelie Dream 33.1 16.1
## 3 Adelie Torgersen 33.5 19
## 4 Adelie Dream 34 17.1
## 5 Adelie Torgersen 34.4 18.4
## 6 Adelie Biscoe 34.5 18.1
## # … with 327 more rows, and 4 more variables:
## # flipper_length_mm <int>, body_mass_g <int>,
## # sex <fct>, year <int>
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm
## <fct> <fct> <dbl> <dbl>
## 1 Adelie Torgersen 39.5 17.4
## 2 Adelie Torgersen 40.3 18
## 3 Adelie Torgersen 36.7 19.3
## 4 Adelie Torgersen 38.9 17.8
## 5 Adelie Torgersen 41.1 17.6
## 6 Adelie Torgersen 36.6 17.8
## # … with 327 more rows, and 4 more variables:
## # flipper_length_mm <int>, body_mass_g <int>,
## # sex <fct>, year <int>
42.7 用在count()
## # A tibble: 2 × 2
## sex n
## <fct> <int>
## 1 female 165
## 2 male 168
## # A tibble: 10 × 4
## species island sex n
## <fct> <fct> <fct> <int>
## 1 Adelie Biscoe female 22
## 2 Adelie Biscoe male 22
## 3 Adelie Dream female 27
## 4 Adelie Dream male 28
## 5 Adelie Torgersen female 24
## 6 Adelie Torgersen male 23
## # … with 4 more rows
用在自定义的函数里,挺方便
count_multiple <- function(df, ...) {
df %>%
select(...) %>%
names() %>%
map( ~ count(df, across(all_of(.x)), sort = TRUE))
}
penguins %>%
count_multiple(where(is.factor))
## [[1]]
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 146
## 2 Gentoo 119
## 3 Chinstrap 68
##
## [[2]]
## # A tibble: 3 × 2
## island n
## <fct> <int>
## 1 Biscoe 163
## 2 Dream 123
## 3 Torgersen 47
##
## [[3]]
## # A tibble: 2 × 2
## sex n
## <fct> <int>
## 1 male 168
## 2 female 165
42.8 用在purrr::map()
中
我们想求行方向的均值,根据第 39 章介绍的技术
根据第 21 章介绍函数式编程
事实上,我们还可以这样写,
## # A tibble: 3 × 3
## x y z
## <int> <int> <dbl>
## 1 1 2 1.5
## 2 2 3 2.5
## 3 3 4 3.5
或者利用mutate()
数据框并入
## # A tibble: 3 × 3
## x y z
## <int> <int> <dbl>
## 1 1 2 1.5
## 2 2 3 2.5
## 3 3 4 3.5
再举一个例,我想求出数据框每一行的多个统计值,也可以用到数据框并入
df <- tibble(
a = letters[1:5],
b = 1:5,
c = 6:10,
d = 11:15
)
df %>%
mutate(
pmap_dfr(across(b:d), ~lst(min = min(c(...)),
max = max(c(...)),
ratio = min/max
)
)
)
## # A tibble: 5 × 7
## a b c d min max ratio
## <chr> <int> <int> <int> <int> <int> <dbl>
## 1 a 1 6 11 1 11 0.0909
## 2 b 2 7 12 2 12 0.167
## 3 c 3 8 13 3 13 0.231
## 4 d 4 9 14 4 14 0.286
## 5 e 5 10 15 5 15 0.333
再比如例子,一行中,将最大值出现后的所有数值替换成0
df <- tibble(
x = c(55, 23, 15, 10),
y = c(42, NA, 90, 30),
z = c(12, 17, 10, 12),
w = c(NA, 45, NA, NA)
)
df
## # A tibble: 4 × 4
## x y z w
## <dbl> <dbl> <dbl> <dbl>
## 1 55 42 12 NA
## 2 23 NA 17 45
## 3 15 90 10 NA
## 4 10 30 12 NA
df %>% mutate(
pmap_dfr(
across(everything()),
~ `[<-`(c(...), seq_along(c(...)) > which.max(c(...)), 0))
)
## # A tibble: 4 × 4
## x y z w
## <dbl> <dbl> <dbl> <dbl>
## 1 55 0 0 0
## 2 23 NA 17 45
## 3 15 90 0 0
## 4 10 30 0 0
也可以这样写
myfun <- function(x) {
x[seq_along(x) > which.max(x)] <- 0
return(x)
}
df %>% mutate(
pmap_dfr(
across(everything()),
~ myfun(c(...))
)
)
## # A tibble: 4 × 4
## x y z w
## <dbl> <dbl> <dbl> <dbl>
## 1 55 0 0 0
## 2 23 NA 17 45
## 3 15 90 0 0
## 4 10 30 0 0
更多案例请看第 43 章。