第 42 章 tidyverse中的across()之美3
有同学说across()
函数只能在summarise()
或者mutate()
中使用,事实上能使用across()
的函数还是挺多的。我们列举一些看看。
42.1 用在mutate()
中
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 3.67 2.93 5.20 8.23
## 2 Adelie Torgersen 3.68 2.86 5.23 8.24
## 3 Adelie Torgersen 3.70 2.89 5.27 8.09
## 4 Adelie Torgersen 3.60 2.96 5.26 8.15
## 5 Adelie Torgersen 3.67 3.03 5.25 8.20
## 6 Adelie Torgersen 3.66 2.88 5.20 8.20
## 7 Adelie Torgersen 3.67 2.98 5.27 8.45
## 8 Adelie Torgersen 3.72 2.87 5.20 8.07
## 9 Adelie Torgersen 3.65 3.05 5.25 8.24
## 10 Adelie Torgersen 3.54 3.05 5.29 8.39
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <dbl>
42.2 用在summarise()
中
penguins %>%
group_by(species) %>%
summarise(
across(starts_with("bill_length_"), mean),
Area = mean(bill_length_mm * bill_depth_mm),
across(starts_with("bill_depth_"), min)
)
## # A tibble: 3 × 4
## species bill_length_mm Area bill_depth_mm
## <fct> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 712. 15.5
## 2 Chinstrap 48.8 900. 16.4
## 3 Gentoo 47.6 713. 13.1
penguins %>%
select(species, sex, bill_length_mm) %>%
summarise(
mean = mean(bill_length_mm),
across(-bill_length_mm)
)
## # A tibble: 333 × 3
## mean species sex
## <dbl> <fct> <fct>
## 1 44.0 Adelie male
## 2 44.0 Adelie female
## 3 44.0 Adelie female
## 4 44.0 Adelie female
## 5 44.0 Adelie male
## 6 44.0 Adelie female
## 7 44.0 Adelie male
## 8 44.0 Adelie female
## 9 44.0 Adelie male
## 10 44.0 Adelie male
## # ℹ 323 more rows
42.3 用在group_by()
中
penguins %>%
group_by(across(c(species, island, sex))) %>%
summarise(
across(bill_length_mm, mean, na.rm = TRUE)
)
## # A tibble: 10 × 4
## # Groups: species, island [5]
## species island sex bill_length_mm
## <fct> <fct> <fct> <dbl>
## 1 Adelie Biscoe female 37.4
## 2 Adelie Biscoe male 40.6
## 3 Adelie Dream female 36.9
## 4 Adelie Dream male 40.1
## 5 Adelie Torgersen female 37.6
## 6 Adelie Torgersen male 40.6
## 7 Chinstrap Dream female 46.6
## 8 Chinstrap Dream male 51.1
## 9 Gentoo Biscoe female 45.6
## 10 Gentoo Biscoe male 49.5
penguins %>%
group_by(across(where(is.factor))) %>%
summarise(
across(bill_length_mm, mean, na.rm = TRUE)
)
## # A tibble: 10 × 4
## # Groups: species, island [5]
## species island sex bill_length_mm
## <fct> <fct> <fct> <dbl>
## 1 Adelie Biscoe female 37.4
## 2 Adelie Biscoe male 40.6
## 3 Adelie Dream female 36.9
## 4 Adelie Dream male 40.1
## 5 Adelie Torgersen female 37.6
## 6 Adelie Torgersen male 40.6
## 7 Chinstrap Dream female 46.6
## 8 Chinstrap Dream male 51.1
## 9 Gentoo Biscoe female 45.6
## 10 Gentoo Biscoe male 49.5
sum_group_vars <- function(df, group_vars, sum_vars) {
df %>%
group_by(across({{ group_vars }})) %>%
summarise(n = n(),
across({{ sum_vars }},
list(mean = mean, sd = sd))
)
}
penguins %>%
sum_group_vars(
c(species, year), c(bill_length_mm, bill_depth_mm)
)
## # A tibble: 9 × 7
## # Groups: species [3]
## species year n bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean
## <fct> <int> <int> <dbl> <dbl> <dbl>
## 1 Adelie 2007 44 38.9 2.44 18.8
## 2 Adelie 2008 50 38.6 2.98 18.2
## 3 Adelie 2009 52 39.0 2.56 18.1
## 4 Chinstrap 2007 26 48.7 3.47 18.5
## 5 Chinstrap 2008 18 48.7 3.62 18.4
## 6 Chinstrap 2009 24 49.1 3.10 18.3
## 7 Gentoo 2007 33 47.1 3.29 14.7
## 8 Gentoo 2008 45 47.0 2.66 14.9
## 9 Gentoo 2009 41 48.6 3.19 15.3
## # ℹ 1 more variable: bill_depth_mm_sd <dbl>
42.4 用在filter()
中
df <- tibble(
a = letters[1:5],
b = 1:5,
c = 6:10,
d = 11:15
)
df %>%
dplyr::filter(
across(where(is.numeric), .fns = ~ .x > 2)
)
## # A tibble: 3 × 4
## a b c d
## <chr> <int> <int> <int>
## 1 c 3 8 13
## 2 d 4 9 14
## 3 e 5 10 15
## # A tibble: 3 × 4
## a b c d
## <chr> <int> <int> <int>
## 1 c 3 8 13
## 2 d 4 9 14
## 3 e 5 10 15
42.5 用在distinct()
中
## # A tibble: 5 × 2
## island species
## <fct> <fct>
## 1 Torgersen Adelie
## 2 Biscoe Adelie
## 3 Dream Adelie
## 4 Biscoe Gentoo
## 5 Dream Chinstrap
42.6 用在arrange()
中
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Dream 32.1 15.5 188 3050
## 2 Adelie Dream 33.1 16.1 178 2900
## 3 Adelie Torgersen 33.5 19 190 3600
## 4 Adelie Dream 34 17.1 185 3400
## 5 Adelie Torgersen 34.4 18.4 184 3325
## 6 Adelie Biscoe 34.5 18.1 187 2900
## 7 Adelie Torgersen 34.6 21.1 198 4400
## 8 Adelie Torgersen 34.6 17.2 189 3200
## 9 Adelie Biscoe 35 17.9 190 3450
## 10 Adelie Biscoe 35 17.9 192 3725
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Dream 32.1 15.5 188 3050
## 2 Adelie Dream 33.1 16.1 178 2900
## 3 Adelie Torgersen 33.5 19 190 3600
## 4 Adelie Dream 34 17.1 185 3400
## 5 Adelie Torgersen 34.4 18.4 184 3325
## 6 Adelie Biscoe 34.5 18.1 187 2900
## 7 Adelie Torgersen 34.6 17.2 189 3200
## 8 Adelie Torgersen 34.6 21.1 198 4400
## 9 Adelie Biscoe 35 17.9 190 3450
## 10 Adelie Biscoe 35 17.9 192 3725
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.5 17.4 186 3800
## 2 Adelie Torgersen 40.3 18 195 3250
## 3 Adelie Torgersen 36.7 19.3 193 3450
## 4 Adelie Torgersen 38.9 17.8 181 3625
## 5 Adelie Torgersen 41.1 17.6 182 3200
## 6 Adelie Torgersen 36.6 17.8 185 3700
## 7 Adelie Torgersen 38.7 19 195 3450
## 8 Adelie Torgersen 34.4 18.4 184 3325
## 9 Adelie Biscoe 37.8 18.3 174 3400
## 10 Adelie Biscoe 35.9 19.2 189 3800
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
42.7 用在count()
## # A tibble: 2 × 2
## sex n
## <fct> <int>
## 1 female 165
## 2 male 168
## # A tibble: 10 × 4
## species island sex n
## <fct> <fct> <fct> <int>
## 1 Adelie Biscoe female 22
## 2 Adelie Biscoe male 22
## 3 Adelie Dream female 27
## 4 Adelie Dream male 28
## 5 Adelie Torgersen female 24
## 6 Adelie Torgersen male 23
## 7 Chinstrap Dream female 34
## 8 Chinstrap Dream male 34
## 9 Gentoo Biscoe female 58
## 10 Gentoo Biscoe male 61
用在自定义的函数里,挺方便
count_multiple <- function(df, ...) {
df %>%
select(...) %>%
names() %>%
map( ~ count(df, across(all_of(.x)), sort = TRUE))
}
penguins %>%
count_multiple(where(is.factor))
## [[1]]
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 146
## 2 Gentoo 119
## 3 Chinstrap 68
##
## [[2]]
## # A tibble: 3 × 2
## island n
## <fct> <int>
## 1 Biscoe 163
## 2 Dream 123
## 3 Torgersen 47
##
## [[3]]
## # A tibble: 2 × 2
## sex n
## <fct> <int>
## 1 male 168
## 2 female 165
42.8 用在purrr::map()
中
我们想求行方向的均值,根据第 39 章介绍的技术
根据第 21 章介绍函数式编程
事实上,我们还可以这样写,
## # A tibble: 3 × 3
## x y z
## <int> <int> <dbl>
## 1 1 2 1.5
## 2 2 3 2.5
## 3 3 4 3.5
或者利用mutate()
数据框并入
## # A tibble: 3 × 3
## x y z
## <int> <int> <dbl>
## 1 1 2 1.5
## 2 2 3 2.5
## 3 3 4 3.5
再举一个例,我想求出数据框每一行的多个统计值,也可以用到数据框并入
df <- tibble(
a = letters[1:5],
b = 1:5,
c = 6:10,
d = 11:15
)
df %>%
mutate(
pmap_dfr(across(b:d), ~lst(min = min(c(...)),
max = max(c(...)),
ratio = min/max
)
)
)
## # A tibble: 5 × 7
## a b c d min max ratio
## <chr> <int> <int> <int> <int> <int> <dbl>
## 1 a 1 6 11 1 11 0.0909
## 2 b 2 7 12 2 12 0.167
## 3 c 3 8 13 3 13 0.231
## 4 d 4 9 14 4 14 0.286
## 5 e 5 10 15 5 15 0.333
再比如例子,一行中,将最大值出现后的所有数值替换成0
df <- tibble(
x = c(55, 23, 15, 10),
y = c(42, NA, 90, 30),
z = c(12, 17, 10, 12),
w = c(NA, 45, NA, NA)
)
df
## # A tibble: 4 × 4
## x y z w
## <dbl> <dbl> <dbl> <dbl>
## 1 55 42 12 NA
## 2 23 NA 17 45
## 3 15 90 10 NA
## 4 10 30 12 NA
df %>% mutate(
pmap_dfr(
across(everything()),
~ `[<-`(c(...), seq_along(c(...)) > which.max(c(...)), 0))
)
## # A tibble: 4 × 4
## x y z w
## <dbl> <dbl> <dbl> <dbl>
## 1 55 0 0 0
## 2 23 NA 17 45
## 3 15 90 0 0
## 4 10 30 0 0
也可以这样写
myfun <- function(x) {
x[seq_along(x) > which.max(x)] <- 0
return(x)
}
df %>% mutate(
pmap_dfr(
across(everything()),
~ myfun(c(...))
)
)
## # A tibble: 4 × 4
## x y z w
## <dbl> <dbl> <dbl> <dbl>
## 1 55 0 0 0
## 2 23 NA 17 45
## 3 15 90 0 0
## 4 10 30 0 0
更多案例请看第 43 章。