# 第 19 章 tidyverse中的若干技巧

“most of data science is counting, and sometimes dividing” — Hadley Wickham

library(tidyverse)
library(patchwork)  # install.packages("patchwork")

## 19.1 count()

df <- tibble(
name = c("Alice", "Alice", "Bob", "Bob", "Carol", "Carol"),
type = c("english", "math", "english", "math", "english", "math"),
score = c(60.2, 90.5, 92.2, 98.8, 82.5, 74.6)
)

df
## # A tibble: 6 x 3
##   name  type    score
##   <chr> <chr>   <dbl>
## 1 Alice english  60.2
## 2 Alice math     90.5
## 3 Bob   english  92.2
## 4 Bob   math     98.8
## 5 Carol english  82.5
## 6 Carol math     74.6
df %>% count(name)
## # A tibble: 3 x 2
##   name      n
##   <chr> <int>
## 1 Alice     2
## 2 Bob       2
## 3 Carol     2

df %>%
group_by(name) %>%
summarise( n = n())
## # A tibble: 3 x 2
##   name      n
##   <chr> <int>
## 1 Alice     2
## 2 Bob       2
## 3 Carol     2

count() 还有更多强大的参数， 比如

df %>% count(name,
sort = TRUE,
wt = score,
name = "total_score"
)
## # A tibble: 3 x 2
##   name  total_score
##   <chr>       <dbl>
## 1 Bob          191
## 2 Carol        157.
## 3 Alice        151.

df %>%
group_by(name) %>%
summarise(
n = n(),
total_score = sum(score, na.rm = TRUE)
) %>%
arrange(desc(total_score))
## # A tibble: 3 x 3
##   name      n total_score
##   <chr> <int>       <dbl>
## 1 Bob       2        191
## 2 Carol     2        157.
## 3 Alice     2        151.

## 19.2 在 count() 中创建新变量

df %>% count(range = 10 * (score %/% 10))
## # A tibble: 4 x 2
##   range     n
##   <dbl> <int>
## 1    60     1
## 2    70     1
## 3    80     1
## 4    90     3

df %>%
group_by(name) %>%
mutate(n = n()) %>%
ungroup()
## # A tibble: 6 x 4
##   name  type    score     n
##   <chr> <chr>   <dbl> <int>
## 1 Alice english  60.2     2
## 2 Alice math     90.5     2
## 3 Bob   english  92.2     2
## 4 Bob   math     98.8     2
## 5 Carol english  82.5     2
## 6 Carol math     74.6     2

df %>% add_count(name)
## # A tibble: 6 x 4
##   name  type    score     n
##   <chr> <chr>   <dbl> <int>
## 1 Alice english  60.2     2
## 2 Alice math     90.5     2
## 3 Bob   english  92.2     2
## 4 Bob   math     98.8     2
## 5 Carol english  82.5     2
## 6 Carol math     74.6     2

## 19.4 nth(), first(), last()

v <- c("a", "c", "d", "k")
v[1]
## [1] "a"
v[length(v)]
## [1] "k"
c("a", "c", "d", "k") %>% nth(3)
## [1] "d"
c("a", "c", "d", "k") %>% first()
## [1] "a"
c("a", "c", "d", "k") %>% last()
## [1] "k"

df %>%
filter(score == first(score))
## # A tibble: 1 x 3
##   name  type    score
##   <chr> <chr>   <dbl>
## 1 Alice english  60.2
df %>%
group_by(name) %>%
filter(score == last(score))
## # A tibble: 3 x 3
## # Groups:   name [3]
##   name  type  score
##   <chr> <chr> <dbl>
## 1 Alice math   90.5
## 2 Bob   math   98.8
## 3 Carol math   74.6

## 19.5 列变量重新排序

df %>%
select(score, everything())
## # A tibble: 6 x 3
##   score name  type
##   <dbl> <chr> <chr>
## 1  60.2 Alice english
## 2  90.5 Alice math
## 3  92.2 Bob   english
## 4  98.8 Bob   math
## 5  82.5 Carol english
## 6  74.6 Carol math

## 19.6 if_else

df %>% mutate(
assess = if_else(score > 85, "very_good", "good")
)
## # A tibble: 6 x 4
##   name  type    score assess
##   <chr> <chr>   <dbl> <chr>
## 1 Alice english  60.2 good
## 2 Alice math     90.5 very_good
## 3 Bob   english  92.2 very_good
## 4 Bob   math     98.8 very_good
## 5 Carol english  82.5 good
## 6 Carol math     74.6 good

## 19.7 case_when

df %>% mutate(
assess = case_when(
score < 70 ~ "general",
score >= 70 & score < 80 ~ "good",
score >= 80 & score < 90 ~ "very_good",
score >= 90 ~ "best",
TRUE ~ "other"
)
)
## # A tibble: 6 x 4
##   name  type    score assess
##   <chr> <chr>   <dbl> <chr>
## 1 Alice english  60.2 general
## 2 Alice math     90.5 best
## 3 Bob   english  92.2 best
## 4 Bob   math     98.8 best
## 5 Carol english  82.5 very_good
## 6 Carol math     74.6 good

## 19.8 找出前几名

df %>%
top_n(2, score)
## # A tibble: 2 x 3
##   name  type    score
##   <chr> <chr>   <dbl>
## 1 Bob   english  92.2
## 2 Bob   math     98.8

## 19.9 去除多余的空白

library(stringr)

str_trim(" excess    whitespace in a string be gone!")
## [1] "excess    whitespace in a string be gone!"
# Use str_squish() to remove any leading, trailing, or excess whitespace
str_squish(" excess    whitespace in a string be gone!")
## [1] "excess whitespace in a string be gone!"

## 19.10 取反操作

3:10 %in% c(1:5)
## [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

# 自定义一个不属于操作符
%nin% <- Negate(%in%)
3:10 %nin% c(1:5)
## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
# 使用purrr::negate()自定义反向操作符
%nin% <- purrr::negate(%in%)
3:10 %nin% c(1:5)
## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE

## 19.11 drop_na()

dt <- tribble(
~x, ~y,
1, NA,
2, NA,
NA, -3,
NA, -4,
5, -5
)

dt
## # A tibble: 5 x 2
##       x     y
##   <dbl> <dbl>
## 1     1    NA
## 2     2    NA
## 3    NA    -3
## 4    NA    -4
## 5     5    -5
dt %>% drop_na()
## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     5    -5
# dt %>% drop_na(x)

## 19.12 replace_na()

dt <- tribble(
~x, ~y,
1, NA,
2, NA,
NA, -3,
NA, -4,
5, -5
)

dt %>% mutate(x = replace_na(x, 0))
## # A tibble: 5 x 2
##       x     y
##   <dbl> <dbl>
## 1     1    NA
## 2     2    NA
## 3     0    -3
## 4     0    -4
## 5     5    -5
dt %>% mutate(
x = replace_na(x, mean(x, na.rm = TRUE))
)
## # A tibble: 5 x 2
##       x     y
##   <dbl> <dbl>
## 1  1       NA
## 2  2       NA
## 3  2.67    -3
## 4  2.67    -4
## 5  5       -5

## 19.13 coalesce

dt <- tribble(
~x, ~y,
1, NA,
2, NA,
NA, -3,
NA, -4,
5, -5
)

dt %>% mutate(
z = coalesce(x, 0)
# z = coalesce(x, y)
)
## # A tibble: 5 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1    NA     1
## 2     2    NA     2
## 3    NA    -3     0
## 4    NA    -4     0
## 5     5    -5     5

dt <- tribble(
~name, ~age,
"a", 1,
"b", 2,
"c", NA,
"d", 2
)

dt %>%
mutate(
age_adj = ifelse(is.na(age), mean(age, na.rm = TRUE), age)
)
## # A tibble: 4 x 3
##   <chr> <dbl>   <dbl>
## 1 a         1    1
## 2 b         2    2
## 3 c        NA    1.67
## 4 d         2    2

## 19.14 summarise() 生成 list-column

summarize()会生成一个value，

library(gapminder)
gapminder %>%
group_by(continent) %>%
summarise(
avg_gdpPercap = mean(gdpPercap)
)
## # A tibble: 5 x 2
##   continent avg_gdpPercap
##   <fct>             <dbl>
## 1 Africa            2194.
## 2 Americas          7136.
## 3 Asia              7902.
## 4 Europe           14469.
## 5 Oceania          18622.

summarize()也可以生成一个list，

library(gapminder)
gapminder %>%
group_by(continent) %>%
summarise(test = list(t.test(gdpPercap))) %>% # 单样本的t检验

mutate(tidied = purrr::map(test, broom::tidy)) %>%
unnest(tidied) %>%
ggplot(aes(estimate, continent)) +
geom_point() +
geom_errorbarh(aes(
xmin = conf.low,
xmax = conf.high
))

gapminder %>%
group_by(continent) %>%
summarise(test = list(lm(lifeExp ~ gdpPercap))) %>% # 线性回归

mutate(tidied = purrr::map(test, broom::tidy, conf.int = TRUE)) %>%
unnest(tidied) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(estimate, continent)) +
geom_point() +
geom_errorbarh(aes(
xmin = conf.low,
xmax = conf.high,
height = .3
))

gapminder %>%
group_nest(continent) %>%
mutate(test = map(data, ~ t.test(.x$gdpPercap))) %>% mutate(tidied = map(test, broom::tidy)) %>% unnest(tidied) gapminder %>% group_by(continent) %>% group_modify( ~ broom::tidy(t.test(.x$gdpPercap))
)

## 19.15 count() + fct_reorder() + geom_col() + coord_flip()

gapminder %>%
distinct(continent, country) %>%
count(continent) %>%
ggplot(aes(x = continent, y = n)) +
geom_col()

gapminder %>%
distinct(continent, country) %>%
count(continent) %>%
ggplot(aes(x = fct_reorder(continent, n), y = n)) +
geom_col() +
coord_flip()

gapminder %>%
distinct(continent, country) %>%
count(continent) %>%
mutate(coll = if_else(continent == "Asia", "red", "gray")) %>%

ggplot(aes(x = fct_reorder(continent, n), y = n)) +
geom_text(aes(label = n), hjust = -0.25) +
geom_col(width = 0.8, aes(fill = coll) ) +
coord_flip() +
theme_classic() +
scale_fill_manual(values = c("#b3b3b3a0", "#D55E00")) +
theme(legend.position = "none",
axis.text = element_text(size = 11)
) +
labs(title = "我的标题", x = "")

gapminder %>%
distinct(continent, country) %>%
count(continent) %>%

ggplot(aes(x = fct_reorder(continent, n), y = n)) +
geom_text(aes(label = n), hjust = -0.25) +
geom_col(width = 0.8, aes(fill = continent == "Asia") ) +
coord_flip() +
theme_classic() +
scale_fill_manual(values = c("#b3b3b3a0", "#D55E00")) +
annotate("text", x = 3.8, y = 48, label = "this is important\ncase",
color = "#D55E00", size = 5) +
annotate(
geom = "curve", x = 4.1, y = 48, xend = 4.1, yend = 35,
curvature = .3, arrow = arrow(length = unit(2, "mm"))
) +
theme(legend.position = "none",
axis.text = element_text(size = 11)
) +
labs(title = "我的标题", x = "")

## 19.16 scale_x/y_log10

• 各国人均GDP
• 各国人口
• 不同人士的收入
• 公司的营业额
gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point()

gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point() +
scale_x_log10() # A better way to log transform

## 19.17 fct_lump

tb <- tibble::tribble(
~disease,  ~n,
"鼻塞", 112,
"流涕", 130,
"发热",  89,
"腹泻",   5,
"呕吐",  12,
"咳嗽", 102,
"咽痛",  98,
"乏力",  15,
"腹痛",   2,
"妄想",   3,
"幻听",   6,
"失眠",   1,
"贫血",   8,
"多动",   2,
"胸痛",   4,
"胸闷",   5
)
p1 <- tb %>%
uncount(n) %>%

ggplot(aes(x = disease, fill = disease)) +
geom_bar() +
coord_flip() +
theme(legend.position = "none")

p2 <- tb %>%
uncount(n) %>%
mutate(
disease = forcats::fct_lump(disease, 5),
disease = forcats::fct_reorder(disease, .x = disease, .fun = length)
) %>%
ggplot(aes(x = disease, fill = disease)) +
geom_bar() +
coord_flip() +
theme(legend.position = "none")
p1 + p2

## 19.18 fct_reoder2

dat_wide <- tibble(
x = 1:3,
top = c(4.5, 4, 5.5),
middle = c(4, 4.75, 5),
bottom = c(3.5, 3.75, 4.5)
)

dat_wide %>%
pivot_longer(
cols = c(top, middle, bottom),
names_to = "region",
values_to = "awfulness")
## # A tibble: 9 x 3
##       x region awfulness
##   <int> <chr>      <dbl>
## 1     1 top         4.5
## 2     1 middle      4
## 3     1 bottom      3.5
## 4     2 top         4
## 5     2 middle      4.75
## 6     2 bottom      3.75
## 7     3 top         5.5
## 8     3 middle      5
## 9     3 bottom      4.5
dat <- dat_wide %>%
pivot_longer(
cols = c(top, middle, bottom),
names_to = "region",
values_to = "awfulness") %>%
mutate(
region_ABCD = factor(region),
region_sane = fct_reorder2(region, x, awfulness)
)

p_ABCD <- ggplot(dat, aes(x, awfulness, colour = region_ABCD)) +
geom_line() + theme(legend.justification = c(1, 0.85))

p_sane <- ggplot(dat, aes(x, awfulness, colour = region_sane)) +
geom_line() + theme(legend.justification = c(1, 0.85))

p_ABCD + p_sane +
plot_annotation(
title = 'Make the legend order = data order, with forcats::fct_reorder2()')

## 19.19 unite

dfa <- tribble(
~school, ~class,
"chuansi", "01",
"chuansi", "02",
"shude", "07",
"shude", "08",
"huapulu", "101",
"huapulu", "103"
)

dfa
## # A tibble: 6 x 2
##   school  class
##   <chr>   <chr>
## 1 chuansi 01
## 2 chuansi 02
## 3 shude   07
## 4 shude   08
## 5 huapulu 101
## 6 huapulu 103
df_united <- dfa %>%
tidyr::unite(school, class, col = "school_plus_class", sep = "_", remove = FALSE)

df_united
## # A tibble: 6 x 3
##   school_plus_class school  class
##   <chr>             <chr>   <chr>
## 1 chuansi_01        chuansi 01
## 2 chuansi_02        chuansi 02
## 3 shude_07          shude   07
## 4 shude_08          shude   08
## 5 huapulu_101       huapulu 101
## 6 huapulu_103       huapulu 103

dfa %>% mutate(newcol = str_c(school, "_", class))
## # A tibble: 6 x 3
##   school  class newcol
##   <chr>   <chr> <chr>
## 1 chuansi 01    chuansi_01
## 2 chuansi 02    chuansi_02
## 3 shude   07    shude_07
## 4 shude   08    shude_08
## 5 huapulu 101   huapulu_101
## 6 huapulu 103   huapulu_103

## 19.20 separate()

df_united %>%
tidyr::separate(school_plus_class, into = c("sch", "cls"), sep = "_", remove = F)

df_united %>%
mutate(sch = str_split(school_plus_class, "_") %>% map_chr(1)) %>%
mutate(cls = str_split(school_plus_class, "_") %>% map_chr(2)) 
## # A tibble: 6 x 5
##   school_plus_class school  class sch     cls
##   <chr>             <chr>   <chr> <chr>   <chr>
## 1 chuansi_01        chuansi 01    chuansi 01
## 2 chuansi_02        chuansi 02    chuansi 02
## 3 shude_07          shude   07    shude   07
## 4 shude_08          shude   08    shude   08
## 5 huapulu_101       huapulu 101   huapulu 101
## 6 huapulu_103       huapulu 103   huapulu 103

dfb <- tribble(
~school_class,
"chuansi_01",
"chuansi_02_03",
"shude_07_0",
"shude_08_0",
"huapulu_101_u",
"huapulu_103__p"
)
dfb
## # A tibble: 6 x 1
##   school_class
##   <chr>
## 1 chuansi_01
## 2 chuansi_02_03
## 3 shude_07_0
## 4 shude_08_0
## 5 huapulu_101_u
## 6 huapulu_103__p
dfb %>% tidyr::separate(school_class,
into = c("sch", "cls"),
sep = "_",
extra = "drop",
remove = F)

## 19.21 extract()

dfc <- tibble(x = c("1-12week", "1-10wk", "5-12w", "01-05weeks"))
dfc
## # A tibble: 4 x 1
##   x
##   <chr>
## 1 1-12week
## 2 1-10wk
## 3 5-12w
## 4 01-05weeks
dfc %>% tidyr::extract(
x,
c("start", "end", "letter"), "(\\d+)-(\\d+)([a-z]+)",
remove = FALSE
)
## # A tibble: 4 x 4
##   x          start end   letter
##   <chr>      <chr> <chr> <chr>
## 1 1-12week   1     12    week
## 2 1-10wk     1     10    wk
## 3 5-12w      5     12    w
## 4 01-05weeks 01    05    weeks

## 19.22 crossing()

tidyr::crossing(x = c("F", "M"), y = c("a", "b"), z = c(1:2))
## # A tibble: 8 x 3
##   x     y         z
##   <chr> <chr> <int>
## 1 F     a         1
## 2 F     a         2
## 3 F     b         1
## 4 F     b         2
## 5 M     a         1
## 6 M     a         2
## 7 M     b         1
## 8 M     b         2

tidyr::crossing(trials = 1:10, m = 1:5) %>%
group_by(trials) %>%
mutate(
guess = sample.int(5, n()),
result = m == guess
) %>%
summarise(score = sum(result) / n())
## # A tibble: 10 x 2
##    trials score
##     <int> <dbl>
##  1      1   0.6
##  2      2   0.4
##  3      3   0.4
##  4      4   0.2
##  5      5   0
##  6      6   0
##  7      7   0.2
##  8      8   0.6
##  9      9   0
## 10     10   0.2

sim <- tribble(
~f, ~params,
"rbinom", list(size = 1, prob = 0.5, n = 10)
)
sim %>%
mutate(sim = invoke_map(f, params))
## # A tibble: 1 x 3
##   f      params           sim
##   <chr>  <list>           <list>
## 1 rbinom <named list [3]> <int [10]>
rep_sim <- sim %>%
crossing(rep = 1:1e5) %>%
mutate(sim = invoke_map(f, params)) %>%
unnest(sim) %>%
group_by(rep) %>%
summarise(mean_sim = mean(sim))

head(rep_sim)
## # A tibble: 6 x 2
##     rep mean_sim
##   <int>    <dbl>
## 1     1      0.3
## 2     2      0.5
## 3     3      0.4
## 4     4      0.5
## 5     5      0.7
## 6     6      0.4
rep_sim %>%
ggplot(aes(x = mean_sim)) +
geom_histogram(binwidth = 0.05,  fill = "skyblue") +
theme_classic()

sim <- tribble(
~n_tosses, ~f, ~params,
10, "rbinom", list(size = 1, prob = 0.5, n = 15),
30, "rbinom", list(size = 1, prob = 0.5, n = 30),
100, "rbinom", list(size = 1, prob = 0.5, n = 100),
1000, "rbinom", list(size = 1, prob = 0.5, n = 1000),
10000, "rbinom", list(size = 1, prob = 0.5, n = 1e4)
)
sim_rep <- sim %>%
crossing(replication = 1:50) %>%
mutate(sims = invoke_map(f, params)) %>%
unnest(sims) %>%
group_by(replication, n_tosses) %>%
summarise(avg = mean(sims))
sim_rep %>%
ggplot(aes(x = factor(n_tosses), y = avg)) +
ggbeeswarm::geom_quasirandom(color = "lightgrey") +
scale_y_continuous(limits = c(0, 1)) +
geom_hline(
yintercept = 0.5,
color = "skyblue", lty = 1, size = 1, alpha = 3 / 4
) +
ggthemes::theme_pander() +
labs(
title = "50 Replicates Of Mean 'Heads' As Number Of Tosses Increase",
y = "mean",
x = "Number Of Tosses"
)