# 第 15 章 数据规整1

library(tidyverse)

## 15.1 提问

# Wide-format example data: one row per day, one column per plant (A-D),
# each cell holding that plant's height on that day.
plant_height <- data.frame(
  Day = seq_len(5),
  A   = c(0.7, 1.0, 1.5, 1.8, 2.2),
  B   = c(0.5, 0.7, 0.9, 1.3, 1.8),
  C   = c(0.3, 0.6, 1.0, 1.2, 2.2),
  D   = c(0.4, 0.7, 1.2, 1.5, 3.2)
)

# Show the wide table
plant_height
##   Day   A   B   C   D
## 1   1 0.7 0.5 0.3 0.4
## 2   2 1.0 0.7 0.6 0.7
## 3   3 1.5 0.9 1.0 1.2
## 4   4 1.8 1.3 1.2 1.5
## 5   5 2.2 1.8 2.2 3.2

• 把植物高度大于或等于0.8cm的时刻筛选出来，怎么写语句?
• 用不同的颜色画出四种植物生长曲线，怎么写语句?

# Exercise template 1 (not runnable as written): `___` marks the blank where a
# single height column would go; the wide table only has columns A, B, C, D.
plant_height %>%
filter( ___ >= 0.8)
# Exercise template 2 (not runnable as written): the blanks stand for the
# y variable and the color-grouping variable of a per-plant line chart.
plant_height %>%
ggplot(aes(x = Day, y = ___, color = ___)) +
geom_line()

# Legacy tidyr::gather() variants (gather() is superseded by pivot_longer()).
# Each call stacks the selected columns into a key column `variable` and a
# value column `value`, and overwrites `melted`.

# Select the columns to stack by position (columns 2 and 3, i.e. A and B)
melted <- plant_height %>%
  gather(key = "variable", value = "value", 2:3)

# Same selection, written with column names
melted <- plant_height %>%
  gather(key = "variable", value = "value", A, B)

# Stack every column except the first one (selected by position)
melted <- plant_height %>%
  gather(key = "variable", value = "value", -1)

# Stack every column except the one named Day
melted <- plant_height %>%
  gather(key = "variable", value = "value", -Day)

• gather()/pivot_longer 宽表格变成长表格
• spread()/pivot_wider 长表格变成宽表格

## 15.2 宽表格变成长表格

# Reshape the wide table into long format: columns A to D are stacked so that
# their names go into `plant` and their values into `height`, giving one row
# per (Day, plant) pair.
long <- plant_height %>%
  pivot_longer(
    cols = A:D,
    names_to = "plant",
    values_to = "height"
  )

# Show the long table
long

## # A tibble: 20 × 3
##     Day plant height
##   <int> <chr>  <dbl>
## 1     1 A        0.7
## 2     1 B        0.5
## 3     1 C        0.3
## 4     1 D        0.4
## 5     2 A        1
## 6     2 B        0.7
## # … with 14 more rows


• 参数cols，表示哪些列需要转换.
• 参数names_to，表示cols选取的这些列的名字，构成了新的一列，这里需要取一个名字.
• 参数values_to，表示cols选取的这些列的值，构成了新的一列，这里也需要取一个名字.
• 数据框总的信息量不会丢失

# Same wide-to-long reshape, this time choosing the columns by exclusion:
# everything except Day is stacked.
pivot_longer(
  plant_height,
  cols = -Day, # equivalent: A:D, c(A, B, C, D) or c("A", "B", "C", "D")
  names_to = "plant",
  values_to = "height"
)

# Line chart from the long table: height against Day, one colored line per
# plant.
ggplot(data = long, mapping = aes(x = Day, y = height, color = plant)) +
  geom_line()

## 15.3 长表格变成宽表格

# Reverse the reshape: spread the values of `plant` back out into column
# names and fill those columns from `height`.
wide <- pivot_wider(
  long,
  names_from = plant,
  values_from = height
)

# Show the wide table
wide
## # A tibble: 5 × 5
##     Day     A     B     C     D
##   <int> <dbl> <dbl> <dbl> <dbl>
## 1     1   0.7   0.5   0.3   0.4
## 2     2   1     0.7   0.6   0.7
## 3     3   1.5   0.9   1     1.2
## 4     4   1.8   1.3   1.2   1.5
## 5     5   2.2   1.8   2.2   3.2

## 15.4 列名转换成多个变量

# Example data whose column names encode two variables at once, in the form
# <species>_<parameter> (species A/B/C; parameter height/width/depth).
plant_record <- data.frame(
  day      = 1:5,
  A_height = c(1.1, 1.2, 1.3, 1.4, 1.5),
  A_width  = c(2.1, 2.2, 2.3, 2.4, 2.5),
  A_depth  = c(3.1, 3.2, 3.3, 3.4, 3.5),
  B_height = c(4.1, 4.2, 4.3, 4.4, 4.5),
  B_width  = c(5.1, 5.2, 5.3, 5.4, 5.5),
  B_depth  = c(6.1, 6.2, 6.3, 6.4, 6.5),
  C_height = c(7.1, 7.2, 7.3, 7.4, 7.5),
  C_width  = c(8.1, 8.2, 8.3, 8.4, 8.5),
  C_depth  = c(9.1, 9.2, 9.3, 9.4, 9.5)
)
# Render the data frame as a formatted table
knitr::kable(plant_record)
day A_height A_width A_depth B_height B_width B_depth C_height C_width C_depth
1 1.1 2.1 3.1 4.1 5.1 6.1 7.1 8.1 9.1
2 1.2 2.2 3.2 4.2 5.2 6.2 7.2 8.2 9.2
3 1.3 2.3 3.3 4.3 5.3 6.3 7.3 8.3 9.3
4 1.4 2.4 3.4 4.4 5.4 6.4 7.4 8.4 9.4
5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5

使用 pivot_longer() 函数，通过 names_to 和 names_pattern 把列名拆分成多个变量：

# Split each column name at the underscore: the first regex group becomes
# `species`, the second becomes `parameter`, and all cell values are stacked
# into a single `value` column.
tidyr::pivot_longer(
  plant_record,
  cols = !day,
  names_to = c("species", "parameter"),
  names_pattern = "(.*)_(.*)",
  values_to = "value"
)
## # A tibble: 45 × 4
##     day species parameter value
##   <int> <chr>   <chr>     <dbl>
## 1     1 A       height      1.1
## 2     1 A       width       2.1
## 3     1 A       depth       3.1
## 4     1 B       height      4.1
## 5     1 B       width       5.1
## 6     1 B       depth       6.1
## # … with 39 more rows

## 15.5 复杂的情形

# Use the ".value" sentinel in names_to: the second regex group
# (height/width/depth) supplies the names of the output value columns instead
# of being stored in a column of its own, so the result has one row per
# (day, species).
plant_record_longer <- tidyr::pivot_longer(
  plant_record,
  cols = !day,
  names_to = c("species", ".value"),
  names_pattern = "(.*)_(.*)"
)

# Show the reshaped table
plant_record_longer
## # A tibble: 15 × 5
##     day species height width depth
##   <int> <chr>    <dbl> <dbl> <dbl>
## 1     1 A          1.1   2.1   3.1
## 2     1 B          4.1   5.1   6.1
## 3     1 C          7.1   8.1   9.1
## 4     2 A          1.2   2.2   3.2
## 5     2 B          4.2   5.2   6.2
## 6     2 C          7.2   8.2   9.2
## # … with 9 more rows

# Reference example with two value columns: names_glue builds each output
# column name as "<variable>_<value column>".
pivot_wider(
  us_rent_income,
  names_from = variable,
  values_from = c(estimate, moe),
  names_glue = "{variable}_{.value}"
)
# Widen the long table back out: one output column per combination of species
# and measurement, named "<species>_<measurement>" through names_glue.
tidyr::pivot_wider(
  plant_record_longer,
  names_from = species,
  values_from = c(height, width, depth),
  names_glue = "{species}_{.value}"
)
## # A tibble: 5 × 10
##     day A_height B_height C_height A_width B_width
##   <int>    <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
## 1     1      1.1      4.1      7.1     2.1     5.1
## 2     2      1.2      4.2      7.2     2.2     5.2
## 3     3      1.3      4.3      7.3     2.3     5.3
## 4     4      1.4      4.4      7.4     2.4     5.4
## 5     5      1.5      4.5      7.5     2.5     5.5
## # … with 4 more variables: C_width <dbl>,
## #   A_depth <dbl>, B_depth <dbl>, C_depth <dbl>

• 注意 .value 而不是value，说明这里不是单个列名，而是匹配得到的多个值做列名

## 15.6 tidy data原则

Hadley Wickham 提出了数据科学的 tidy 原则，我结合自己的理解，tidy 思想体现在:

• 一切都是数据框，任何数据都可以规整
• 数据框的一列代表一个变量，数据框的一行代表一次观察
• 函数处理数据时，数据框进数据框出（函数的第一个参数始终为数据框）

# Print the long-format table again as the tidy-data example: one variable per
# column (Day, plant, height), one observation per row.
long
## # A tibble: 20 × 3
##     Day plant height
##   <int> <chr>  <dbl>
## 1     1 A        0.7
## 2     1 B        0.5
## 3     1 C        0.3
## 4     1 D        0.4
## 5     2 A        1
## 6     2 B        0.7
## # … with 14 more rows