3.2 长宽转换
我们在使用 excel 的数据透视表时,即将“长数据” 转化成“宽数据,”这就是数据的长宽转化。有关pivot详情,请查看vignette("pivot")。
接下来我们用tidyr包自带的插图案例查看相关函数用法。
3.2.1 宽转长
有时候方便我们肉眼观察,可能一个数据集会有很多列, 方便观察,但是不方便统计分析,这是我们需要把数据做处理,从“宽数据变成长数据”即宽转长。如下案例:
| 分公司 | 商品A | 商品B | 商品C | 商品D | 商品E | 商品F | 商品G | 商品H | 商品I |
|---|---|---|---|---|---|---|---|---|---|
| 北京 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| 深圳 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| 上海 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| 广州 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| 成都 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| 重庆 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| 长沙 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
将上面的数据集从“宽数据集”转化成“长数据集”。
library(tidyverse,warn.conflicts = FALSE)
library(tidyr)准备数据集 dt
dt <- tibble::tribble(
~分公司, ~商品A, ~商品B, ~商品C, ~商品D, ~商品E, ~商品F, ~商品G, ~商品H, ~商品I,
"北京", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"深圳", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"上海", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"广州", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"成都", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"重庆", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"长沙", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L
)经过转化,数据由 10 列变成 3 列;
dt %>%
pivot_longer(cols = 商品A:商品I,names_to = '商品名称',values_to = '销量') %>%
head()
#> # A tibble: 6 x 3
#> 分公司 商品名称 销量
#> <chr> <chr> <int>
#> 1 北京 商品A 1
#> 2 北京 商品B 2
#> 3 北京 商品C 3
#> 4 北京 商品D 4
#> 5 北京 商品E 5
#> 6 北京 商品F 63.2.2 用法
pivot_longer(
data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = list(),
names_transform = list(),
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = list(),
values_transform = list(),
...
)data: 数据集cols: 需要需要重塑,在上例中除了分公司的其他全部列names_to: 新增的列名,即重塑列的新列名,如上列的商品名称values_to: 新增存储数据新列名,如上列的销量names_prefix: 一个正则表达式用来删除列名中匹配到的字符values_drop_na: 是否删除空值行,默认为FALSE
其余参数的使用比较复杂,对于初学而言可以不掌握,当有更高阶需求的时候自行查函数手册了解用法即可。接下来了解几种常见的使用场景。
3.2.2.1 列名带数字
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
)
#> # A tibble: 5,307 x 5
#> artist track date.entered week rank
#> <chr> <chr> <date> <chr> <dbl>
#> 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
#> 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
#> 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
#> 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
#> 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
#> 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
#> # ... with 5,301 more rowsnames_prefix 调整内容前缀,配合names_transform参数使用
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
names_prefix = "wk",
names_transform = list(week = as.integer),
values_to = "rank",
values_drop_na = TRUE,
)
#> # A tibble: 5,307 x 5
#> artist track date.entered week rank
#> <chr> <chr> <date> <int> <dbl>
#> 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87
#> 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 2 82
#> 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 3 72
#> 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 4 77
#> 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 5 87
#> 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 6 94
#> # ... with 5,301 more rows经过以上转换week列属性变成了整数,当然达到以上效果有其他的途径,如下:
library(tidyverse,warn.conflicts = TRUE)
# method 1
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
names_transform = list(week = readr::parse_number),
values_to = "rank",
values_drop_na = TRUE,
)
# method 2
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE,
) %>%
mutate(week = str_remove(week, "wk") %>% as.integer())3.2.2.2 多变量列名
该案列设计比较复杂的正则表达式,new_?(.*)_(.)(.*)需要一定正则表达式基础。
new_?表示匹配new或new_,(.*)匹配任意0次或多次任意字符。
who %>% pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
values_to = "count"
)
#> # A tibble: 405,440 x 8
#> country iso2 iso3 year diagnosis gender age count
#> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan AF AFG 1980 sp m 014 NA
#> 2 Afghanistan AF AFG 1980 sp m 1524 NA
#> 3 Afghanistan AF AFG 1980 sp m 2534 NA
#> 4 Afghanistan AF AFG 1980 sp m 3544 NA
#> 5 Afghanistan AF AFG 1980 sp m 4554 NA
#> 6 Afghanistan AF AFG 1980 sp m 5564 NA
#> # ... with 405,434 more rows进一步处理列gender,age 。
who %>% pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
names_transform = list(
gender = ~ readr::parse_factor(.x, levels = c("f", "m")),
age = ~ readr::parse_factor(
.x,
levels = c("014", "1524", "2534", "3544", "4554", "5564", "65"),
ordered = TRUE
)
),
values_to = "count",
)
#> # A tibble: 405,440 x 8
#> country iso2 iso3 year diagnosis gender age count
#> <chr> <chr> <chr> <int> <chr> <fct> <ord> <int>
#> 1 Afghanistan AF AFG 1980 sp m 014 NA
#> 2 Afghanistan AF AFG 1980 sp m 1524 NA
#> 3 Afghanistan AF AFG 1980 sp m 2534 NA
#> 4 Afghanistan AF AFG 1980 sp m 3544 NA
#> 5 Afghanistan AF AFG 1980 sp m 4554 NA
#> 6 Afghanistan AF AFG 1980 sp m 5564 NA
#> # ... with 405,434 more rows3.2.2.3 一行多观测值
family <- tribble(
~family, ~dob_child1, ~dob_child2, ~gender_child1, ~gender_child2,
1L, "1998-11-26", "2000-01-29", 1L, 2L,
2L, "1996-06-22", NA, 2L, NA,
3L, "2002-07-11", "2004-04-05", 2L, 2L,
4L, "2004-10-10", "2009-08-27", 1L, 1L,
5L, "2000-12-05", "2005-02-28", 2L, 1L,
)
family <- family %>% mutate_at(vars(starts_with("dob")), parse_date)
family
#> # A tibble: 5 x 5
#> family dob_child1 dob_child2 gender_child1 gender_child2
#> <int> <date> <date> <int> <int>
#> 1 1 1998-11-26 2000-01-29 1 2
#> 2 2 1996-06-22 NA 2 NA
#> 3 3 2002-07-11 2004-04-05 2 2
#> 4 4 2004-10-10 2009-08-27 1 1
#> 5 5 2000-12-05 2005-02-28 2 1
family %>%
pivot_longer(
!family,
names_to = c(".value", "child"),
names_sep = "_",
values_drop_na = TRUE
)
#> # A tibble: 9 x 4
#> family child dob gender
#> <int> <chr> <date> <int>
#> 1 1 child1 1998-11-26 1
#> 2 1 child2 2000-01-29 2
#> 3 2 child1 1996-06-22 2
#> 4 3 child1 2002-07-11 2
#> 5 3 child2 2004-04-05 2
#> 6 4 child1 2004-10-10 1
#> # ... with 3 more rowsanscombe %>%
pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)"
) %>%
arrange(set)
#> # A tibble: 44 x 3
#> set x y
#> <chr> <dbl> <dbl>
#> 1 1 10 8.04
#> 2 1 8 6.95
#> 3 1 13 7.58
#> 4 1 9 8.81
#> 5 1 11 8.33
#> 6 1 14 9.96
#> # ... with 38 more rowspnl <- tibble(
x = 1:4,
a = c(1, 1,0, 0),
b = c(0, 1, 1, 1),
y1 = rnorm(4),
y2 = rnorm(4),
z1 = rep(3, 4),
z2 = rep(-2, 4),
)
pnl %>%
pivot_longer(
!c(x, a, b),
names_to = c(".value", "time"),
names_pattern = "(.)(.)"
)
#> # A tibble: 8 x 6
#> x a b time y z
#> <int> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 1 1 0 1 -1.40 3
#> 2 1 1 0 2 0.622 -2
#> 3 2 1 1 1 0.255 3
#> 4 2 1 1 2 1.15 -2
#> 5 3 0 1 1 -2.44 3
#> 6 3 0 1 2 -1.82 -2
#> # ... with 2 more rows3.2.2.4 重复列名
df <- tibble(id = 1:3, y = 4:6, y = 5:7, y = 7:9, .name_repair = "minimal")
df %>% pivot_longer(!id, names_to = "name", values_to = "value")
#> # A tibble: 9 x 3
#> id name value
#> <int> <chr> <int>
#> 1 1 y 4
#> 2 1 y 5
#> 3 1 y 7
#> 4 2 y 5
#> 5 2 y 6
#> 6 2 y 8
#> # ... with 3 more rows3.2.3 长转宽
pivot_wider()功能与pivot_longer()相反。通过增加列数减少行数使数据集变得更宽,通常我们在汇总时候使用,达到类似Excel透视表结果。
3.2.4 用法
fish_encounters %>% pivot_wider(names_from = station, values_from = seen)
#> # A tibble: 19 x 12
#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 4842 1 1 1 1 1 1 1 1 1 1 1
#> 2 4843 1 1 1 1 1 1 1 1 1 1 1
#> 3 4844 1 1 1 1 1 1 1 1 1 1 1
#> 4 4845 1 1 1 1 1 NA NA NA NA NA NA
#> 5 4847 1 1 1 NA NA NA NA NA NA NA NA
#> 6 4848 1 1 1 1 NA NA NA NA NA NA NA
#> # ... with 13 more rows3.2.4.1 缺失值填充
使用values_fill 参数填充缺失值。如下使用 0 填充缺失值:
fish_encounters %>% pivot_wider(
names_from = station,
values_from = seen,
values_fill = 0
)
#> # A tibble: 19 x 12
#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 4842 1 1 1 1 1 1 1 1 1 1 1
#> 2 4843 1 1 1 1 1 1 1 1 1 1 1
#> 3 4844 1 1 1 1 1 1 1 1 1 1 1
#> 4 4845 1 1 1 1 1 0 0 0 0 0 0
#> 5 4847 1 1 1 0 0 0 0 0 0 0 0
#> 6 4848 1 1 1 1 0 0 0 0 0 0 0
#> # ... with 13 more rows3.2.4.2 聚合
当将数据框长转宽时,通过指定values_fn参数,指定聚合方式,如下所示:
warpbreaks <- warpbreaks %>% as_tibble()
warpbreaks %>%
pivot_wider(
names_from = wool,
values_from = breaks,
values_fn = mean
)
#> # A tibble: 3 x 3
#> tension A B
#> <fct> <dbl> <dbl>
#> 1 L 44.6 28.2
#> 2 M 24 28.8
#> 3 H 24.6 18.8多个聚合方式
df <- tibble::tribble(
~name, ~TYPE, ~num, ~NUM,
"a", 1L, 1L, 9L,
"a", 1L, 2L, 10L,
"a", 2L, 3L, 11L,
"a", 2L, 4L, 12L,
"b", 1L, 5L, 13L,
"b", 2L, 6L, 14L,
"b", 2L, 7L, 15L,
"b", 2L, 8L, 16L
)
df %>% pivot_wider(names_from = c(TYPE),values_from = c(num,NUM),values_fn = list(num = sum ,NUM = mean))
#> # A tibble: 2 x 5
#> name num_1 num_2 NUM_1 NUM_2
#> <chr> <int> <int> <dbl> <dbl>
#> 1 a 3 7 9.5 11.5
#> 2 b 5 21 13 153.2.4.3 从多个变量生成新列名
production <- expand_grid(
product = c("A", "B"),
country = c("AI", "EI"),
year = 2000:2014
) %>%
filter((product == "A" & country == "AI") | product == "B") %>%
mutate(production = rnorm(nrow(.)))
production
#> # A tibble: 45 x 4
#> product country year production
#> <chr> <chr> <int> <dbl>
#> 1 A AI 2000 -0.244
#> 2 A AI 2001 -0.283
#> 3 A AI 2002 -0.554
#> 4 A AI 2003 0.629
#> 5 A AI 2004 2.07
#> 6 A AI 2005 -1.63
#> # ... with 39 more rowsproduction %>% pivot_wider(
names_from = c(product, country),
values_from = production
)
#> # A tibble: 15 x 4
#> year A_AI B_AI B_EI
#> <int> <dbl> <dbl> <dbl>
#> 1 2000 -0.244 0.738 -0.313
#> 2 2001 -0.283 1.89 1.07
#> 3 2002 -0.554 -0.0974 0.0700
#> 4 2003 0.629 -0.936 -0.639
#> 5 2004 2.07 -0.0160 -0.0500
#> 6 2005 -1.63 -0.827 -0.251
#> # ... with 9 more rows通过names_sep和names_prefix参数控制新的列名,或通过names_glue
production %>% pivot_wider(
names_from = c(product, country),
values_from = production,
names_sep = ".",
names_prefix = "prod."
)
#> # A tibble: 15 x 4
#> year prod.A.AI prod.B.AI prod.B.EI
#> <int> <dbl> <dbl> <dbl>
#> 1 2000 -0.244 0.738 -0.313
#> 2 2001 -0.283 1.89 1.07
#> 3 2002 -0.554 -0.0974 0.0700
#> 4 2003 0.629 -0.936 -0.639
#> 5 2004 2.07 -0.0160 -0.0500
#> 6 2005 -1.63 -0.827 -0.251
#> # ... with 9 more rowsproduction %>% pivot_wider(
names_from = c(product, country),
values_from = production,
names_glue = "prod_{product}_{country}"
)
#> # A tibble: 15 x 4
#> year prod_A_AI prod_B_AI prod_B_EI
#> <int> <dbl> <dbl> <dbl>
#> 1 2000 -0.244 0.738 -0.313
#> 2 2001 -0.283 1.89 1.07
#> 3 2002 -0.554 -0.0974 0.0700
#> 4 2003 0.629 -0.936 -0.639
#> 5 2004 2.07 -0.0160 -0.0500
#> 6 2005 -1.63 -0.827 -0.251
#> # ... with 9 more rows3.2.4.4 多值变宽
us_rent_income %>%
pivot_wider(names_from = variable, values_from = c(estimate, moe))
#> # A tibble: 52 x 6
#> GEOID NAME estimate_income estimate_rent moe_income moe_rent
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 01 Alabama 24476 747 136 3
#> 2 02 Alaska 32940 1200 508 13
#> 3 04 Arizona 27517 972 148 4
#> 4 05 Arkansas 23789 709 165 5
#> 5 06 California 29454 1358 109 3
#> 6 08 Colorado 32401 1125 109 5
#> # ... with 46 more rows