3.2 长宽转换
我们在使用 excel 的数据透视表时,即将“长数据” 转化成“宽数据,”这就是数据的长宽转化。有关pivot
详情,请查看vignette("pivot")
。
接下来我们用tidyr
包自带的插图案例查看相关函数用法。
3.2.1 宽转长
有时候方便我们肉眼观察,可能一个数据集会有很多列, 方便观察,但是不方便统计分析,这是我们需要把数据做处理,从“宽数据变成长数据”即宽转长。如下案例:
分公司 | 商品A | 商品B | 商品C | 商品D | 商品E | 商品F | 商品G | 商品H | 商品I |
---|---|---|---|---|---|---|---|---|---|
北京 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
深圳 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
上海 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
广州 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
成都 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
重庆 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
长沙 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
将上面的数据集从“宽数据集”转化成“长数据集”。
library(tidyverse,warn.conflicts = FALSE)
library(tidyr)
准备数据集 dt
<- tibble::tribble(
dt ~分公司, ~商品A, ~商品B, ~商品C, ~商品D, ~商品E, ~商品F, ~商品G, ~商品H, ~商品I,
"北京", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"深圳", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"上海", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"广州", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"成都", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"重庆", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
"长沙", 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L
)
经过转化,数据由 10 列变成 3 列;
%>%
dt pivot_longer(cols = 商品A:商品I,names_to = '商品名称',values_to = '销量') %>%
head()
#> # A tibble: 6 x 3
#> 分公司 商品名称 销量
#> <chr> <chr> <int>
#> 1 北京 商品A 1
#> 2 北京 商品B 2
#> 3 北京 商品C 3
#> 4 北京 商品D 4
#> 5 北京 商品E 5
#> 6 北京 商品F 6
3.2.2 用法
pivot_longer(
data,
cols,names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = list(),
names_transform = list(),
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = list(),
values_transform = list(),
... )
data
: 数据集cols
: 需要需要重塑,在上例中除了分公司的其他全部列names_to
: 新增的列名,即重塑列的新列名,如上列的商品名称values_to
: 新增存储数据新列名,如上列的销量names_prefix
: 一个正则表达式用来删除列名中匹配到的字符values_drop_na
: 是否删除空值行,默认为FALSE
其余参数的使用比较复杂,对于初学而言可以不掌握,当有更高阶需求的时候自行查函数手册了解用法即可。接下来了解几种常见的使用场景。
3.2.2.1 列名带数字
%>%
billboard pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
)#> # A tibble: 5,307 x 5
#> artist track date.entered week rank
#> <chr> <chr> <date> <chr> <dbl>
#> 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
#> 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
#> 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
#> 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
#> 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
#> 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
#> # ... with 5,301 more rows
names_prefix
调整内容前缀,配合names_transform
参数使用
%>%
billboard pivot_longer(
cols = starts_with("wk"),
names_to = "week",
names_prefix = "wk",
names_transform = list(week = as.integer),
values_to = "rank",
values_drop_na = TRUE,
)#> # A tibble: 5,307 x 5
#> artist track date.entered week rank
#> <chr> <chr> <date> <int> <dbl>
#> 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87
#> 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 2 82
#> 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 3 72
#> 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 4 77
#> 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 5 87
#> 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 6 94
#> # ... with 5,301 more rows
经过以上转换week
列属性变成了整数,当然达到以上效果有其他的途径,如下:
library(tidyverse,warn.conflicts = TRUE)
# method 1
%>%
billboard pivot_longer(
cols = starts_with("wk"),
names_to = "week",
names_transform = list(week = readr::parse_number),
values_to = "rank",
values_drop_na = TRUE,
)
# method 2
%>%
billboard pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE,
%>%
) mutate(week = str_remove(week, "wk") %>% as.integer())
3.2.2.2 多变量列名
该案列设计比较复杂的正则表达式,new_?(.*)_(.)(.*)
需要一定正则表达式基础。
new_?
表示匹配new
或new_
,(.*)
匹配任意0次或多次任意字符。
%>% pivot_longer(
who cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
values_to = "count"
)#> # A tibble: 405,440 x 8
#> country iso2 iso3 year diagnosis gender age count
#> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan AF AFG 1980 sp m 014 NA
#> 2 Afghanistan AF AFG 1980 sp m 1524 NA
#> 3 Afghanistan AF AFG 1980 sp m 2534 NA
#> 4 Afghanistan AF AFG 1980 sp m 3544 NA
#> 5 Afghanistan AF AFG 1980 sp m 4554 NA
#> 6 Afghanistan AF AFG 1980 sp m 5564 NA
#> # ... with 405,434 more rows
进一步处理列gender
,age
。
%>% pivot_longer(
who cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
names_transform = list(
gender = ~ readr::parse_factor(.x, levels = c("f", "m")),
age = ~ readr::parse_factor(
.x,levels = c("014", "1524", "2534", "3544", "4554", "5564", "65"),
ordered = TRUE
)
),values_to = "count",
)#> # A tibble: 405,440 x 8
#> country iso2 iso3 year diagnosis gender age count
#> <chr> <chr> <chr> <int> <chr> <fct> <ord> <int>
#> 1 Afghanistan AF AFG 1980 sp m 014 NA
#> 2 Afghanistan AF AFG 1980 sp m 1524 NA
#> 3 Afghanistan AF AFG 1980 sp m 2534 NA
#> 4 Afghanistan AF AFG 1980 sp m 3544 NA
#> 5 Afghanistan AF AFG 1980 sp m 4554 NA
#> 6 Afghanistan AF AFG 1980 sp m 5564 NA
#> # ... with 405,434 more rows
3.2.2.3 一行多观测值
<- tribble(
family ~family, ~dob_child1, ~dob_child2, ~gender_child1, ~gender_child2,
"1998-11-26", "2000-01-29", 1L, 2L,
1L, "1996-06-22", NA, 2L, NA,
2L, "2002-07-11", "2004-04-05", 2L, 2L,
3L, "2004-10-10", "2009-08-27", 1L, 1L,
4L, "2000-12-05", "2005-02-28", 2L, 1L,
5L,
)<- family %>% mutate_at(vars(starts_with("dob")), parse_date)
family
family#> # A tibble: 5 x 5
#> family dob_child1 dob_child2 gender_child1 gender_child2
#> <int> <date> <date> <int> <int>
#> 1 1 1998-11-26 2000-01-29 1 2
#> 2 2 1996-06-22 NA 2 NA
#> 3 3 2002-07-11 2004-04-05 2 2
#> 4 4 2004-10-10 2009-08-27 1 1
#> 5 5 2000-12-05 2005-02-28 2 1
%>%
family pivot_longer(
!family,
names_to = c(".value", "child"),
names_sep = "_",
values_drop_na = TRUE
)#> # A tibble: 9 x 4
#> family child dob gender
#> <int> <chr> <date> <int>
#> 1 1 child1 1998-11-26 1
#> 2 1 child2 2000-01-29 2
#> 3 2 child1 1996-06-22 2
#> 4 3 child1 2002-07-11 2
#> 5 3 child2 2004-04-05 2
#> 6 4 child1 2004-10-10 1
#> # ... with 3 more rows
%>%
anscombe pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)"
%>%
) arrange(set)
#> # A tibble: 44 x 3
#> set x y
#> <chr> <dbl> <dbl>
#> 1 1 10 8.04
#> 2 1 8 6.95
#> 3 1 13 7.58
#> 4 1 9 8.81
#> 5 1 11 8.33
#> 6 1 14 9.96
#> # ... with 38 more rows
<- tibble(
pnl x = 1:4,
a = c(1, 1,0, 0),
b = c(0, 1, 1, 1),
y1 = rnorm(4),
y2 = rnorm(4),
z1 = rep(3, 4),
z2 = rep(-2, 4),
)
%>%
pnl pivot_longer(
!c(x, a, b),
names_to = c(".value", "time"),
names_pattern = "(.)(.)"
)#> # A tibble: 8 x 6
#> x a b time y z
#> <int> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 1 1 0 1 -1.40 3
#> 2 1 1 0 2 0.622 -2
#> 3 2 1 1 1 0.255 3
#> 4 2 1 1 2 1.15 -2
#> 5 3 0 1 1 -2.44 3
#> 6 3 0 1 2 -1.82 -2
#> # ... with 2 more rows
3.2.2.4 重复列名
<- tibble(id = 1:3, y = 4:6, y = 5:7, y = 7:9, .name_repair = "minimal")
df %>% pivot_longer(!id, names_to = "name", values_to = "value")
df #> # A tibble: 9 x 3
#> id name value
#> <int> <chr> <int>
#> 1 1 y 4
#> 2 1 y 5
#> 3 1 y 7
#> 4 2 y 5
#> 5 2 y 6
#> 6 2 y 8
#> # ... with 3 more rows
3.2.3 长转宽
pivot_wider()
功能与pivot_longer()
相反。通过增加列数减少行数使数据集变得更宽,通常我们在汇总时候使用,达到类似Excel透视表结果。
3.2.4 用法
%>% pivot_wider(names_from = station, values_from = seen)
fish_encounters #> # A tibble: 19 x 12
#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 4842 1 1 1 1 1 1 1 1 1 1 1
#> 2 4843 1 1 1 1 1 1 1 1 1 1 1
#> 3 4844 1 1 1 1 1 1 1 1 1 1 1
#> 4 4845 1 1 1 1 1 NA NA NA NA NA NA
#> 5 4847 1 1 1 NA NA NA NA NA NA NA NA
#> 6 4848 1 1 1 1 NA NA NA NA NA NA NA
#> # ... with 13 more rows
3.2.4.1 缺失值填充
使用values_fill 参数填充缺失值。如下使用 0 填充缺失值:
%>% pivot_wider(
fish_encounters names_from = station,
values_from = seen,
values_fill = 0
)#> # A tibble: 19 x 12
#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 4842 1 1 1 1 1 1 1 1 1 1 1
#> 2 4843 1 1 1 1 1 1 1 1 1 1 1
#> 3 4844 1 1 1 1 1 1 1 1 1 1 1
#> 4 4845 1 1 1 1 1 0 0 0 0 0 0
#> 5 4847 1 1 1 0 0 0 0 0 0 0 0
#> 6 4848 1 1 1 1 0 0 0 0 0 0 0
#> # ... with 13 more rows
3.2.4.2 聚合
当将数据框长转宽时,通过指定values_fn
参数,指定聚合方式,如下所示:
<- warpbreaks %>% as_tibble()
warpbreaks %>%
warpbreaks pivot_wider(
names_from = wool,
values_from = breaks,
values_fn = mean
)#> # A tibble: 3 x 3
#> tension A B
#> <fct> <dbl> <dbl>
#> 1 L 44.6 28.2
#> 2 M 24 28.8
#> 3 H 24.6 18.8
多个聚合方式
<- tibble::tribble(
df ~name, ~TYPE, ~num, ~NUM,
"a", 1L, 1L, 9L,
"a", 1L, 2L, 10L,
"a", 2L, 3L, 11L,
"a", 2L, 4L, 12L,
"b", 1L, 5L, 13L,
"b", 2L, 6L, 14L,
"b", 2L, 7L, 15L,
"b", 2L, 8L, 16L
)
%>% pivot_wider(names_from = c(TYPE),values_from = c(num,NUM),values_fn = list(num = sum ,NUM = mean))
df #> # A tibble: 2 x 5
#> name num_1 num_2 NUM_1 NUM_2
#> <chr> <int> <int> <dbl> <dbl>
#> 1 a 3 7 9.5 11.5
#> 2 b 5 21 13 15
3.2.4.3 从多个变量生成新列名
<- expand_grid(
production product = c("A", "B"),
country = c("AI", "EI"),
year = 2000:2014
%>%
) filter((product == "A" & country == "AI") | product == "B") %>%
mutate(production = rnorm(nrow(.)))
production#> # A tibble: 45 x 4
#> product country year production
#> <chr> <chr> <int> <dbl>
#> 1 A AI 2000 -0.244
#> 2 A AI 2001 -0.283
#> 3 A AI 2002 -0.554
#> 4 A AI 2003 0.629
#> 5 A AI 2004 2.07
#> 6 A AI 2005 -1.63
#> # ... with 39 more rows
%>% pivot_wider(
production names_from = c(product, country),
values_from = production
)#> # A tibble: 15 x 4
#> year A_AI B_AI B_EI
#> <int> <dbl> <dbl> <dbl>
#> 1 2000 -0.244 0.738 -0.313
#> 2 2001 -0.283 1.89 1.07
#> 3 2002 -0.554 -0.0974 0.0700
#> 4 2003 0.629 -0.936 -0.639
#> 5 2004 2.07 -0.0160 -0.0500
#> 6 2005 -1.63 -0.827 -0.251
#> # ... with 9 more rows
通过names_sep
和names_prefix
参数控制新的列名,或通过names_glue
%>% pivot_wider(
production names_from = c(product, country),
values_from = production,
names_sep = ".",
names_prefix = "prod."
)#> # A tibble: 15 x 4
#> year prod.A.AI prod.B.AI prod.B.EI
#> <int> <dbl> <dbl> <dbl>
#> 1 2000 -0.244 0.738 -0.313
#> 2 2001 -0.283 1.89 1.07
#> 3 2002 -0.554 -0.0974 0.0700
#> 4 2003 0.629 -0.936 -0.639
#> 5 2004 2.07 -0.0160 -0.0500
#> 6 2005 -1.63 -0.827 -0.251
#> # ... with 9 more rows
%>% pivot_wider(
production names_from = c(product, country),
values_from = production,
names_glue = "prod_{product}_{country}"
)#> # A tibble: 15 x 4
#> year prod_A_AI prod_B_AI prod_B_EI
#> <int> <dbl> <dbl> <dbl>
#> 1 2000 -0.244 0.738 -0.313
#> 2 2001 -0.283 1.89 1.07
#> 3 2002 -0.554 -0.0974 0.0700
#> 4 2003 0.629 -0.936 -0.639
#> 5 2004 2.07 -0.0160 -0.0500
#> 6 2005 -1.63 -0.827 -0.251
#> # ... with 9 more rows
3.2.4.4 多值变宽
%>%
us_rent_income pivot_wider(names_from = variable, values_from = c(estimate, moe))
#> # A tibble: 52 x 6
#> GEOID NAME estimate_income estimate_rent moe_income moe_rent
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 01 Alabama 24476 747 136 3
#> 2 02 Alaska 32940 1200 508 13
#> 3 04 Arizona 27517 972 148 4
#> 4 05 Arkansas 23789 709 165 5
#> 5 06 California 29454 1358 109 3
#> 6 08 Colorado 32401 1125 109 5
#> # ... with 46 more rows