第 7 章 高级数据操作
介绍 data.table 处理数据的方式,对标 dplyr 的基本操作
7.1 基础介绍
# 用一个真实的数据集替换,让每一个操作都有实际含义和价值 mtcars
DT <- data.table(
x = rep(c("b", "a", "c"), each = 3),
v = c(1, 1, 1, 2, 2, 1, 1, 2, 2),
y = c(1, 3, 6), a = 1:9, b = 9:1
)
DT
## x v y a b
## 1: b 1 1 1 9
## 2: b 1 3 2 8
## 3: b 1 6 3 7
## 4: a 2 1 4 6
## 5: a 2 3 5 5
## 6: a 1 6 6 4
## 7: c 1 1 7 3
## 8: c 2 3 8 2
## 9: c 2 6 9 1
## y V1
## 1: 1 9
## 2: 0 4
## bool V1
## 1: 1 9
## 2: 0 4
DT[, .SD[2], by = x] # 每组第二行
## x v y a b
## 1: b 1 3 2 8
## 2: a 2 3 5 5
## 3: c 2 3 8 2
DT[, tail(.SD, 2), by = x] # 每组最后两行
## x v y a b
## 1: b 1 3 2 8
## 2: b 1 6 3 7
## 3: a 2 3 5 5
## 4: a 1 6 6 4
## 5: c 2 3 8 2
## 6: c 2 6 9 1
# 除了 x 列外,所有列都按 x 分组求和
DT[, lapply(.SD, sum), by = x]
## x v y a b
## 1: b 3 10 6 24
## 2: a 5 10 15 15
## 3: c 5 10 24 6
# 各个列都按 x 分组取最小
DT[, .SD[which.min(v)], by = x] # 分组嵌套查询
## x v y a b
## 1: b 1 1 1 9
## 2: a 1 6 6 4
## 3: c 1 1 7 3
## x y MySum MyMin MyMax
## 1: b 1 2 1 1
## 2: b 0 1 1 1
## 3: a 1 4 2 2
## 4: a 0 1 1 1
## 5: c 1 3 1 2
## 6: c 0 2 2 2
DT[, .(a = .(a), b = .(b)), by = x] # 按 x 分组,将 a,b 两列的值列出来
## x a b
## 1: b 1,2,3 9,8,7
## 2: a 4,5,6 6,5,4
## 3: c 7,8,9 3,2,1
## x seq
## 1: b 1
## 2: b 2
## 3: b 3
## 4: b 4
## 5: b 5
## 6: b 6
## 7: b 7
## 8: b 8
## 9: b 9
## 10: a 4
## 11: a 5
## 12: a 6
## 13: c 7
## 14: c 6
## 15: c 5
## 16: c 4
## 17: c 3
# 按 x 分组对 v 求和,然后过滤出和小于 20 的行
DT[, sum(v), by = x][V1 < 20] # 组合查询
## x V1
## 1: b 3
## 2: a 5
## 3: c 5
## x V1
## 1: a 5
## 2: c 5
## 3: b 3
## x N v y a b
## 1: b 3 3 10 6 24
## 2: a 3 5 10 15 15
## 3: c 3 5 10 24 6
# 两个复杂的操作,还没弄清楚这个技术存在的意义
DT[,
{
tmp <- mean(y)
.(a = a - tmp, b = b - tmp)
},
by = x
] # anonymous lambda in 'j', j accepts any valid
## x a b
## 1: b -2.3333333 5.6666667
## 2: b -1.3333333 4.6666667
## 3: b -0.3333333 3.6666667
## 4: a 0.6666667 2.6666667
## 5: a 1.6666667 1.6666667
## 6: a 2.6666667 0.6666667
## 7: c 3.6666667 -0.3333333
## 8: c 4.6666667 -1.3333333
## 9: c 5.6666667 -2.3333333
# using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
DT[, c(.(y = max(y)), lapply(.SD, min)), by = rleid(v), .SDcols = v:b]
## rleid y v y a b
## 1: 1 6 1 1 1 7
## 2: 2 3 2 1 4 5
## 3: 3 6 1 1 6 3
## 4: 4 6 2 3 8 1
7.1.1 过滤
mtcars_df <- as.data.table(mtcars)
过滤 cyl = 6 并且 gear = 4 的记录
mtcars_df[cyl == 6 & gear == 4]
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
过滤操作是针对数据框的行(记录)
mtcars_df[cyl == 6 & gear == 4, .(mpg, disp)]
## mpg disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
## mpg disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
7.1.2 变换
根据已有的列生成新的列,或者修改已有的列,一次只能修改一列
## mpg cyl disp hp drat wt qsec vs am gear carb mean_mpg mean_disp
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 20.09062 230.7219
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 20.09062 230.7219
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 20.09062 230.7219
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 20.09062 230.7219
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 20.09062 230.7219
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 20.09062 230.7219
## mean_mpg mean_disp
## 1: 20.09062 230.7219
# mtcars_df[, .(mean_mpg := mean(mpg), mean_disp := mean(disp))] # 报错
# 正确的姿势
mtcars_df[, `:=`(mean_mpg = mean(mpg), mean_disp = mean(disp))][, .(mpg, disp, mean_mpg, mean_disp)] |> head()
## mpg disp mean_mpg mean_disp
## 1: 21.0 160 20.09062 230.7219
## 2: 21.0 160 20.09062 230.7219
## 3: 22.8 108 20.09062 230.7219
## 4: 21.4 258 20.09062 230.7219
## 5: 18.7 360 20.09062 230.7219
## 6: 18.1 225 20.09062 230.7219
## mean_mpg mean_disp
## 1 20.09062 230.7219
mtcars |>
dplyr::mutate(mean_mpg = mean(mpg), mean_disp = mean(disp)) |>
dplyr::select(mpg, disp, mean_mpg, mean_disp) |>
head()
## mpg disp mean_mpg mean_disp
## Mazda RX4 21.0 160 20.09062 230.7219
## Mazda RX4 Wag 21.0 160 20.09062 230.7219
## Datsun 710 22.8 108 20.09062 230.7219
## Hornet 4 Drive 21.4 258 20.09062 230.7219
## Hornet Sportabout 18.7 360 20.09062 230.7219
## Valiant 18.1 225 20.09062 230.7219
7.1.3 聚合
分组统计 多个分组变量
dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
## cyl 3 4 5
## 1: 4 21.50 26.925 28.2
## 2: 6 19.75 19.750 19.7
## 3: 8 15.05 NaN 15.4
## 3 4 5
## 4 21.50 26.925 28.2
## 6 19.75 19.750 19.7
## 8 15.05 NA 15.4
mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)]
## cyl gear mean_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
aggregate(data = mtcars_df, mpg ~ cyl + gear, FUN = mean)
## cyl gear mpg
## 1 4 3 21.500
## 2 6 3 19.750
## 3 8 3 15.050
## 4 4 4 26.925
## 5 6 4 19.750
## 6 4 5 28.200
## 7 6 5 19.700
## 8 8 5 15.400
## # A tibble: 8 × 3
## # Groups: cyl [3]
## cyl gear mean_mpg
## <dbl> <dbl> <dbl>
## 1 4 3 21.5
## 2 4 4 26.9
## 3 4 5 28.2
## 4 6 3 19.8
## 5 6 4 19.8
## 6 6 5 19.7
## 7 8 3 15.0
## 8 8 5 15.4
7.1.4 命名
修改列名,另存一份生效
sub_mtcars_df <- mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)]
setNames(sub_mtcars_df, c("cyl", "gear", "ave_mpg"))
## cyl gear ave_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
# 注意 sub_mtcars_df 并没有修改列名
sub_mtcars_df
## cyl gear mean_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
修改列名并直接起作用,在原来的数据集上生效
setnames(sub_mtcars_df, old = c("mean_mpg"), new = c("ave_mpg"))
# sub_mtcars_df 已经修改了列名
sub_mtcars_df
## cyl gear ave_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
修改列名最好使用 data.table 包的函数 setnames()
明确指出了要修改的列名,
7.1.5 排序
按照某(些)列从大到小或从小到大的顺序排列, 先按 cyl 升序,然后按 gear 降序
mtcars_df[, .(mpg, cyl, gear)][cyl == 4][order(cyl, -gear)]
## mpg cyl gear
## 1: 26.0 4 5
## 2: 30.4 4 5
## 3: 22.8 4 4
## 4: 24.4 4 4
## 5: 22.8 4 4
## 6: 32.4 4 4
## 7: 30.4 4 4
## 8: 33.9 4 4
## 9: 27.3 4 4
## 10: 21.4 4 4
## 11: 21.5 4 3
mtcars |>
dplyr::select(mpg, cyl, gear) |>
dplyr::filter(cyl == 4) |>
dplyr::arrange(cyl, desc(gear))
## mpg cyl gear
## Porsche 914-2 26.0 4 5
## Lotus Europa 30.4 4 5
## Datsun 710 22.8 4 4
## Merc 240D 24.4 4 4
## Merc 230 22.8 4 4
## Fiat 128 32.4 4 4
## Honda Civic 30.4 4 4
## Toyota Corolla 33.9 4 4
## Fiat X1-9 27.3 4 4
## Volvo 142E 21.4 4 4
## Toyota Corona 21.5 4 3
7.1.6 变形
melt 宽的变长的
DT <- data.table(
i_1 = c(1:5, NA),
i_2 = c(NA, 6, 7, 8, 9, 10),
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered = TRUE),
c_1 = sample(c(letters[1:3], NA), 6, TRUE),
d_1 = as.Date(c(1:3, NA, 4:5), origin = "2013-09-01"),
d_2 = as.Date(6:1, origin = "2012-01-01")
)
DT[, .(i_1, i_2, f_1, f_2)]
## i_1 i_2 f_1 f_2
## 1: 1 NA c z
## 2: 2 6 b a
## 3: 3 7 <NA> x
## 4: 4 8 <NA> c
## 5: 5 9 c x
## 6: NA 10 b x
## i_1 i_2 variable value
## 1: 1 NA f_1 c
## 2: 2 6 f_1 b
## 3: 3 7 f_1 <NA>
## 4: 4 8 f_1 <NA>
## 5: 5 9 f_1 c
## 6: NA 10 f_1 b
## 7: 1 NA f_2 z
## 8: 2 6 f_2 a
## 9: 3 7 f_2 x
## 10: 4 8 f_2 c
## 11: 5 9 f_2 x
## 12: NA 10 f_2 x
dcast 长的变宽的
sleep <- as.data.table(sleep)
dcast(sleep, group ~ ID, value.var = "extra")
## group 1 2 3 4 5 6 7 8 9 10
## 1: 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0
## 2: 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4
# 如果有多个值
dcast(mtcars_df, cyl ~ gear, value.var = "mpg")
## cyl 3 4 5
## 1: 4 1 8 2
## 2: 6 2 4 1
## 3: 8 12 0 2
dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
## cyl 3 4 5
## 1: 4 21.50 26.925 28.2
## 2: 6 19.75 19.750 19.7
## 3: 8 15.05 NaN 15.4
tidyr 包提供数据变形的函数 tidyr::pivot_longer()
和 tidyr::pivot_wider()
相比于 Base R 提供的 reshape()
和 data.table 提供的 melt()
和 dcast()
更加形象的命名
tidyr::pivot_wider(data = sleep, names_from = "ID", values_from = "extra")
## # A tibble: 2 × 11
## group `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2
## 2 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4
reshape(data = sleep, v.names = "extra", idvar = "group", timevar = "ID", direction = "wide")
## group extra.1 extra.2 extra.3 extra.4 extra.5 extra.6 extra.7 extra.8
## 1: 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8
## 2: 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6
## extra.9 extra.10
## 1: 0.0 2.0
## 2: 4.6 3.4
-
idvar
分组变量 -
timevar
组内编号 -
v.names
个体观察值 -
sep
新的列名是由参数v.names
(extra) 和参数值timevar
(ID) 拼接起来的,默认sep = "."
推荐使用下划线来做分割sep = "_"
head(ToothGrowth)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
ToothGrowth$time <- rep(1:10, 6)
reshape(ToothGrowth,
v.names = "len", idvar = c("supp", "dose"),
timevar = "time", direction = "wide"
)
## supp dose len.1 len.2 len.3 len.4 len.5 len.6 len.7 len.8 len.9 len.10
## 1 VC 0.5 4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0
## 11 VC 1.0 16.5 16.5 15.2 17.3 22.5 17.3 13.6 14.5 18.8 15.5
## 21 VC 2.0 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
## 31 OJ 0.5 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7
## 41 OJ 1.0 19.7 23.3 23.6 26.4 20.0 25.2 25.8 21.2 14.5 27.3
## 51 OJ 2.0 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0
以数据集 ToothGrowth 为例,变量 supp(大组),dose(小组) 和 time(组内个体编号) 一起决定唯一的一个数据 len,特别适合纵向数据的变形操作
7.1.7 分组
分组切片,取每组第一个和最后一个值
Loblolly |>
dplyr::group_by(Seed) |>
dplyr::arrange(height, age, Seed) |>
dplyr::slice(1, dplyr::n())
## # A tibble: 28 × 3
## # Groups: Seed [14]
## height age Seed
## <dbl> <dbl> <ord>
## 1 3.93 3 329
## 2 56.4 25 329
## 3 4.12 3 327
## 4 56.8 25 327
## 5 4.38 3 325
## 6 58.5 25 325
## 7 3.91 3 307
## 8 59.1 25 307
## 9 3.46 3 331
## 10 59.5 25 331
## # … with 18 more rows
dplyr::slice()
和函数 slice.index()
有关系吗?
7.1.8 合并
合并操作对应于数据库中的连接操作, dplyr 包的哲学就来源于对数据库操作的进一步抽象, data.table 包的 merge 函数就对应为 dplyr 包的 join 函数
data.table::merge
和 dplyr::join
给出一个表格,数据操作, data.table 实现, dplyr 实现
dt1 <- data.table(A = letters[1:10], X = 1:10, key = "A")
dt2 <- data.table(A = letters[5:14], Y = 1:10, key = "A")
merge(dt1, dt2) # 内连接
## A X Y
## 1: e 5 1
## 2: f 6 2
## 3: g 7 3
## 4: h 8 4
## 5: i 9 5
## 6: j 10 6
参数 key 的作用相当于建立一个索引,通过它实现更快的数据操作速度
key = c("x","y","z")
或者 key = "x,y,z"
其中 x,y,z 是列名
data(band_members, band_instruments, package = "dplyr")
band_members
## # A tibble: 3 × 2
## name band
## <chr> <chr>
## 1 Mick Stones
## 2 John Beatles
## 3 Paul Beatles
band_instruments
## # A tibble: 3 × 2
## name plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
dplyr::inner_join(band_members, band_instruments)
## # A tibble: 2 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
list 列表里每个元素都是 data.frame 时,最适合用 data.table::rbindlist
合并
# 合并列表 https://recology.info/2018/10/limiting-dependencies/
function(x) {
tibble::as_tibble((x <- data.table::setDF(
data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
)
))
}
## function(x) {
## tibble::as_tibble((x <- data.table::setDF(
## data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
## )
## ))
## }
7.2 高频操作
以面向问题的方式介绍 Base R 提供的数据操作,然后过渡到 data.table,它是加强版的 Base R。
base | dplyr |
---|---|
df[order(x), , drop = FALSE] |
arrange(df, x) |
df[!duplicated(x), , drop = FALSE] , unique()
|
distinct(df, x) |
df[x & !is.na(x), , drop = FALSE] , subset()
|
filter(df, x) |
df$z <- df$x + df$y , transform()
|
mutate(df, z = x + y) |
df$x |
pull(df, x) |
N/A | rename(df, y = x) |
df[c("x", "y")] , subset()
|
select(df, x, y) |
df[grepl(names(df), "^x")] |
select(df, starts_with("x") |
mean(df$x) |
summarise(df, mean(x)) |
df[c(1, 2, 5), , drop = FALSE] |
slice(df, c(1, 2, 5)) |
base | dplyr |
---|---|
merge(df1, df2) |
inner_join(df1, df2) |
merge(df1, df2, all.x = TRUE) |
left_join(df1, df2) |
merge(df1, df2, all.y = TRUE) |
right_join(df1, df2) |
merge(df1, df2, all = TRUE) |
full_join(df1, df2) |
df1[df1$x %in% df2$x, , drop = FALSE] |
semi_join(df1, df2) |
df1[!df1$x %in% df2$x, , drop = FALSE] |
anti_join(df1, df2) |
class(mtcars)
## [1] "data.frame"
library(data.table)
mtcars <- as.data.table(mtcars)
class(mtcars)
## [1] "data.table" "data.frame"
7.2.1 选择多列
## cyl gear
## 1: 6 4
## 2: 6 4
## 3: 4 4
## cyl gear
## 1: 6 4
## 2: 6 4
## 3: 4 4
## cyl gear
## 1: 6 4
## 2: 6 4
## 3: 4 4
反选多列,选择除了 cyl 和 gear 的列
## 或者 mtcars[, setdiff(names(mtcars), c("cyl", "gear"))]
mtcars[, !(names(mtcars) %in% c("cyl", "gear"))] |> head(3)
## [1] TRUE FALSE TRUE
## mpg disp hp drat wt qsec vs am carb
## 1: 21.0 160 110 3.90 2.620 16.46 0 1 4
## 2: 21.0 160 110 3.90 2.875 17.02 0 1 4
## 3: 22.8 108 93 3.85 2.320 18.61 1 1 1
7.2.2 过滤多行
# base
mtcars[mtcars$cyl == 6 & mtcars$gear == 4, ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
subset(mtcars, subset = cyl == 6 & gear == 4)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# data.table
mtcars[cyl == 6 & gear == 4, ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# dplyr
dplyr::filter(mtcars, cyl == 6 & gear == 4)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
7.2.3 去重多行
# base
mtcars[!duplicated(mtcars[, c("cyl", "gear")])]
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# data.table
mtcars[!duplicated(mtcars, by = c("cyl", "gear")), ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# dplyr
dplyr::distinct(mtcars, cyl, gear, .keep_all = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
7.2.5 新添多列
mtcars[cyl == 6, `:=`(disp_mean = mean(disp), hp_mean = mean(hp))][cyl == 6, .(cyl, disp, hp, disp_mean, hp_mean)]
## cyl disp hp disp_mean hp_mean
## 1: 6 160.0 110 183.3143 122.2857
## 2: 6 160.0 110 183.3143 122.2857
## 3: 6 258.0 110 183.3143 122.2857
## 4: 6 225.0 105 183.3143 122.2857
## 5: 6 167.6 123 183.3143 122.2857
## 6: 6 167.6 123 183.3143 122.2857
## 7: 6 145.0 175 183.3143 122.2857
7.2.7 筛选多列
按照某一规则筛选多列
library(data.table)
iris <- as.data.table(iris)
iris[, head(.SD, 6), .SDcols = function(x) is.numeric(x)]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: 5.1 3.5 1.4 0.2
## 2: 4.9 3.0 1.4 0.2
## 3: 4.7 3.2 1.3 0.2
## 4: 4.6 3.1 1.5 0.2
## 5: 5.0 3.6 1.4 0.2
## 6: 5.4 3.9 1.7 0.4
7.2.8 修改多列类型
## Classes 'data.table' and 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : int 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: int 160 160 108 258 360 225 360 146 140 167 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "index")= int(0)
7.2.9 取每组第一行
先将 mtcars 按 cyl 升序,gear 降序排列,然后按 cyl, gear 和 am 分组取第一行
## cyl gear am mpg disp hp drat wt qsec vs carb
## 1: 4 5 1 26.0 120 91 4.43 2.140 16.70 0 2
## 2: 4 4 1 22.8 108 93 3.85 2.320 18.61 1 1
## 3: 4 4 0 24.4 146 62 3.69 3.190 20.00 1 2
## 4: 4 3 0 21.5 120 97 3.70 2.465 20.01 1 1
## 5: 6 5 1 19.7 145 175 3.62 2.770 15.50 0 6
## 6: 6 4 1 21.0 160 110 3.90 2.620 16.46 0 4
## 7: 6 4 0 19.2 167 123 3.92 3.440 18.30 1 4
## 8: 6 3 0 21.4 258 110 3.08 3.215 19.44 1 1
## 9: 8 5 1 15.8 351 264 4.22 3.170 14.50 0 4
## 10: 8 3 0 18.7 360 175 3.15 3.440 17.02 0 2
## cyl gear am mpg disp hp drat wt qsec vs carb
## 1: 4 5 1 26.0 120 91 4.43 2.140 16.70 0 2
## 2: 4 4 1 22.8 108 93 3.85 2.320 18.61 1 1
## 3: 4 4 0 24.4 146 62 3.69 3.190 20.00 1 2
## 4: 4 3 0 21.5 120 97 3.70 2.465 20.01 1 1
## 5: 6 5 1 19.7 145 175 3.62 2.770 15.50 0 6
## 6: 6 4 1 21.0 160 110 3.90 2.620 16.46 0 4
## 7: 6 4 0 19.2 167 123 3.92 3.440 18.30 1 4
## 8: 6 3 0 21.4 258 110 3.08 3.215 19.44 1 1
## 9: 8 5 1 15.8 351 264 4.22 3.170 14.50 0 4
## 10: 8 3 0 18.7 360 175 3.15 3.440 17.02 0 2
7.2.10 计算环比同比
以数据集 AirPassengers 为例,重新整理后见表 7.3
library(magrittr)
dat <- data.frame(
year = rep(1949:1960, each = 12),
month = month.abb, num = AirPassengers
) %>%
reshape(.,
v.names = "num", idvar = "year", timevar = "month",
direction = "wide", sep = ""
) %>%
setNames(., gsub(pattern = "(num)", replacement = "", x = colnames(.)))
rownames(dat) <- subset(dat, select = year, drop = TRUE)
air_passengers <- subset(dat, select = -year)
knitr::kable(air_passengers,
caption = "1949-1960年国际航班乘客数量变化",
align = "c", row.names = TRUE
)
Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
横向计算环比,如1949年2月相比1月增长多少、3月相比2月增长多少,以此类推,就是计算环比?纵向计算同比,如1950年1月相比1949年1月增长多少、1951年相比1950年1月增长多少?
# 环比横向/同比纵向
mom <- function(x) diff(x, lag = 1) / x[-length(x)] # month to month
# 格式化输出
format_mom <- function(x) formatC(mom(x), format = "f", digits = 4)
library(formattable)
# 同比变化
air_passengers %>%
apply(., 2, format_mom) %>%
as.data.frame() %>%
formattable(., list(
Jan = color_tile("white", "pink"),
Feb = color_tile("white", "springgreen4"),
Mar = percent
))
library(DT)
datatable(air_passengers)
7.2.11 合并多个数据框
将所有列都保留,以 full_join()
方式合并
df1 <- iris[1:10, c(1, 5)]
df2 <- iris[11:15, c(1, 2, 5)]
df3 <- iris[16:30, c(1, 3, 5)]
all_dfs <- list(df1, df2, df3)
# base
Reduce(function(x, y, ...) merge(x, y, ..., all = TRUE), all_dfs)
## Sepal.Length Species Sepal.Width Petal.Length
## 1: 4.3 setosa 3.0 NA
## 2: 4.4 setosa NA NA
## 3: 4.6 setosa NA 1.0
## 4: 4.6 setosa NA 1.0
## 5: 4.7 setosa NA 1.6
## 6: 4.8 setosa 3.4 1.9
## 7: 4.8 setosa 3.0 1.9
## 8: 4.9 setosa NA NA
## 9: 4.9 setosa NA NA
## 10: 5.0 setosa NA 1.6
## 11: 5.0 setosa NA 1.6
## 12: 5.0 setosa NA 1.6
## 13: 5.0 setosa NA 1.6
## 14: 5.1 setosa NA 1.4
## 15: 5.1 setosa NA 1.5
## 16: 5.1 setosa NA 1.5
## 17: 5.1 setosa NA 1.7
## 18: 5.2 setosa NA 1.5
## 19: 5.2 setosa NA 1.4
## 20: 5.4 setosa 3.7 1.3
## 21: 5.4 setosa 3.7 1.7
## 22: 5.7 setosa NA 1.5
## 23: 5.7 setosa NA 1.7
## 24: 5.8 setosa 4.0 NA
## Sepal.Length Species Sepal.Width Petal.Length
## Sepal.Length Species Sepal.Width Petal.Length
## 1: 5.1 setosa NA 1.4
## 2: 5.1 setosa NA 1.5
## 3: 5.1 setosa NA 1.5
## 4: 5.1 setosa NA 1.7
## 5: 4.9 setosa NA NA
## 6: 4.7 setosa NA 1.6
## 7: 4.6 setosa NA 1.0
## 8: 5.0 setosa NA 1.6
## 9: 5.0 setosa NA 1.6
## 10: 5.4 setosa 3.7 1.3
## 11: 5.4 setosa 3.7 1.7
## 12: 4.6 setosa NA 1.0
## 13: 5.0 setosa NA 1.6
## 14: 5.0 setosa NA 1.6
## 15: 4.4 setosa NA NA
## 16: 4.9 setosa NA NA
## 17: 4.8 setosa 3.4 1.9
## 18: 4.8 setosa 3.0 1.9
## 19: 4.3 setosa 3.0 NA
## 20: 5.8 setosa 4.0 NA
## 21: 5.7 setosa NA 1.5
## 22: 5.7 setosa NA 1.7
## 23: 5.2 setosa NA 1.5
## 24: 5.2 setosa NA 1.4
## Sepal.Length Species Sepal.Width Petal.Length
合并完应该有30行,为啥只有24行?这是因为 merge()
函数对主键 key 相同的记录会合并,要想不合并,需要调用 rbindlist()
函数 https://d.cosx.org/d/421235
rbind()
列数相同的两个 data.frame 按行合并,cbind()
行数相同的两个 data.frame 按列合并,merge()
对行、列数没有要求
rbindlist(all_dfs, fill = TRUE)
## Sepal.Length Species Sepal.Width Petal.Length
## 1: 5.1 setosa NA NA
## 2: 4.9 setosa NA NA
## 3: 4.7 setosa NA NA
## 4: 4.6 setosa NA NA
## 5: 5.0 setosa NA NA
## 6: 5.4 setosa NA NA
## 7: 4.6 setosa NA NA
## 8: 5.0 setosa NA NA
## 9: 4.4 setosa NA NA
## 10: 4.9 setosa NA NA
## 11: 5.4 setosa 3.7 NA
## 12: 4.8 setosa 3.4 NA
## 13: 4.8 setosa 3.0 NA
## 14: 4.3 setosa 3.0 NA
## 15: 5.8 setosa 4.0 NA
## 16: 5.7 setosa NA 1.5
## 17: 5.4 setosa NA 1.3
## 18: 5.1 setosa NA 1.4
## 19: 5.7 setosa NA 1.7
## 20: 5.1 setosa NA 1.5
## 21: 5.4 setosa NA 1.7
## 22: 5.1 setosa NA 1.5
## 23: 4.6 setosa NA 1.0
## 24: 5.1 setosa NA 1.7
## 25: 4.8 setosa NA 1.9
## 26: 5.0 setosa NA 1.6
## 27: 5.0 setosa NA 1.6
## 28: 5.2 setosa NA 1.5
## 29: 5.2 setosa NA 1.4
## 30: 4.7 setosa NA 1.6
## Sepal.Length Species Sepal.Width Petal.Length
# dplyr
dplyr::bind_rows(all_dfs)
## Sepal.Length Species Sepal.Width Petal.Length
## 1: 5.1 setosa NA NA
## 2: 4.9 setosa NA NA
## 3: 4.7 setosa NA NA
## 4: 4.6 setosa NA NA
## 5: 5.0 setosa NA NA
## 6: 5.4 setosa NA NA
## 7: 4.6 setosa NA NA
## 8: 5.0 setosa NA NA
## 9: 4.4 setosa NA NA
## 10: 4.9 setosa NA NA
## 11: 5.4 setosa 3.7 NA
## 12: 4.8 setosa 3.4 NA
## 13: 4.8 setosa 3.0 NA
## 14: 4.3 setosa 3.0 NA
## 15: 5.8 setosa 4.0 NA
## 16: 5.7 setosa NA 1.5
## 17: 5.4 setosa NA 1.3
## 18: 5.1 setosa NA 1.4
## 19: 5.7 setosa NA 1.7
## 20: 5.1 setosa NA 1.5
## 21: 5.4 setosa NA 1.7
## 22: 5.1 setosa NA 1.5
## 23: 4.6 setosa NA 1.0
## 24: 5.1 setosa NA 1.7
## 25: 4.8 setosa NA 1.9
## 26: 5.0 setosa NA 1.6
## 27: 5.0 setosa NA 1.6
## 28: 5.2 setosa NA 1.5
## 29: 5.2 setosa NA 1.4
## 30: 4.7 setosa NA 1.6
## Sepal.Length Species Sepal.Width Petal.Length
7.2.12 分组聚合多个指标
https://stackoverflow.com/questions/24151602/calculate-multiple-aggregations-with-lapply-sd
# base
aggregate(
data = mtcars, cbind(mpg, hp) ~ cyl,
FUN = function(x) c(mean = mean(x), median = median(x))
)
## cyl mpg.mean mpg.median hp.mean hp.median
## 1 4 26.66364 26.00000 82.63636 91.00000
## 2 6 19.74286 19.70000 122.28571 110.00000
## 3 8 15.10000 15.20000 209.21429 192.50000
# 数据一致性 https://d.cosx.org/d/420763-base-r
with(
aggregate(cbind(mpg, hp) ~ cyl, mtcars,
FUN = function(x) c(mean = mean(x), median = median(x))
),
cbind.data.frame(cyl, mpg, hp)
)
## cyl mean median mean median
## 1 4 26.66364 26.0 82.63636 91.0
## 2 6 19.74286 19.7 122.28571 110.0
## 3 8 15.10000 15.2 209.21429 192.5
# data.table
mtcars[, as.list(unlist(lapply(.SD, function(x) {
list(
mean = mean(x),
median = median(x)
)
}))),
by = "cyl", .SDcols = c("mpg", "hp")
]
## cyl mpg.mean mpg.median hp.mean hp.median
## 1: 6 19.74286 19.7 122.28571 110.0
## 2: 4 26.66364 26.0 82.63636 91.0
## 3: 8 15.10000 15.2 209.21429 192.5
# dplyr
mtcars |>
dplyr::group_by(cyl) |>
dplyr::summarise(
mean_mpg = mean(mpg), mean_hp = mean(hp),
median_mpg = mean(mpg), median_hp = mean(hp)
)
## # A tibble: 3 × 5
## cyl mean_mpg mean_hp median_mpg median_hp
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 4 26.7 82.6 26.7 82.6
## 2 6 19.7 122. 19.7 122.
## 3 8 15.1 209. 15.1 209.
7.2.13 重命名多个列
tmp <- aggregate(
data = mtcars, cbind(mpg, hp) ~ cyl,
FUN = median
)
tmp <- as.data.table(tmp)
setnames(tmp, old = c("mpg", "hp"), new = c("median_mpg", "median_hp"))
tmp
## cyl median_mpg median_hp
## 1: 4 26.0 91.0
## 2: 6 19.7 110.0
## 3: 8 15.2 192.5
7.2.14 对多个列依次排序
https://stackoverflow.com/questions/1296646/how-to-sort-a-dataframe-by-multiple-columns
# base
tmp[order(median_mpg, -median_hp), ]
## cyl median_mpg median_hp
## 1: 8 15.2 192.5
## 2: 6 19.7 110.0
## 3: 4 26.0 91.0
# data.table
setorder(tmp, median_mpg, -median_hp)
# dplyr
dplyr::arrange(tmp, median_mpg, desc(median_hp))
## cyl median_mpg median_hp
## 1: 8 15.2 192.5
## 2: 6 19.7 110.0
## 3: 4 26.0 91.0
7.2.15 重排多个列的位置
# https://stackoverflow.com/questions/19619666/change-column-position-of-data-table
setcolorder(tmp, c("median_mpg", setdiff(names(tmp), "median_mpg")))
tmp
## median_mpg cyl median_hp
## 1: 15.2 8 192.5
## 2: 19.7 6 110.0
## 3: 26.0 4 91.0
## median_mpg cyl median_hp
## 1: 15.2 8 192.5
## 2: 19.7 6 110.0
## 3: 26.0 4 91.0
7.2.16 整理回归结果
dat <- split(iris, iris$Species)
mod <- lapply(dat, function(x) lm(Petal.Length ~ Sepal.Length, x))
mod <- lapply(mod, function(x) coef(summary(x)))
mod <- Map(function(x, y) {
x <- as.data.frame(x)
x$Species <- y
x
}, mod, names(dat))
mod <- do.call(rbind, mod)
mod
## Estimate Std. Error t value Pr(>|t|) Species
## setosa.(Intercept) 0.8030518 0.34387807 2.3352806 2.375647e-02 setosa
## setosa.Sepal.Length 0.1316317 0.06852690 1.9208760 6.069778e-02 setosa
## versicolor.(Intercept) 0.1851155 0.51421351 0.3599974 7.204283e-01 versicolor
## versicolor.Sepal.Length 0.6864698 0.08630708 7.9538056 2.586190e-10 versicolor
## virginica.(Intercept) 0.6104680 0.41710685 1.4635770 1.498279e-01 virginica
## virginica.Sepal.Length 0.7500808 0.06302606 11.9011203 6.297786e-16 virginica
# 管道操作
split(iris, iris$Species) %>%
lapply(., function(x) coef(summary(lm(Petal.Length ~ Sepal.Length, x)))) %>%
Map(function(x, y) {
x <- as.data.frame(x)
x$Species <- y
x
}, ., levels(iris$Species)) %>%
do.call(rbind, .)
## Estimate Std. Error t value Pr(>|t|) Species
## setosa.(Intercept) 0.8030518 0.34387807 2.3352806 2.375647e-02 setosa
## setosa.Sepal.Length 0.1316317 0.06852690 1.9208760 6.069778e-02 setosa
## versicolor.(Intercept) 0.1851155 0.51421351 0.3599974 7.204283e-01 versicolor
## versicolor.Sepal.Length 0.6864698 0.08630708 7.9538056 2.586190e-10 versicolor
## virginica.(Intercept) 0.6104680 0.41710685 1.4635770 1.498279e-01 virginica
## virginica.Sepal.Length 0.7500808 0.06302606 11.9011203 6.297786e-16 virginica
# dplyr 操作,需要 dplyr >= 1.0.0
iris %>%
dplyr::group_by(Species) %>%
dplyr::summarise(broom::tidy(lm(Petal.Length ~ Sepal.Length)))
## # A tibble: 6 × 6
## # Groups: Species [3]
## Species term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa (Intercept) 0.803 0.344 2.34 2.38e- 2
## 2 setosa Sepal.Length 0.132 0.0685 1.92 6.07e- 2
## 3 versicolor (Intercept) 0.185 0.514 0.360 7.20e- 1
## 4 versicolor Sepal.Length 0.686 0.0863 7.95 2.59e-10
## 5 virginica (Intercept) 0.610 0.417 1.46 1.50e- 1
## 6 virginica Sepal.Length 0.750 0.0630 11.9 6.30e-16
7.2.17 :=
和 .()
mtcars[, mpg_rate := round(mpg / sum(mpg) * 100, digits = 2), by = .(cyl, vs, am)]
mtcars[, .(mpg_rate, mpg, cyl, vs, am)]
## mpg_rate mpg cyl vs am
## 1: 34.04 21.0 6 0 1
## 2: 34.04 21.0 6 0 1
## 3: 11.48 22.8 4 1 1
## 4: 27.97 21.4 6 1 0
## 5: 10.35 18.7 8 0 0
## 6: 23.66 18.1 6 1 0
## 7: 7.92 14.3 8 0 0
## 8: 35.52 24.4 4 1 0
## 9: 33.19 22.8 4 1 0
## 10: 25.10 19.2 6 1 0
## 11: 23.27 17.8 6 1 0
## 12: 9.08 16.4 8 0 0
## 13: 9.58 17.3 8 0 0
## 14: 8.42 15.2 8 0 0
## 15: 5.76 10.4 8 0 0
## 16: 5.76 10.4 8 0 0
## 17: 8.14 14.7 8 0 0
## 18: 16.31 32.4 4 1 1
## 19: 15.31 30.4 4 1 1
## 20: 17.07 33.9 4 1 1
## 21: 31.30 21.5 4 1 0
## 22: 8.58 15.5 8 0 0
## 23: 8.42 15.2 8 0 0
## 24: 7.36 13.3 8 0 0
## 25: 10.63 19.2 8 0 0
## 26: 13.75 27.3 4 1 1
## 27: 100.00 26.0 4 0 1
## 28: 15.31 30.4 4 1 1
## 29: 51.30 15.8 8 0 1
## 30: 31.93 19.7 6 0 1
## 31: 48.70 15.0 8 0 1
## 32: 10.78 21.4 4 1 1
## mpg_rate mpg cyl vs am
## cyl vs am mpg_rate
## 1: 6 0 1 34.04
## 2: 6 0 1 34.04
## 3: 6 0 1 31.93
## 4: 4 1 1 11.48
## 5: 4 1 1 16.31
## 6: 4 1 1 15.31
## 7: 4 1 1 17.07
## 8: 4 1 1 13.75
## 9: 4 1 1 15.31
## 10: 4 1 1 10.78
## 11: 6 1 0 27.97
## 12: 6 1 0 23.66
## 13: 6 1 0 25.10
## 14: 6 1 0 23.27
## 15: 8 0 0 10.35
## 16: 8 0 0 7.92
## 17: 8 0 0 9.08
## 18: 8 0 0 9.58
## 19: 8 0 0 8.42
## 20: 8 0 0 5.76
## 21: 8 0 0 5.76
## 22: 8 0 0 8.14
## 23: 8 0 0 8.58
## 24: 8 0 0 8.42
## 25: 8 0 0 7.36
## 26: 8 0 0 10.63
## 27: 4 1 0 35.52
## 28: 4 1 0 33.19
## 29: 4 1 0 31.30
## 30: 4 0 1 100.00
## 31: 8 0 1 51.30
## 32: 8 0 1 48.70
## cyl vs am mpg_rate
7.2.18 去掉含有缺失值的记录
airquality[complete.cases(airquality), ] |> head()
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
7.2.19 集合操作
match 和 %in%
https://d.cosx.org/d/421314
`%nin%` <- Negate("%in%")
# `%in%` <- function(x, table) match(x, table, nomatch = 0) > 0 # %in% 函数的定义
x <- letters[1:5]
y <- letters[3:8]
x %in% y
## [1] FALSE FALSE TRUE TRUE TRUE
x %nin% y
## [1] TRUE TRUE FALSE FALSE FALSE
返回一个逻辑向量,x 中的元素匹配到了就返回 TRUE,否则 FALSE, %nin%
是 %in%
的取反效果
match(x, y)
## [1] NA NA 1 2 3
x 在 y 中的匹配情况,匹配到了,就返回在 y 中匹配的位置,没有匹配到就返回 NA
setdiff(x, y)
## [1] "a" "b"
intersect(x, y)
## [1] "c" "d" "e"
union(x, y)
## [1] "a" "b" "c" "d" "e" "f" "g" "h"
7.2.20 对数值向量按既定分组计数
此数据处理过程陆续使用了 transform()
、cut()
和 aggregate()
三个函数
# 对数值向量按既定分组计数
dat <- data.frame(y = 1:12)
dat <- transform(dat, x = cut(y, breaks = c(0, 6, 9, 15)))
dat <- aggregate(data = dat, y ~ x, FUN = length)
ggplot(data = dat, aes(x = x, y = y)) +
geom_col()
data.frame(y = 1:12) %>%
transform(x = cut(y, breaks = c(0, 6, 9, 15))) %>%
aggregate(data = ., y ~ x, FUN = length) %>%
ggplot(data = ., aes(x = x, y = y)) +
geom_col()
对数值向量按分位数分组计数
7.2.21 分组排序
按变量 a 分组计算,之后按变量 b 降序排列
dat <- aggregate(data = iris, cbind(Sepal.Width, Sepal.Length) ~ Species, FUN = mean)
# 按 Species 降序排列
dat[order(dat$Species, decreasing = T), ]
## Species Sepal.Width Sepal.Length
## 3 virginica 2.974 6.588
## 2 versicolor 2.770 5.936
## 1 setosa 3.428 5.006
7.2.22 分组获取 Top 值
分组按既定规律取数,比如按 Species 分组取 Top 6
# 分组取 Top 6
do.call(rbind, lapply(split(iris, iris$Species),
FUN = function(x) head(x[order(x$Sepal.Length, decreasing = T), ], 6)
))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 5.8 4.0 1.2 0.2 setosa
## 2: 5.7 4.4 1.5 0.4 setosa
## 3: 5.7 3.8 1.7 0.3 setosa
## 4: 5.5 4.2 1.4 0.2 setosa
## 5: 5.5 3.5 1.3 0.2 setosa
## 6: 5.4 3.9 1.7 0.4 setosa
## 7: 7.0 3.2 4.7 1.4 versicolor
## 8: 6.9 3.1 4.9 1.5 versicolor
## 9: 6.8 2.8 4.8 1.4 versicolor
## 10: 6.7 3.1 4.4 1.4 versicolor
## 11: 6.7 3.0 5.0 1.7 versicolor
## 12: 6.7 3.1 4.7 1.5 versicolor
## 13: 7.9 3.8 6.4 2.0 virginica
## 14: 7.7 3.8 6.7 2.2 virginica
## 15: 7.7 2.6 6.9 2.3 virginica
## 16: 7.7 2.8 6.7 2.0 virginica
## 17: 7.7 3.0 6.1 2.3 virginica
## 18: 7.6 3.0 6.6 2.1 virginica
7.2.23 分组抽样
# 分组抽样
do.call(rbind, lapply(split(iris, iris$Species),
FUN = function(x) x[sample(1:nrow(x), size = 6), ]
))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 4.8 3.4 1.9 0.2 setosa
## 2: 4.8 3.4 1.6 0.2 setosa
## 3: 5.0 3.2 1.2 0.2 setosa
## 4: 4.7 3.2 1.6 0.2 setosa
## 5: 4.5 2.3 1.3 0.3 setosa
## 6: 4.4 3.2 1.3 0.2 setosa
## 7: 6.7 3.1 4.7 1.5 versicolor
## 8: 6.5 2.8 4.6 1.5 versicolor
## 9: 6.1 3.0 4.6 1.4 versicolor
## 10: 6.8 2.8 4.8 1.4 versicolor
## 11: 5.2 2.7 3.9 1.4 versicolor
## 12: 5.6 2.7 4.2 1.3 versicolor
## 13: 7.1 3.0 5.9 2.1 virginica
## 14: 6.4 3.2 5.3 2.3 virginica
## 15: 6.5 3.0 5.2 2.0 virginica
## 16: 6.1 3.0 4.9 1.8 virginica
## 17: 6.8 3.0 5.5 2.1 virginica
## 18: 6.9 3.1 5.1 2.3 virginica
7.2.24 分组计算分位数
## 0% 25% 50% 75% 100%
## Sepal.Length 1 1 1 1 1
## Sepal.Width 1 1 1 1 1
## Petal.Length 1 1 1 1 1
## Petal.Width 1 1 1 1 1
## Species 0 0 0 0 0
## Species Sepal.Length.0% Sepal.Length.25% Sepal.Length.50% Sepal.Length.75%
## 1 setosa 4.300 4.800 5.000 5.200
## 2 versicolor 4.900 5.600 5.900 6.300
## 3 virginica 4.900 6.225 6.500 6.900
## Sepal.Length.100% Sepal.Width.0% Sepal.Width.25% Sepal.Width.50%
## 1 5.800 2.300 3.200 3.400
## 2 7.000 2.000 2.525 2.800
## 3 7.900 2.200 2.800 3.000
## Sepal.Width.75% Sepal.Width.100%
## 1 3.675 4.400
## 2 3.000 3.400
## 3 3.175 3.800
# 对 Sepal.Length 按 Species 分组计算分位数
do.call("rbind", tapply(iris$Sepal.Length, iris$Species, quantile))
## 0% 25% 50% 75% 100%
## setosa 4.3 4.800 5.0 5.2 5.8
## versicolor 4.9 5.600 5.9 6.3 7.0
## virginica 4.9 6.225 6.5 6.9 7.9
# 分组取平均 mean /中位数 median
aggregate(data = iris, . ~ Species, FUN = mean)
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
7.2.25 计算日粒度的 DoD/WoW/MoM/YoY
截止写作时间,data.table 提供的滑动窗口聚合统计函数 frollmean()
、frollsum()
和 frollapply()
还处于实验阶段。 shift 提供漂移功能,向前前置 lead 或向后延迟 lag。
dat <- data.frame(dt = seq(
from = as.Date("2021-01-01"),
to = Sys.Date(), by = "1 day"
))
dat <- within(dat, {
uv = round(1000 * runif(n = nrow(dat)))
uv_dod_d = ifelse(nrow(dat) <= 1, NA, c(NA, diff(uv, lag = 1)))
uv_wow_d = ifelse(nrow(dat) <= 7, NA, c(rep(NA, 7), diff(uv, lag = 7)))
uv_mom_d = ifelse(nrow(dat) <= 30, NA, c(rep(NA, 30), diff(uv, lag = 30)))
uv_yoy_d = ifelse(nrow(dat) <= 365, NA, c(rep(NA, 365), diff(uv, lag = 365)))
})
7.3 运行环境
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] magrittr_2.0.3 data.table_1.14.2
##
## loaded via a namespace (and not attached):
## [1] highr_0.9 bslib_0.3.1 compiler_4.2.2 pillar_1.7.0
## [5] jquerylib_0.1.4 tools_4.2.2 sysfonts_0.8.8 digest_0.6.29
## [9] downlit_0.4.0 jsonlite_1.8.0 evaluate_0.15 memoise_2.0.1
## [13] lifecycle_1.0.1 tibble_3.1.7 pkgconfig_2.0.3 rlang_1.0.2
## [17] DBI_1.1.2 cli_3.3.0 curl_4.3.2 yaml_2.3.5
## [21] xfun_0.31 fastmap_1.1.0 stringr_1.4.0 dplyr_1.0.9
## [25] xml2_1.3.3 knitr_1.39 generics_0.1.2 fs_1.5.2
## [29] sass_0.4.1 vctrs_0.4.1 tidyselect_1.1.2 glue_1.6.2
## [33] R6_2.5.1 fansi_1.0.3 rmarkdown_2.14 bookdown_0.26
## [37] tidyr_1.2.0 purrr_0.3.4 backports_1.4.1 htmltools_0.5.2
## [41] ellipsis_0.3.2 assertthat_0.2.1 utf8_1.2.2 stringi_1.7.6
## [45] broom_0.8.0 cachem_1.0.6 crayon_1.5.1