## 7.1 基础介绍

# 用一个真实的数据集替换，让每一个操作都有实际含义和价值 mtcars
DT <- data.table(
x = rep(c("b", "a", "c"), each = 3),
v = c(1, 1, 1, 2, 2, 1, 1, 2, 2),
y = c(1, 3, 6), a = 1:9, b = 9:1
)
DT
##    x v y a b
## 1: b 1 1 1 9
## 2: b 1 3 2 8
## 3: b 1 6 3 7
## 4: a 2 1 4 6
## 5: a 2 3 5 5
## 6: a 1 6 6 4
## 7: c 1 1 7 3
## 8: c 2 3 8 2
## 9: c 2 6 9 1
# 分组求和
DT[, sum(v), by = .(y %% 2)]
##    y V1
## 1: 1  9
## 2: 0  4
DT[, sum(v), by = .(bool = y %% 2)]
##    bool V1
## 1:    1  9
## 2:    0  4
DT[, .SD[2], by = x] # 每组第二行
##    x v y a b
## 1: b 1 3 2 8
## 2: a 2 3 5 5
## 3: c 2 3 8 2
DT[, tail(.SD, 2), by = x] # 每组最后两行
##    x v y a b
## 1: b 1 3 2 8
## 2: b 1 6 3 7
## 3: a 2 3 5 5
## 4: a 1 6 6 4
## 5: c 2 3 8 2
## 6: c 2 6 9 1
# 除了 x 列外，所有列都按 x 分组求和
DT[, lapply(.SD, sum), by = x]
##    x v  y  a  b
## 1: b 3 10  6 24
## 2: a 5 10 15 15
## 3: c 5 10 24  6
# 各个列都按 x 分组取最小
DT[, .SD[which.min(v)], by = x] # 分组嵌套查询
##    x v y a b
## 1: b 1 1 1 9
## 2: a 1 6 6 4
## 3: c 1 1 7 3
DT[, list(MySum = sum(v), MyMin = min(v), MyMax = max(v)), by = .(x, y %% 2)] # 表达式嵌套
##    x y MySum MyMin MyMax
## 1: b 1     2     1     1
## 2: b 0     1     1     1
## 3: a 1     4     2     2
## 4: a 0     1     1     1
## 5: c 1     3     1     2
## 6: c 0     2     2     2
DT[, .(a = .(a), b = .(b)), by = x] # 按 x 分组，将 a,b 两列的值列出来
##    x     a     b
## 1: b 1,2,3 9,8,7
## 2: a 4,5,6 6,5,4
## 3: c 7,8,9 3,2,1
DT[, .(seq = min(a):max(b)), by = x] # 列操作不仅仅是聚合
##     x seq
##  1: b   1
##  2: b   2
##  3: b   3
##  4: b   4
##  5: b   5
##  6: b   6
##  7: b   7
##  8: b   8
##  9: b   9
## 10: a   4
## 11: a   5
## 12: a   6
## 13: c   7
## 14: c   6
## 15: c   5
## 16: c   4
## 17: c   3
# 按 x 分组对 v 求和，然后过滤出和小于 20 的行
DT[, sum(v), by = x][V1 < 20] # 组合查询
##    x V1
## 1: b  3
## 2: a  5
## 3: c  5
DT[, sum(v), by = x][order(-V1)] # 对结果排序
##    x V1
## 1: a  5
## 2: c  5
## 3: b  3
DT[, c(.N, lapply(.SD, sum)), by = x] # 计算每一组的和，每一组的观测数
##    x N v  y  a  b
## 1: b 3 3 10  6 24
## 2: a 3 5 10 15 15
## 3: c 3 5 10 24  6
# 两个复杂的操作，还没弄清楚这个技术存在的意义
DT[,
{
tmp <- mean(y)
.(a = a - tmp, b = b - tmp)
},
by = x
] # anonymous lambda in 'j', j accepts any valid
##    x          a          b
## 1: b -2.3333333  5.6666667
## 2: b -1.3333333  4.6666667
## 3: b -0.3333333  3.6666667
## 4: a  0.6666667  2.6666667
## 5: a  1.6666667  1.6666667
## 6: a  2.6666667  0.6666667
## 7: c  3.6666667 -0.3333333
## 8: c  4.6666667 -1.3333333
## 9: c  5.6666667 -2.3333333
# using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
DT[, c(.(y = max(y)), lapply(.SD, min)), by = rleid(v), .SDcols = v:b]
##    rleid y v y a b
## 1:     1 6 1 1 1 7
## 2:     2 3 2 1 4 5
## 3:     3 6 1 1 6 3
## 4:     4 6 2 3 8 1

### 7.1.1 过滤

mtcars_df <- as.data.table(mtcars)

mtcars_df[cyl == 6 & gear == 4]
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4

mtcars_df[cyl == 6 & gear == 4, .(mpg, disp)]
##     mpg  disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
subset(x = mtcars_df, subset = cyl == 6 & gear == 4, select = c(mpg, disp))
##     mpg  disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
mtcars |>
dplyr::filter(cyl == 6 & gear == 4) |>
dplyr::select(mpg, disp)
##                mpg  disp
## Mazda RX4     21.0 160.0
## Mazda RX4 Wag 21.0 160.0
## Merc 280      19.2 167.6
## Merc 280C     17.8 167.6

### 7.1.2 变换

mtcars_df[, mean_mpg := mean(mpg)][, mean_disp := mean(disp)]
mtcars_df[1:6, ]
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb mean_mpg mean_disp
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 20.09062  230.7219
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 20.09062  230.7219
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 20.09062  230.7219
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 20.09062  230.7219
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 20.09062  230.7219
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 20.09062  230.7219
mtcars_df[, .(mean_mpg = mean(mpg), mean_disp = mean(disp))]
##    mean_mpg mean_disp
## 1: 20.09062  230.7219
# mtcars_df[, .(mean_mpg := mean(mpg), mean_disp := mean(disp))] # 报错
# 正确的姿势
mtcars_df[, :=(mean_mpg = mean(mpg), mean_disp = mean(disp))][, .(mpg, disp, mean_mpg, mean_disp)] |>  head()
##     mpg disp mean_mpg mean_disp
## 1: 21.0  160 20.09062  230.7219
## 2: 21.0  160 20.09062  230.7219
## 3: 22.8  108 20.09062  230.7219
## 4: 21.4  258 20.09062  230.7219
## 5: 18.7  360 20.09062  230.7219
## 6: 18.1  225 20.09062  230.7219
mtcars |>
dplyr::summarise(mean_mpg = mean(mpg), mean_disp = mean(disp))
##   mean_mpg mean_disp
## 1 20.09062  230.7219
mtcars |>
dplyr::mutate(mean_mpg = mean(mpg), mean_disp = mean(disp)) |>
dplyr::select(mpg, disp, mean_mpg, mean_disp) |>
head()
##                    mpg disp mean_mpg mean_disp
## Mazda RX4         21.0  160 20.09062  230.7219
## Mazda RX4 Wag     21.0  160 20.09062  230.7219
## Datsun 710        22.8  108 20.09062  230.7219
## Hornet 4 Drive    21.4  258 20.09062  230.7219
## Hornet Sportabout 18.7  360 20.09062  230.7219
## Valiant           18.1  225 20.09062  230.7219

### 7.1.3 聚合

dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
##    cyl     3      4    5
## 1:   4 21.50 26.925 28.2
## 2:   6 19.75 19.750 19.7
## 3:   8 15.05    NaN 15.4
tapply(mtcars$mpg, list(mtcars$cyl, mtcars$gear), mean) ## 3 4 5 ## 4 21.50 26.925 28.2 ## 6 19.75 19.750 19.7 ## 8 15.05 NA 15.4 mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)] ## cyl gear mean_mpg ## 1: 6 4 19.750 ## 2: 4 4 26.925 ## 3: 6 3 19.750 ## 4: 8 3 15.050 ## 5: 4 3 21.500 ## 6: 4 5 28.200 ## 7: 8 5 15.400 ## 8: 6 5 19.700 aggregate(data = mtcars_df, mpg ~ cyl + gear, FUN = mean) ## cyl gear mpg ## 1 4 3 21.500 ## 2 6 3 19.750 ## 3 8 3 15.050 ## 4 4 4 26.925 ## 5 6 4 19.750 ## 6 4 5 28.200 ## 7 6 5 19.700 ## 8 8 5 15.400 mtcars |> dplyr::group_by(cyl, gear) |> dplyr::summarise(mean_mpg = mean(mpg)) ## # A tibble: 8 × 3 ## # Groups: cyl [3] ## cyl gear mean_mpg ## <dbl> <dbl> <dbl> ## 1 4 3 21.5 ## 2 4 4 26.9 ## 3 4 5 28.2 ## 4 6 3 19.8 ## 5 6 4 19.8 ## 6 6 5 19.7 ## 7 8 3 15.0 ## 8 8 5 15.4 ### 7.1.4 命名 修改列名，另存一份生效 sub_mtcars_df <- mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)] setNames(sub_mtcars_df, c("cyl", "gear", "ave_mpg")) ## cyl gear ave_mpg ## 1: 6 4 19.750 ## 2: 4 4 26.925 ## 3: 6 3 19.750 ## 4: 8 3 15.050 ## 5: 4 3 21.500 ## 6: 4 5 28.200 ## 7: 8 5 15.400 ## 8: 6 5 19.700 # 注意 sub_mtcars_df 并没有修改列名 sub_mtcars_df ## cyl gear mean_mpg ## 1: 6 4 19.750 ## 2: 4 4 26.925 ## 3: 6 3 19.750 ## 4: 8 3 15.050 ## 5: 4 3 21.500 ## 6: 4 5 28.200 ## 7: 8 5 15.400 ## 8: 6 5 19.700 修改列名并直接起作用，在原来的数据集上生效 setnames(sub_mtcars_df, old = c("mean_mpg"), new = c("ave_mpg")) # sub_mtcars_df 已经修改了列名 sub_mtcars_df ## cyl gear ave_mpg ## 1: 6 4 19.750 ## 2: 4 4 26.925 ## 3: 6 3 19.750 ## 4: 8 3 15.050 ## 5: 4 3 21.500 ## 6: 4 5 28.200 ## 7: 8 5 15.400 ## 8: 6 5 19.700 修改列名最好使用 data.table 包的函数 setnames() 明确指出了要修改的列名， ### 7.1.5 排序 按照某（些）列从大到小或从小到大的顺序排列， 先按 cyl 升序，然后按 gear 降序 mtcars_df[, .(mpg, cyl, gear)][cyl == 4][order(cyl, -gear)] ## mpg cyl gear ## 1: 26.0 4 5 ## 2: 30.4 4 5 ## 3: 22.8 4 4 ## 4: 24.4 4 4 ## 5: 22.8 4 4 ## 6: 32.4 4 4 ## 7: 30.4 4 4 ## 8: 33.9 4 4 ## 9: 27.3 4 4 ## 10: 21.4 4 4 ## 11: 21.5 4 3 mtcars |> dplyr::select(mpg, cyl, gear) |> dplyr::filter(cyl == 4) |> dplyr::arrange(cyl, desc(gear)) ## mpg cyl gear ## Porsche 914-2 26.0 4 5 ## Lotus Europa 30.4 4 5 ## Datsun 710 22.8 4 4 ## Merc 240D 24.4 4 4 ## Merc 230 22.8 4 4 ## Fiat 128 32.4 4 4 ## Honda Civic 30.4 4 4 ## Toyota Corolla 33.9 4 4 ## Fiat X1-9 27.3 4 4 ## Volvo 142E 21.4 4 4 ## Toyota Corona 21.5 4 3 ### 7.1.6 变形 melt 宽的变长的 DT <- data.table( i_1 = c(1:5, NA), i_2 = c(NA, 6, 7, 8, 9, 10), f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered = TRUE), c_1 = sample(c(letters[1:3], NA), 6, TRUE), d_1 = as.Date(c(1:3, NA, 4:5), origin = "2013-09-01"), d_2 = as.Date(6:1, origin = "2012-01-01") ) DT[, .(i_1, i_2, f_1, f_2)] ## i_1 i_2 f_1 f_2 ## 1: 1 NA a z ## 2: 2 6 a a ## 3: 3 7 c x ## 4: 4 8 c c ## 5: 5 9 b x ## 6: NA 10 a x melt(DT, id = 1:2, measure = c("f_1", "f_2")) ## i_1 i_2 variable value ## 1: 1 NA f_1 a ## 2: 2 6 f_1 a ## 3: 3 7 f_1 c ## 4: 4 8 f_1 c ## 5: 5 9 f_1 b ## 6: NA 10 f_1 a ## 7: 1 NA f_2 z ## 8: 2 6 f_2 a ## 9: 3 7 f_2 x ## 10: 4 8 f_2 c ## 11: 5 9 f_2 x ## 12: NA 10 f_2 x dcast 长的变宽的 sleep <- as.data.table(sleep) dcast(sleep, group ~ ID, value.var = "extra") ## group 1 2 3 4 5 6 7 8 9 10 ## 1: 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0 ## 2: 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4 # 如果有多个值 dcast(mtcars_df, cyl ~ gear, value.var = "mpg") ## cyl 3 4 5 ## 1: 4 1 8 2 ## 2: 6 2 4 1 ## 3: 8 12 0 2 dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean) ## cyl 3 4 5 ## 1: 4 21.50 26.925 28.2 ## 2: 6 19.75 19.750 19.7 ## 3: 8 15.05 NaN 15.4 tidyr 包提供数据变形的函数 tidyr::pivot_longer()tidyr::pivot_wider() 相比于 Base R 提供的 reshape() 和 data.table 提供的 melt()dcast() 更加形象的命名 tidyr::pivot_wider(data = sleep, names_from = "ID", values_from = "extra") ## # A tibble: 2 × 11 ## group 1 2 3 4 5 6 7 8 9 10 ## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ## 2 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4 reshape(data = sleep, v.names = "extra", idvar = "group", timevar = "ID", direction = "wide") ## group extra.1 extra.2 extra.3 extra.4 extra.5 extra.6 extra.7 extra.8 ## 1: 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 ## 2: 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 ## extra.9 extra.10 ## 1: 0.0 2.0 ## 2: 4.6 3.4 • idvar 分组变量 • timevar 组内编号 • v.names 个体观察值 • sep 新的列名是由参数 v.names (extra) 和参数值 timevar (ID) 拼接起来的，默认 sep = "." 推荐使用下划线来做分割 sep = "_" head(ToothGrowth) ## len supp dose ## 1 4.2 VC 0.5 ## 2 11.5 VC 0.5 ## 3 7.3 VC 0.5 ## 4 5.8 VC 0.5 ## 5 6.4 VC 0.5 ## 6 10.0 VC 0.5 ToothGrowth$time <- rep(1:10, 6)
reshape(ToothGrowth,
v.names = "len", idvar = c("supp", "dose"),
timevar = "time", direction = "wide"
)
##    supp dose len.1 len.2 len.3 len.4 len.5 len.6 len.7 len.8 len.9 len.10
## 1    VC  0.5   4.2  11.5   7.3   5.8   6.4  10.0  11.2  11.2   5.2    7.0
## 11   VC  1.0  16.5  16.5  15.2  17.3  22.5  17.3  13.6  14.5  18.8   15.5
## 21   VC  2.0  23.6  18.5  33.9  25.5  26.4  32.5  26.7  21.5  23.3   29.5
## 31   OJ  0.5  15.2  21.5  17.6   9.7  14.5  10.0   8.2   9.4  16.5    9.7
## 41   OJ  1.0  19.7  23.3  23.6  26.4  20.0  25.2  25.8  21.2  14.5   27.3
## 51   OJ  2.0  25.5  26.4  22.4  24.5  24.8  30.9  26.4  27.3  29.4   23.0

### 7.1.7 分组

Loblolly |>
dplyr::group_by(Seed) |>
dplyr::arrange(height, age, Seed) |>
dplyr::slice(1, dplyr::n())
## # A tibble: 28 × 3
## # Groups:   Seed [14]
##    height   age Seed
##     <dbl> <dbl> <ord>
##  1   3.93     3 329
##  2  56.4     25 329
##  3   4.12     3 327
##  4  56.8     25 327
##  5   4.38     3 325
##  6  58.5     25 325
##  7   3.91     3 307
##  8  59.1     25 307
##  9   3.46     3 331
## 10  59.5     25 331
## # ℹ 18 more rows

dplyr::slice() 和函数 slice.index() 有关系吗？

### 7.1.8 合并

data.table::mergedplyr::join

dt1 <- data.table(A = letters[1:10], X = 1:10, key = "A")
dt2 <- data.table(A = letters[5:14], Y = 1:10, key = "A")
merge(dt1, dt2) # 内连接
##    A  X Y
## 1: e  5 1
## 2: f  6 2
## 3: g  7 3
## 4: h  8 4
## 5: i  9 5
## 6: j 10 6

key = c("x","y","z") 或者 key = "x,y,z" 其中 x,y,z 是列名

data(band_members, band_instruments, package = "dplyr")
band_members
## # A tibble: 3 × 2
##   name  band
##   <chr> <chr>
## 1 Mick  Stones
## 2 John  Beatles
## 3 Paul  Beatles
band_instruments
## # A tibble: 3 × 2
##   name  plays
##   <chr> <chr>
## 1 John  guitar
## 2 Paul  bass
## 3 Keith guitar
dplyr::inner_join(band_members, band_instruments)
## # A tibble: 2 × 3
##   name  band    plays
##   <chr> <chr>   <chr>
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

list 列表里每个元素都是 data.frame 时，最适合用 data.table::rbindlist 合并

# 合并列表 https://recology.info/2018/10/limiting-dependencies/
function(x) {
tibble::as_tibble((x <- data.table::setDF(
data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
)
))
}
## function(x) {
##   tibble::as_tibble((x <- data.table::setDF(
##     data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
##   )
##   ))
## }