## 6.23 高频数据操作

set.seed(2020)
dat <- data.frame(
num_a = rep(seq(4), each = 4), num_b = rep(seq(4), times = 4),
group_a = sample(x = letters[1:3], size = 16, replace = T),
group_b = sample(x = LETTERS[1:3], size = 16, replace = T)
)
dat <- as.data.table(dat)
dat
##     num_a num_b group_a group_b
##  1:     1     1       c       B
##  2:     1     2       b       B
##  3:     1     3       a       B
##  4:     1     4       a       C
##  5:     2     1       b       B
##  6:     2     2       b       C
##  7:     2     3       a       B
##  8:     2     4       a       A
##  9:     3     1       b       C
## 10:     3     2       b       B
## 11:     3     3       b       B
## 12:     3     4       a       B
## 13:     4     1       b       C
## 14:     4     2       c       B
## 15:     4     3       b       C
## 16:     4     4       a       C

### 6.23.2 分组计数

dat[, .(length(num_a)), by = .(group_a)] # dat[, .N , by = .(group_a)]
##    group_a V1
## 1:       c  2
## 2:       b  8
## 3:       a  6
dat[, .(length(num_a)), by = .(group_b)]
##    group_b V1
## 1:       B  9
## 2:       C  6
## 3:       A  1
dat[, .(length(num_a)), by = .(group_a, group_b)]
##    group_a group_b V1
## 1:       c       B  2
## 2:       b       B  4
## 3:       a       B  3
## 4:       a       C  2
## 5:       b       C  4
## 6:       a       A  1

### 6.23.3 分组抽样

group_a 为组别， a、 b、 c 分别有 6、 8、 2 条记录

# 无放回的抽样
dt_sample_1 <- dat[, .SD[sample(x = .N, size = 2, replace = FALSE)], by = group_a]
# 有放回的随机抽样
dt_sample_2 <- dat[, .SD[sample(x = .N, size = 3, replace = TRUE)], by = group_a]

dat[, .SD[sample(x = .N, size = min(4, .N))], by = group_a]
##     group_a num_a num_b group_b
##  1:       c     1     1       B
##  2:       c     4     2       B
##  3:       b     3     2       B
##  4:       b     2     2       C
##  5:       b     2     1       B
##  6:       b     3     3       B
##  7:       a     1     3       B
##  8:       a     2     3       B
##  9:       a     2     4       A
## 10:       a     1     4       C

### 6.23.4 分组排序

data.table 包的分组排序问题 https://d.cosx.org/d/421650-datatable/3

dat[with(dat, order(-ave(num_a, group_a, FUN = max), -num_a)), ]
##     num_a num_b group_a group_b
##  1:     4     1       b       C
##  2:     4     2       c       B
##  3:     4     3       b       C
##  4:     4     4       a       C
##  5:     3     1       b       C
##  6:     3     2       b       B
##  7:     3     3       b       B
##  8:     3     4       a       B
##  9:     2     1       b       B
## 10:     2     2       b       C
## 11:     2     3       a       B
## 12:     2     4       a       A
## 13:     1     1       c       B
## 14:     1     2       b       B
## 15:     1     3       a       B
## 16:     1     4       a       C
# num_a 降序排列，然后对 group_a 升序排列
dat[with(dat, order(-num_a, group_a)), ]
##     num_a num_b group_a group_b
##  1:     4     4       a       C
##  2:     4     1       b       C
##  3:     4     3       b       C
##  4:     4     2       c       B
##  5:     3     4       a       B
##  6:     3     1       b       C
##  7:     3     2       b       B
##  8:     3     3       b       B
##  9:     2     3       a       B
## 10:     2     4       a       A
## 11:     2     1       b       B
## 12:     2     2       b       C
## 13:     1     3       a       B
## 14:     1     4       a       C
## 15:     1     2       b       B
## 16:     1     1       c       B
# 简写
dat[order(-num_a, group_a)]
##     num_a num_b group_a group_b
##  1:     4     4       a       C
##  2:     4     1       b       C
##  3:     4     3       b       C
##  4:     4     2       c       B
##  5:     3     4       a       B
##  6:     3     1       b       C
##  7:     3     2       b       B
##  8:     3     3       b       B
##  9:     2     3       a       B
## 10:     2     4       a       A
## 11:     2     1       b       B
## 12:     2     2       b       C
## 13:     1     3       a       B
## 14:     1     4       a       C
## 15:     1     2       b       B
## 16:     1     1       c       B

setorder() 函数直接修改原始数据记录的排序

setorder(dat, -num_a, group_a)

dat[, .(length(!is.na(num_a))), by = .(group_a)]
##    group_a V1
## 1:       c  2
## 2:       b  8
## 3:       a  6

dat[, .(length(unique(num_a))), by = .(group_a)]
##    group_a V1
## 1:       c  2
## 2:       b  4
## 3:       a  4

iris <- as.data.table(iris)
iris[order(-Sepal.Length), .SD[1:3], by = "Species"]
##       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1:  virginica          7.9         3.8          6.4         2.0
## 2:  virginica          7.7         3.8          6.7         2.2
## 3:  virginica          7.7         2.6          6.9         2.3
## 4: versicolor          7.0         3.2          4.7         1.4
## 5: versicolor          6.9         3.1          4.9         1.5
## 6: versicolor          6.8         2.8          4.8         1.4
## 7:     setosa          5.8         4.0          1.2         0.2
## 8:     setosa          5.7         4.4          1.5         0.4
## 9:     setosa          5.7         3.8          1.7         0.3

dat <- head(iris)
ind <- do.call(what = "order", args = dat[, c(5, 1, 2, 3)])
dat[ind, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1:          4.6         3.1          1.5         0.2  setosa
## 2:          4.7         3.2          1.3         0.2  setosa
## 3:          4.9         3.0          1.4         0.2  setosa
## 4:          5.0         3.6          1.4         0.2  setosa
## 5:          5.1         3.5          1.4         0.2  setosa
## 6:          5.4         3.9          1.7         0.4  setosa

iris = as.data.table(iris)
iris[order(-Sepal.Length), .SD[1:3], by="Species"]
##       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1:  virginica          7.9         3.8          6.4         2.0
## 2:  virginica          7.7         3.8          6.7         2.2
## 3:  virginica          7.7         2.6          6.9         2.3
## 4: versicolor          7.0         3.2          4.7         1.4
## 5: versicolor          6.9         3.1          4.9         1.5
## 6: versicolor          6.8         2.8          4.8         1.4
## 7:     setosa          5.8         4.0          1.2         0.2
## 8:     setosa          5.7         4.4          1.5         0.4
## 9:     setosa          5.7         3.8          1.7         0.3

ind <- do.call(what = "order", args = iris[,c(5,1,2,3)])
head(iris[ind, ])
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1:          4.3         3.0          1.1         0.1  setosa
## 2:          4.4         2.9          1.4         0.2  setosa
## 3:          4.4         3.0          1.3         0.2  setosa
## 4:          4.4         3.2          1.3         0.2  setosa
## 5:          4.5         2.3          1.3         0.3  setosa
## 6:          4.6         3.1          1.5         0.2  setosa

Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
4.3 3.0 1.1 0.1 setosa
4.4 2.9 1.4 0.2 setosa
4.4 3.0 1.3 0.2 setosa
4.4 3.2 1.3 0.2 setosa
4.5 2.3 1.3 0.3 setosa
4.6 3.1 1.5 0.2 setosa