## 6.10 数据聚合

apropos("apply")
##  [1] "apply"      "dendrapply" "eapply"     "frollapply" "kernapply"
##  [6] "lapply"     "mapply"     "rapply"     "sapply"     "tapply"
## [11] "vapply"
# 分组求和 colSums colMeans max
unique(iris$Species) ## [1] setosa versicolor virginica ## Levels: setosa versicolor virginica # 分类求和 # colSums(iris[iris$Species == "setosa", -5])
# colSums(iris[iris$Species == "virginica", -5]) colSums(iris[iris$Species == "versicolor", -5])
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width
##        296.8        138.5        213.0         66.3
# apply(iris[iris$Species == "setosa", -5], 2, sum) # apply(iris[iris$Species == "setosa", -5], 2, mean)
# apply(iris[iris$Species == "setosa", -5], 2, min) # apply(iris[iris$Species == "setosa", -5], 2, max)
apply(iris[iris$Species == "setosa", -5], 2, quantile) ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 0% 4.3 2.300 1.000 0.1 ## 25% 4.8 3.200 1.400 0.2 ## 50% 5.0 3.400 1.500 0.2 ## 75% 5.2 3.675 1.575 0.3 ## 100% 5.8 4.400 1.900 0.6 aggregate: Compute Summary Statistics of Data Subsets # 按分类变量 Species 分组求和 # aggregate(subset(iris, select = -Species), by = list(iris[, "Species"]), FUN = sum) aggregate(iris[, -5], list(iris[, 5]), sum) ## Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width ## 1 setosa 250.3 171.4 73.1 12.3 ## 2 versicolor 296.8 138.5 213.0 66.3 ## 3 virginica 329.4 148.7 277.6 101.3 # 先确定位置，假设有很多分类变量 ind <- which("Species" == colnames(iris)) # 分组统计 aggregate(iris[, -ind], list(iris[, ind]), sum) ## Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width ## 1 setosa 250.3 171.4 73.1 12.3 ## 2 versicolor 296.8 138.5 213.0 66.3 ## 3 virginica 329.4 148.7 277.6 101.3 按照 Species 划分的类别，分组计算，使用公式表示形式，右边一定是分类变量，否则会报错误或者警告，输出奇怪的结果，请读者尝试运行aggregate(Species ~ Sepal.Length, data = iris, mean)。公式法表示分组计算，~ 左手边可以做加 +-*/ 取余 %% 等数学运算。下面以数据集 iris 为例，只对 Sepal.Length 按 Species 分组计算 aggregate(Sepal.Length ~ Species, data = iris, mean) ## Species Sepal.Length ## 1 setosa 5.006 ## 2 versicolor 5.936 ## 3 virginica 6.588 与上述分组统计结果一样的命令，在大数据集上， 与 aggregate 相比，tapply 要快很多，by 是 tapply 的包裹，处理速度差不多。读者可以构造伪随机数据集验证。 # tapply(iris$Sepal.Length, list(iris$Species), mean) with(iris, tapply(Sepal.Length, Species, mean)) ## setosa versicolor virginica ## 5.006 5.936 6.588 by(iris$Sepal.Length, iris$Species, mean) ## iris$Species: setosa
## [1] 5.006
## ------------------------------------------------------------
## iris$Species: versicolor ## [1] 5.936 ## ------------------------------------------------------------ ## iris$Species: virginica
## [1] 6.588

aggregate(. ~ Species, data = iris, mean)
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026

aggregate(Sepal.Length + Sepal.Width ~ Species, data = iris, mean)
##      Species Sepal.Length + Sepal.Width
## 1     setosa                      8.434
## 2 versicolor                      8.706
## 3  virginica                      9.562

# 查看数据
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $weight: num 42 51 59 64 76 93 106 125 149 171 ... ##$ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ... ##$ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
##  - attr(*, "labels")=List of 2
##   ..$x: chr "Time" ## ..$ y: chr "Body weight"
##  - attr(*, "units")=List of 2
##   ..$x: chr "(days)" ## ..$ y: chr "(gm)"

head(ChickWeight)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
....
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $weight: num 42 51 59 64 76 93 106 125 149 171 ... ##$ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ... ##$ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
....

aggregate(weight ~ Chick, data = ChickWeight, mean)
##    Chick    weight
## 1     18  37.00000
## 2     16  49.71429
## 3     15  60.12500
## 4     13  67.83333
## 5      9  81.16667
....
aggregate(weight ~ Diet, data = ChickWeight, mean)
##   Diet   weight
## 1    1 102.6455
## 2    2 122.6167
## 3    3 142.9500
## 4    4 135.2627

# 查看数据集
head(CO2)
##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2
str(CO2)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   84 obs. of  5 variables:
##  $Plant : Ord.factor w/ 12 levels "Qn1"<"Qn2"<"Qn3"<..: 1 1 1 1 1 1 1 2 2 2 ... ##$ Type     : Factor w/ 2 levels "Quebec","Mississippi": 1 1 1 1 1 1 1 1 1 1 ...
##  $Treatment: Factor w/ 2 levels "nonchilled","chilled": 1 1 1 1 1 1 1 1 1 1 ... ##$ conc     : num  95 175 250 350 500 675 1000 95 175 250 ...
##  $uptake : num 16 30.4 34.8 37.2 35.3 39.2 39.7 13.6 27.3 37.1 ... ## - attr(*, "formula")=Class 'formula' language uptake ~ conc | Plant ## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> ## - attr(*, "outer")=Class 'formula' language ~Treatment * Type ## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> ## - attr(*, "labels")=List of 2 ## ..$ x: chr "Ambient carbon dioxide concentration"
##   ..$y: chr "CO2 uptake rate" ## - attr(*, "units")=List of 2 ## ..$ x: chr "(uL/L)"
##   ..$y: chr "(umol/m^2 s)" 对单个变量分组统计 aggregate(uptake ~ Plant, data = CO2, mean) ## Plant uptake ## 1 Qn1 33.22857 ## 2 Qn2 35.15714 ## 3 Qn3 37.61429 ## 4 Qc1 29.97143 ## 5 Qc3 32.58571 ## 6 Qc2 32.70000 ## 7 Mn3 24.11429 ## 8 Mn2 27.34286 ## 9 Mn1 26.40000 ## 10 Mc2 12.14286 ## 11 Mc3 17.30000 ## 12 Mc1 18.00000 aggregate(uptake ~ Type, data = CO2, mean) ## Type uptake ## 1 Quebec 33.54286 ## 2 Mississippi 20.88333 aggregate(uptake ~ Treatment, data = CO2, mean) ## Treatment uptake ## 1 nonchilled 30.64286 ## 2 chilled 23.78333 对多个变量分组统计，查看二氧化碳吸收速率uptake随类型Type和处理方式Treatment aggregate(uptake ~ Type + Treatment, data = CO2, mean) ## Type Treatment uptake ## 1 Quebec nonchilled 35.33333 ## 2 Mississippi nonchilled 25.95238 ## 3 Quebec chilled 31.75238 ## 4 Mississippi chilled 15.81429 tapply(CO2$uptake, list(CO2$Type, CO2$Treatment), mean)
##             nonchilled  chilled
## Quebec        35.33333 31.75238
## Mississippi   25.95238 15.81429
by(CO2$uptake, list(CO2$Type, CO2$Treatment), mean) ## : Quebec ## : nonchilled ## [1] 35.33333 ## ------------------------------------------------------------ ## : Mississippi ## : nonchilled ## [1] 25.95238 ## ------------------------------------------------------------ ## : Quebec ## : chilled ## [1] 31.75238 ## ------------------------------------------------------------ ## : Mississippi ## : chilled ## [1] 15.81429 在这个例子中 tapply 和 by 的输出结果的表示形式不一样，aggregate 返回一个 data.frame 数据框，tapply 返回一个表格 table，by 返回特殊的数据类型 by。 Function by is an object-oriented wrapper for tapply applied to data frames. # 分组求和 # by(iris[, 1], INDICES = list(iris$Species), FUN = sum)
# by(iris[, 2], INDICES = list(iris$Species), FUN = sum) by(iris[, 3], INDICES = list(iris$Species), FUN = sum)
## : setosa
## [1] 73.1
## ------------------------------------------------------------
## : versicolor
## [1] 213
## ------------------------------------------------------------
## : virginica
## [1] 277.6
by(iris[1:3], INDICES = list(iris$Species), FUN = sum) ## : setosa ## [1] 494.8 ## ------------------------------------------------------------ ## : versicolor ## [1] 648.3 ## ------------------------------------------------------------ ## : virginica ## [1] 755.7 by(iris[1:3], INDICES = list(iris$Species), FUN = summary)
## : setosa
##   Sepal.Length    Sepal.Width     Petal.Length
##  Min.   :4.300   Min.   :2.300   Min.   :1.000
##  1st Qu.:4.800   1st Qu.:3.200   1st Qu.:1.400
##  Median :5.000   Median :3.400   Median :1.500
##  Mean   :5.006   Mean   :3.428   Mean   :1.462
##  3rd Qu.:5.200   3rd Qu.:3.675   3rd Qu.:1.575
##  Max.   :5.800   Max.   :4.400   Max.   :1.900
## ------------------------------------------------------------
## : versicolor
##   Sepal.Length    Sepal.Width     Petal.Length
##  Min.   :4.900   Min.   :2.000   Min.   :3.00
##  1st Qu.:5.600   1st Qu.:2.525   1st Qu.:4.00
##  Median :5.900   Median :2.800   Median :4.35
##  Mean   :5.936   Mean   :2.770   Mean   :4.26
##  3rd Qu.:6.300   3rd Qu.:3.000   3rd Qu.:4.60
##  Max.   :7.000   Max.   :3.400   Max.   :5.10
## ------------------------------------------------------------
## : virginica
##   Sepal.Length    Sepal.Width     Petal.Length
##  Min.   :4.900   Min.   :2.200   Min.   :4.500
##  1st Qu.:6.225   1st Qu.:2.800   1st Qu.:5.100
##  Median :6.500   Median :3.000   Median :5.550
##  Mean   :6.588   Mean   :2.974   Mean   :5.552
##  3rd Qu.:6.900   3rd Qu.:3.175   3rd Qu.:5.875
##  Max.   :7.900   Max.   :3.800   Max.   :6.900
by(iris, INDICES = list(iris$Species), FUN = summary) ## : setosa ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## Min. :4.300 Min. :2.300 Min. :1.000 Min. :0.100 ## 1st Qu.:4.800 1st Qu.:3.200 1st Qu.:1.400 1st Qu.:0.200 ## Median :5.000 Median :3.400 Median :1.500 Median :0.200 ## Mean :5.006 Mean :3.428 Mean :1.462 Mean :0.246 ## 3rd Qu.:5.200 3rd Qu.:3.675 3rd Qu.:1.575 3rd Qu.:0.300 ## Max. :5.800 Max. :4.400 Max. :1.900 Max. :0.600 ## Species ## setosa :50 ## versicolor: 0 ## virginica : 0 ## ## ## ## ------------------------------------------------------------ ## : versicolor ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## Min. :4.900 Min. :2.000 Min. :3.00 Min. :1.000 setosa : 0 ## 1st Qu.:5.600 1st Qu.:2.525 1st Qu.:4.00 1st Qu.:1.200 versicolor:50 ## Median :5.900 Median :2.800 Median :4.35 Median :1.300 virginica : 0 ## Mean :5.936 Mean :2.770 Mean :4.26 Mean :1.326 ## 3rd Qu.:6.300 3rd Qu.:3.000 3rd Qu.:4.60 3rd Qu.:1.500 ## Max. :7.000 Max. :3.400 Max. :5.10 Max. :1.800 ## ------------------------------------------------------------ ## : virginica ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## Min. :4.900 Min. :2.200 Min. :4.500 Min. :1.400 ## 1st Qu.:6.225 1st Qu.:2.800 1st Qu.:5.100 1st Qu.:1.800 ## Median :6.500 Median :3.000 Median :5.550 Median :2.000 ## Mean :6.588 Mean :2.974 Mean :5.552 Mean :2.026 ## 3rd Qu.:6.900 3rd Qu.:3.175 3rd Qu.:5.875 3rd Qu.:2.300 ## Max. :7.900 Max. :3.800 Max. :6.900 Max. :2.500 ## Species ## setosa : 0 ## versicolor: 0 ## virginica :50 ## ## ##  Group Averages Over Level Combinations of Factors 分组平均 str(warpbreaks) ## 'data.frame': 54 obs. of 3 variables: ##$ breaks : num  26 30 54 25 70 52 51 26 67 18 ...
##  $wool : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ... ##$ tension: Factor w/ 3 levels "L","M","H": 1 1 1 1 1 1 1 1 1 2 ...
head(warpbreaks)
##   breaks wool tension
## 1     26    A       L
## 2     30    A       L
## 3     54    A       L
## 4     25    A       L
## 5     70    A       L
## 6     52    A       L
ave(warpbreaks$breaks, warpbreaks$wool)
##  [1] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
##  [9] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [17] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [25] 31.03704 31.03704 31.03704 25.25926 25.25926 25.25926 25.25926 25.25926
## [33] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
## [41] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
## [49] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
with(warpbreaks, ave(breaks, tension, FUN = function(x) mean(x, trim = 0.1)))
##  [1] 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875
## [10] 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125
## [19] 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625
## [28] 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875
## [37] 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125
## [46] 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625
# 分组求和
with(warpbreaks, ave(breaks, tension, FUN = function(x) sum(x)))
##  [1] 655 655 655 655 655 655 655 655 655 475 475 475 475 475 475 475 475 475 390
## [20] 390 390 390 390 390 390 390 390 655 655 655 655 655 655 655 655 655 475 475
## [39] 475 475 475 475 475 475 475 390 390 390 390 390 390 390 390 390
# 分组求和
with(iris, ave(Sepal.Length, Species, FUN = function(x) sum(x)))
##   [1] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [13] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [25] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [37] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [49] 250.3 250.3 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [61] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [73] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [85] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [97] 296.8 296.8 296.8 296.8 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [109] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [121] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [133] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [145] 329.4 329.4 329.4 329.4 329.4 329.4