## 1.4 summarize()

# Mean departure delay over the whole table, ignoring missing values.
flights %>%
  summarize(delay = mean(dep_delay, na.rm = TRUE))
#> # A tibble: 1 x 1
#>   delay
#>   <dbl>
#> 1  12.6

# Mean departure delay per day, with missing delays dropped.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
flights %>%
  group_by(year, month, day) %>%
  summarize(delay = mean(dep_delay, na.rm = TRUE))
#> # A tibble: 365 x 4
#> # Groups:   year, month [12]
#>    year month   day delay
#>   <int> <int> <int> <dbl>
#> 1  2013     1     1 11.5
#> 2  2013     1     2 13.9
#> 3  2013     1     3 11.0
#> 4  2013     1     4  8.95
#> 5  2013     1     5  5.73
#> 6  2013     1     6  7.15
#> # ... with 359 more rows

aggregate()函数的写法：

# Base-R version of the grouped mean. The formula interface of aggregate()
# drops rows with a missing dep_delay by default (na.action = na.omit).
# The pipe needs a right-hand side; head(20) shows the first 20 rows.
aggregate(dep_delay ~ year + month + day,
          FUN = mean,
          data = flights) %>%
  head(20)
#>    year month day dep_delay
#> 1  2013     1   1    11.549
#> 2  2013     2   1    10.853
#> 3  2013     3   1    11.016
#> 4  2013     4   1    12.421
#> 5  2013     5   1     2.903
#> 6  2013     6   1     2.778
#> 7  2013     7   1    56.234
#> 8  2013     8   1    34.574
#> 9  2013     9   1     4.233
#> 10 2013    10   1    -0.099
#> 11 2013    11   1    13.273
#> 12 2013    12   1     9.004
#> 13 2013     1   2    13.859
#> 14 2013     2   2     5.422
#> 15 2013     3   2     8.027
#> 16 2013     4   2     8.260
#> 17 2013     5   2     6.389
#> 18 2013     6   2    34.013
#> 19 2013     7   2    19.285
#> 20 2013     8   2    13.254

### 1.4.1 Missing values in summarize()

# Without na.rm = TRUE, any NA in a group makes that group's mean NA.
flights %>%
  group_by(year, month, day) %>%
  summarize(delay = mean(dep_delay))
#> # A tibble: 365 x 4
#> # Groups:   year, month [12]
#>    year month   day delay
#>   <int> <int> <int> <dbl>
#> 1  2013     1     1    NA
#> 2  2013     1     2    NA
#> 3  2013     1     3    NA
#> 4  2013     1     4    NA
#> 5  2013     1     5    NA
#> 6  2013     1     6    NA
#> # ... with 359 more rows

# Keep only flights that have both a departure delay and an arrival delay.
not_cancelled <- filter(flights, !is.na(dep_delay), !is.na(arr_delay))

# With the missing values removed up front, mean() needs no na.rm here.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarize(delay = mean(dep_delay))
#> # A tibble: 365 x 4
#> # Groups:   year, month [12]
#>    year month   day delay
#>   <int> <int> <int> <dbl>
#> 1  2013     1     1 11.4
#> 2  2013     1     2 13.7
#> 3  2013     1     3 10.9
#> 4  2013     1     4  8.97
#> 5  2013     1     5  5.73
#> 6  2013     1     6  7.15
#> # ... with 359 more rows

### 1.4.2 计数函数

n() 函数是一个与摘要函数 summarize() 配合的计数函数，它不需要任何参数，单独使用时，它计算的就是行计数：

# n() with no grouping returns the total number of rows.
summarize(flights, n = n())
#> # A tibble: 1 x 1
#>        n
#>    <int>
#> 1 336776

与 group_by() 联合使用时，它可以计算分组变量的每个水平上各有多少个观测：

# Number of flights in each month. n() counts the rows of each group, so this
# gives the same counts as count(flights, month). It is NOT the same as
# sum(month), which would add up the month numbers themselves.
flights %>%
  group_by(month) %>%
  summarize(n = n())
#> # A tibble: 12 x 2
#>   month     n
#>   <int> <int>
#> 1     1 27004
#> 2     2 24951
#> 3     3 28834
#> 4     4 28330
#> 5     5 28796
#> 6     6 28243
#> # ... with 6 more rows

n()会把缺失值也包含到计数中，如果想要计算出非缺失值的数量，可以使用sum(!is.na(x))。如果想要计算唯一值的数量，可以使用n_distinct(x)。

# Which destinations are served by the most distinct carriers?
flights %>%
  group_by(dest) %>%
  summarize(carriers = n_distinct(carrier)) %>%
  arrange(desc(carriers))
#> # A tibble: 105 x 2
#>   dest  carriers
#>   <chr>    <int>
#> 1 ATL          7
#> 2 BOS          7
#> 3 CLT          7
#> 4 ORD          7
#> 5 TPA          7
#> 6 AUS          6
#> # ... with 99 more rows

tally(x, wt = NULL, sort = FALSE, name = "n")

count(x, ..., wt = NULL, sort = FALSE, name = "n",
.drop = group_by_drop_default(x))

add_tally(x, wt, sort = FALSE, name = "n")

add_count(x, ..., wt = NULL, sort = FALSE, name = "n")

x %>% group_by(var) %>% tally() 是简化版的 group_by(var) + summarize(n())
x %>% count(var) 等价于 x %>% group_by(var) %>% tally()

x %>% group_by(var) %>% add_tally() 会在原数据集中增添一列，记录 var 的不同水平的计数，等价于 x %>% add_count(var)。注意这两个函数返回值的行数和原数据框相同（摘要数据框往往不利于细节观察）！！它们等价于 group_by(var) %>% mutate(n = n())

# On ungrouped data, tally() returns the number of rows.
tally(mtcars)
#>    n
#> 1 32

# Typical use of tally(): one count per group.
mtcars %>%
  group_by(cyl) %>%
  tally()
#> # A tibble: 3 x 2
#>     cyl     n
#>   <dbl> <int>
#> 1     4    11
#> 2     6     7
#> 3     8    14

# count() is equivalent to group_by() followed by tally().
count(mtcars, cyl)
#> # A tibble: 3 x 2
#>     cyl     n
#>   <dbl> <int>
#> 1     4    11
#> 2     6     7
#> 3     8    14

# count() can add a further grouping level on top of existing groups.
mtcars %>%
  group_by(gear) %>%
  count(carb)
#> # A tibble: 11 x 3
#> # Groups:   gear [3]
#>    gear  carb     n
#>   <dbl> <dbl> <int>
#> 1     3     1     3
#> 2     3     2     4
#> 3     3     3     3
#> 4     3     4     5
#> 5     4     1     4
#> 6     4     2     4
#> # ... with 5 more rows

# add_tally() is short-hand for mutate(n = n()): every row is kept and the
# row count is appended as column n. The call after the pipe was missing.
mtcars %>%
  add_tally()
#> # A tibble: 32 x 12
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb     n
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4    32
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4    32
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1    32
#> 4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1    32
#> 5  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2    32
#> 6  18.1     6   225   105  2.76  3.46  20.2     1     0     3     1    32
#> # ... with 26 more rows

# add_count() keeps every row and appends the number of rows sharing the same
# cyl value. The column is named "count" so the select() that follows can
# pick it up; mtcars has no such column on its own.
mtcars %>%
  add_count(cyl, name = "count") %>%
  select(cyl, count)
#> # A tibble: 32 x 2
#>     cyl count
#>   <dbl> <int>
#> 1     6     7
#> 2     6     7
#> 3     4    11
#> 4     6     7
#> 5     8    14
#> 6     6     7
#> # ... with 26 more rows

# add_count() is useful for groupwise filtering,
# e.g. show details for species that have a single member.
# The column n must be created with add_count(species) before filtering on it.
starwars %>%
  add_count(species) %>%
  filter(n == 1)
#> # A tibble: 29 x 14
#>   name  height  mass hair_color skin_color eye_color birth_year gender homeworld
#>   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>  <chr>
#> 1 Gree~    173    74 <NA>       green      black             44 male   Rodia
#> 2 Jabb~    175  1358 <NA>       green-tan~ orange           600 herma~ Nal Hutta
#> 3 Yoda      66    17 white      green      brown            896 male   <NA>
#> 4 Bossk    190   113 none       green      red               53 male   Trandosha
#> 5 Ackb~    180    83 none       brown mot~ orange            41 male   Mon Cala
#> 6 Wick~     88    20 brown      brown      brown              8 male   Endor
#> # ... with 23 more rows, and 5 more variables: species <chr>, films <list>,
#> #   vehicles <list>, starships <list>, n <int>

# Flights per destination, largest first; the count column is named "count"
# rather than the default "n". TRUE is spelled out because `T` can be
# reassigned.
not_cancelled %>%
  count(dest, sort = TRUE, name = "count")
#> # A tibble: 104 x 2
#>   dest  count
#>   <chr> <int>
#> 1 ATL   16837
#> 2 ORD   16566
#> 3 LAX   16026
#> 4 BOS   15022
#> 5 MCO   13967
#> 6 CLT   13674
#> # ... with 98 more rows

# With wt = distance, count() sums distance per tailnum instead of
# counting rows.
count(not_cancelled, tailnum, wt = distance)
#> # A tibble: 4,037 x 2
#>   tailnum      n
#>   <chr>    <dbl>
#> 1 D942DN    3418
#> 2 N0EGMQ  239143
#> 3 N10156  109664
#> 4 N102UW   25722
#> 5 N103US   24619
#> 6 N104UW   24616
#> # ... with 4,031 more rows

# Mean arrival delay for each plane (tailnum).
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(delay = mean(arr_delay))

# Distribution of the per-plane mean delays.
ggplot(delays) +
  geom_histogram(aes(x = delay))

# n = n() stores the number of rows in each tailnum group in a column named n,
# next to that group's mean arrival delay.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(
    delay = mean(arr_delay),
    n = n()
  )

# Mean delay against number of flights, one point per plane.
ggplot(delays) +
  geom_point(aes(x = n, y = delay), alpha = 0.1)

library(Lahman)

# Per-player totals: ab is the summed AB column and ba is summed H divided by
# summed AB. TRUE is spelled out because `T` can be reassigned.
batters <- Batting %>%
  group_by(playerID) %>%
  summarize(
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  )

# ba against ab for players with more than 100 in ab, with a smoother and
# no confidence band.
batters %>%
  filter(ab > 100) %>%
  ggplot(aes(x = ab, y = ba)) +
  geom_point() +
  geom_smooth(se = FALSE)

• 总打数越多，不同击球手的打击率之间变动越小
• 能力（ba）和击球机会数量（ab）之间存在正相关。这是因为球队会控制击球手的出场，很显然，球队会优先选择最好的队员。

# Sorting by ba alone puts players with a single at-bat at the top.
arrange(batters, desc(ba))
#> # A tibble: 19,428 x 3
#>   playerID     ba    ab
#>   <chr>     <dbl> <int>
#> 1 abramge01     1     1
#> 2 alberan01     1     1
#> 3 allarko01     1     1
#> 4 banisje01     1     1
#> 5 bartocl01     1     1
#> 6 bassdo01      1     1
#> # ... with 19,422 more rows

### 1.4.3 逻辑值的计数和比例:sum(x > 10) 和 mean(y == 0)

# How many flights left before 5am each day? (These usually indicate flights
# delayed from the previous day.) sum() of a logical vector counts the TRUEs.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarize(n_early = sum(dep_time < 500))
#> # A tibble: 365 x 4
#> # Groups:   year, month [12]
#>    year month   day n_early
#>   <int> <int> <int>   <int>
#> 1  2013     1     1       0
#> 2  2013     1     2       3
#> 3  2013     1     3       4
#> 4  2013     1     4       3
#> 5  2013     1     5       3
#> 6  2013     1     6       2
#> # ... with 359 more rows
# What proportion of flights each day arrived more than an hour late?
# mean() of a logical vector is the share of TRUEs.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarize(hour_perc = mean(arr_delay > 60))
#> # A tibble: 365 x 4
#> # Groups:   year, month [12]
#>    year month   day hour_perc
#>   <int> <int> <int>     <dbl>
#> 1  2013     1     1    0.0722
#> 2  2013     1     2    0.0851
#> 3  2013     1     3    0.0567
#> 4  2013     1     4    0.0396
#> 5  2013     1     5    0.0349
#> 6  2013     1     6    0.0470
#> # ... with 359 more rows

### 1.4.4 其他常用的摘要函数

R中还提供了许多常用的摘要函数

# Combine an aggregation function with logical subsetting, per day.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarize(
    # median arrival delay over all flights
    arr_delay1 = median(arr_delay),
    # median arrival delay over flights with a positive delay only
    arr_delay2 = median(arr_delay[arr_delay > 0])
  )
#> # A tibble: 365 x 5
#> # Groups:   year, month [12]
#>    year month   day arr_delay1 arr_delay2
#>   <int> <int> <int>      <dbl>      <dbl>
#> 1  2013     1     1          3         17
#> 2  2013     1     2          4         16
#> 3  2013     1     3          1         16
#> 4  2013     1     4         -8         16
#> 5  2013     1     5         -7         11
#> 6  2013     1     6         -1         15
#> # ... with 359 more rows

# Why is the distance to some destinations more variable than to others?
# Standard deviation of distance per destination, most variable first.
not_cancelled %>%
  group_by(dest) %>%
  summarize(distance_sd = sd(distance)) %>%
  arrange(desc(distance_sd))
#> # A tibble: 104 x 2
#>   dest  distance_sd
#>   <chr>       <dbl>
#> 1 EGE         10.5
#> 2 SAN         10.4
#> 3 SFO         10.2
#> 4 HNL         10.0
#> 5 SEA          9.98
#> 6 LAS          9.91
#> # ... with 98 more rows

# When do the first and last flights leave each day?
not_cancelled %>%
  group_by(year, month, day) %>%
  summarize(
    first = min(dep_time),
    last = max(dep_time)
  )
#> # A tibble: 365 x 5
#> # Groups:   year, month [12]
#>    year month   day first  last
#>   <int> <int> <int> <int> <int>
#> 1  2013     1     1   517  2356
#> 2  2013     1     2    42  2354
#> 3  2013     1     3    32  2349
#> 4  2013     1     4    25  2358
#> 5  2013     1     5    14  2357
#> 6  2013     1     6    16  2355
#> # ... with 359 more rows

# The 10th dep_time record of each day. nth() picks by position within the
# group, so this follows row order rather than sorted order.
not_cancelled %>%
  group_by(month, year, day) %>%
  summarize(tenth_dep = nth(dep_time, 10))
#> # A tibble: 365 x 4
#> # Groups:   month, year [12]
#>   month  year   day tenth_dep
#>   <int> <int> <int>     <int>
#> 1     1  2013     1       558
#> 2     1  2013     2       554
#> 3     1  2013     3       552
#> 4     1  2013     4       553
#> 5     1  2013     5       555
#> 6     1  2013     6       558
#> # ... with 359 more rows

### 1.4.5 多个分组变量的消耗

daily <- group_by(not_cancelled, year, month, day)
# Number of flight records per day. The outer parentheses print the result
# as well as assigning it.
(per_day <- summarize(daily, flights = n()))
#> # A tibble: 365 x 4
#> # Groups:   year, month [12]
#>    year month   day flights
#>   <int> <int> <int>   <int>
#> 1  2013     1     1     831
#> 2  2013     1     2     928
#> 3  2013     1     3     900
#> 4  2013     1     4     908
#> 5  2013     1     5     717
#> 6  2013     1     6     829
#> # ... with 359 more rows

# Number of flight records per month. per_day is still grouped by year and
# month, so summing its daily counts rolls them up one level.
(per_month <- summarize(per_day, flights = sum(flights)))
#> # A tibble: 12 x 3
#> # Groups:   year [1]
#>    year month flights
#>   <int> <int>   <int>
#> 1  2013     1   26398
#> 2  2013     2   23611
#> 3  2013     3   27902
#> 4  2013     4   27564
#> 5  2013     5   28128
#> 6  2013     6   27075
#> # ... with 6 more rows
## 等价于not_cancelled %>% group_by(year,month) %>% summarize(flights = n())

# Number of flight records per year. per_month is still grouped by year, so
# summing its monthly counts rolls them up one more level.
(per_year <- summarize(per_month, flights = sum(flights)))
#> # A tibble: 1 x 2
#>    year flights
#>   <int>   <int>
#> 1  2013  327346
## 等价于not_cancelled %>% group_by(year) %>% summarize(flights = n()) 

# ungroup() removes the grouping, so the summary covers the whole data set.
summarize(ungroup(daily), flights = n())
#> # A tibble: 1 x 1
#>   flights
#>     <int>
#> 1  327346