1.3 select() | R for data science: tidyverse and beyond

1.3 `select()`

如今，数据集有几百个甚至几千个变量已经司空见惯。这种情况下，如何找出真正感兴趣的变量经常是一个挑战。通过基于变量名的操作，select()函数可以让我们快速生成一个有用的变量子集。

glimpse(flights)
#> Rows: 336,776
#> Columns: 19
#> $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
#> $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#> $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#> $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 60...
#> $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
#> $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 8...
#> $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
#> $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
#> $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
#> $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
#> $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
#> $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
#> $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
#> $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
#> $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6...
#> $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59...
#> $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-0...
# 选择 year,month,day 三个变量
flights %>% select(year, month, day)
#> # A tibble: 336,776 x 3
#>    year month   day
#>   <int> <int> <int>
#> 1  2013     1     1
#> 2  2013     1     1
#> 3  2013     1     1
#> 4  2013     1     1
#> 5  2013     1     1
#> 6  2013     1     1
#> # ... with 336,770 more rows

顺便说一句，把变量名写成字符串，或者改用变量在数据框中的列位置，select() 同样可以正常工作：flights %>% select("year", "month", "day") 和 flights %>% select(1, 2, 3) 都会返回与上面代码一样的结果，但是这两种方法都不值得推荐。

由于 select() 使用非标准求值 (non-standard evaluation)，在自定义函数中传递列名时，需要借助 {{ var }}、!!enquo(var) 或者 .data[[var]]，才能使 select() 按照我们希望的那样工作。

# Deliberately naive wrapper: `var` is passed to select() unchanged, so
# select() receives the expression `var` instead of the column the caller
# named. The tutorial keeps this version to demonstrate the failure that
# follows.
my_select <- function(df, var) {
  select(df, var)
}

df <- tibble(x = 1:3, y = 4:6)

# wrong way
df %>% my_select(x)
#> Error: Selections can't have missing values.

# solution 1
# Forward the caller's bare column name into select() by embracing the
# argument with the curly-curly operator.
my_select <- function(df, var) {
  df %>% select({{ var }})
}
df %>% my_select(x)
#> # A tibble: 3 x 1
#>       x
#>   <int>
#> 1     1
#> 2     2
#> 3     3

# solution 2
# Take the column name as a string and look it up through the `.data`
# pronoun, so the caller passes "x" rather than a bare symbol.
# NOTE: tidyselect >= 1.2.0 deprecates `.data` inside selections; on newer
# versions write select(df, all_of(var)) instead.
my_select <- function(df, var) {
  select(df, .data[[var]])
}
df %>% my_select("x")
#> # A tibble: 3 x 1
#>       x
#>   <int>
#> 1     1
#> 2     2
#> 3     3

# solution 3
# Capture the caller's argument as a quosure, then unquote it inside select().
my_select <- function(df, var) {
  col <- enquo(var)
  select(df, !!col)
}
df %>% my_select(x)
#> # A tibble: 3 x 1
#>       x
#>   <int>
#> 1     1
#> 2     2
#> 3     3

这里的问题同样适用于 dplyr 中的其他采用惰性求值的函数。另外, all_of(var) 和 .env$var 可以处理相反的问题。

var <- c("x", "y")

df %>% select(all_of(var))
#> # A tibble: 3 x 2
#>       x     y
#>   <int> <int>
#> 1     1     4
#> 2     2     5
#> 3     3     6
df %>% select(.env$var)
#> # A tibble: 3 x 2
#>       x     y
#>   <int> <int>
#> 1     1     4
#> 2     2     5
#> 3     3     6

select() 还可以重命名变量，但应该避免这样使用它，因为这样会丢掉所有未明确提及的变量。我们应该使用 select() 函数的变体 rename() 函数来重命名变量，它会把未提及的那些变量按照原名字放到生成的数据框里：

# 将 tailnum 重命名为 tail_num
rename(flights, tail_num = tailnum)
#> # A tibble: 336,776 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      517            515         2      830            819
#> 2  2013     1     1      533            529         4      850            830
#> 3  2013     1     1      542            540         2      923            850
#> 4  2013     1     1      544            545        -1     1004           1022
#> 5  2013     1     1      554            600        -6      812            837
#> 6  2013     1     1      554            558        -4      740            728
#> # ... with 336,770 more rows, and 11 more variables: arr_delay <dbl>,
#> #   carrier <chr>, flight <int>, tail_num <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

其他常见的select()函数用法如下所示：

## 选择“year”和“day”之间的所有变量，冒号
flights %>% select(year:day)
#> # A tibble: 336,776 x 3
#>    year month   day
#>   <int> <int> <int>
#> 1  2013     1     1
#> 2  2013     1     1
#> 3  2013     1     1
#> 4  2013     1     1
#> 5  2013     1     1
#> 6  2013     1     1
#> # ... with 336,770 more rows

## 选择不在“year”和“day”之间的所有列，减号
flights %>% select(-(year:day))
#> # A tibble: 336,776 x 16
#>   dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
#>      <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
#> 1      517            515         2      830            819        11 UA     
#> 2      533            529         4      850            830        20 UA     
#> 3      542            540         2      923            850        33 AA     
#> 4      544            545        -1     1004           1022       -18 B6     
#> 5      554            600        -6      812            837       -25 DL     
#> 6      554            558        -4      740            728        12 UA     
#> # ... with 336,770 more rows, and 9 more variables: flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>

还可以在select()函数中使用一些辅助函数：
* starts_with("abc")：匹配名字以“abc”开头的列

flights %>% select(starts_with("arr"))  ## 所有以arr开头的列
#> # A tibble: 336,776 x 2
#>   arr_time arr_delay
#>      <int>     <dbl>
#> 1      830        11
#> 2      850        20
#> 3      923        33
#> 4     1004       -18
#> 5      812       -25
#> 6      740        12
#> # ... with 336,770 more rows

ends_with("xyz"): 匹配名字以“xyz”结尾的列

## 所有以"delay"结尾的列
flights %>% select(ends_with("delay"))
#> # A tibble: 336,776 x 2
#>   dep_delay arr_delay
#>       <dbl>     <dbl>
#> 1         2        11
#> 2         4        20
#> 3         2        33
#> 4        -1       -18
#> 5        -6       -25
#> 6        -4        12
#> # ... with 336,770 more rows

contains("ijk")，匹配名字包含“ijk”的列

## 所有包含"time"的列
flights %>% select(contains("time"))
#> # A tibble: 336,776 x 6
#>   dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
#>      <int>          <int>    <int>          <int>    <dbl> <dttm>             
#> 1      517            515      830            819      227 2013-01-01 05:00:00
#> 2      533            529      850            830      227 2013-01-01 05:00:00
#> 3      542            540      923            850      160 2013-01-01 05:00:00
#> 4      544            545     1004           1022      183 2013-01-01 05:00:00
#> 5      554            600      812            837      116 2013-01-01 06:00:00
#> 6      554            558      740            728      150 2013-01-01 05:00:00
#> # ... with 336,770 more rows

matches("(.)\\1")：选择名字符合正则表达式要求的列，后面将具体讲述正则表达式
num_range("x",c(1,2,3))，选择名字为“x1”、“x2”、“x3”的列

# num_range() pastes the prefix onto each number, and the prefix is matched
# case-sensitively, so it must be the lower-case "x" used in the column names.
df <- tibble(x1 = 1, x2 = 2, x3 = 3)
df %>% select(num_range("x", 1:3))
#> # A tibble: 1 x 3
#>      x1    x2    x3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

one_of(character_1,···,character_n):如果某个列的名字出现在序列里，则选出它

flights %>% select(one_of("arr_delay", "dep_delay", "xyz"))
#> # A tibble: 336,776 x 2
#>   arr_delay dep_delay
#>       <dbl>     <dbl>
#> 1        11         2
#> 2        20         4
#> 3        33         2
#> 4       -18        -1
#> 5       -25        -6
#> 6        12        -4
#> # ... with 336,770 more rows

everything()：匹配所有(剩余)变量，当想要将几个变量移到数据集开头时，这种方法很有用

## 将time_hour和air_time两个变量移到flights数据的开头
flights %>% select(time_hour,air_time, everything())
#> # A tibble: 336,776 x 19
#>   time_hour           air_time  year month   day dep_time sched_dep_time
#>   <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
#> 1 2013-01-01 05:00:00      227  2013     1     1      517            515
#> 2 2013-01-01 05:00:00      227  2013     1     1      533            529
#> 3 2013-01-01 05:00:00      160  2013     1     1      542            540
#> 4 2013-01-01 05:00:00      183  2013     1     1      544            545
#> 5 2013-01-01 06:00:00      116  2013     1     1      554            600
#> 6 2013-01-01 05:00:00      150  2013     1     1      554            558
#> # ... with 336,770 more rows, and 12 more variables: dep_delay <dbl>,
#> #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>

last_col(offset = n)：选择倒数第n列，不设置offset时，默认选择最后一列

flights %>% select(last_col())
#> # A tibble: 336,776 x 1
#>   time_hour          
#>   <dttm>             
#> 1 2013-01-01 05:00:00
#> 2 2013-01-01 05:00:00
#> 3 2013-01-01 05:00:00
#> 4 2013-01-01 05:00:00
#> 5 2013-01-01 06:00:00
#> 6 2013-01-01 05:00:00
#> # ... with 336,770 more rows

利用这些帮助函数，我们可以为选择列设置任意数目的条件，select()中以逗号分隔的列表示“或” 关系，如：

## 找出以“arr”开头或者以“time”结尾的列
flights %>% select(starts_with("arr"), ends_with("time"))
#> # A tibble: 336,776 x 6
#>   arr_time arr_delay dep_time sched_dep_time sched_arr_time air_time
#>      <int>     <dbl>    <int>          <int>          <int>    <dbl>
#> 1      830        11      517            515            819      227
#> 2      850        20      533            529            830      227
#> 3      923        33      542            540            850      160
#> 4     1004       -18      544            545           1022      183
#> 5      812       -25      554            600            837      116
#> 6      740        12      554            558            728      150
#> # ... with 336,770 more rows

注意:
starts_with()、ends_with()、contains() 和 matches() 这几个按名字匹配的帮助函数默认忽略大小写:

flights %>% 
  select(ends_with("DELAY"))
#> # A tibble: 336,776 x 2
#>   dep_delay arr_delay
#>       <dbl>     <dbl>
#> 1         2        11
#> 2         4        20
#> 3         2        33
#> 4        -1       -18
#> 5        -6       -25
#> 6        -4        12
#> # ... with 336,770 more rows

如果要区分大小写，可以把上述帮助函数的参数设置为 ignore.case = FALSE

# With case-sensitive matching, "DELAY" matches no column name, so the result
# has zero columns.
flights %>%
  select(ends_with("DELAY", ignore.case = FALSE))
#> # A tibble: 336,776 x 0

1.3.1 练习

Exercise 1.7 从 flights 中选择 dep_time、dep_delay、arr_time、arr_delay，找出尽可能多的方法

先查看这些变量的位置：

glimpse(flights)
#> Rows: 336,776
#> Columns: 19
#> $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
#> $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#> $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#> $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 60...
#> $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
#> $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 8...
#> $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
#> $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
#> $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
#> $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
#> $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
#> $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
#> $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
#> $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
#> $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6...
#> $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59...
#> $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-0...

# 常规方法
flights %>% select(dep_time, dep_delay, arr_time, arr_delay)
#> # A tibble: 336,776 x 4
#>   dep_time dep_delay arr_time arr_delay
#>      <int>     <dbl>    <int>     <dbl>
#> 1      517         2      830        11
#> 2      533         4      850        20
#> 3      542         2      923        33
#> 4      544        -1     1004       -18
#> 5      554        -6      812       -25
#> 6      554        -4      740        12
#> # ... with 336,770 more rows

# one_of(): select by a character vector of column names. (one_of() is
# superseded in current tidyselect; any_of() / all_of() are the recommended
# replacements.)
vars <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
flights %>% select(one_of(vars))
#> # A tibble: 336,776 x 4
#>   dep_time dep_delay arr_time arr_delay
#>      <int>     <dbl>    <int>     <dbl>
#> 1      517         2      830        11
#> 2      533         4      850        20
#> 3      542         2      923        33
#> 4      544        -1     1004       -18
#> 5      554        -6      812       -25
#> 6      554        -4      740        12
#> # ... with 336,770 more rows

# 多个条件
flights %>% select(starts_with("arr_"), starts_with("dep_"))
#> # A tibble: 336,776 x 4
#>   arr_time arr_delay dep_time dep_delay
#>      <int>     <dbl>    <int>     <dbl>
#> 1      830        11      517         2
#> 2      850        20      533         4
#> 3      923        33      542         2
#> 4     1004       -18      544        -1
#> 5      812       -25      554        -6
#> 6      740        12      554        -4
#> # ... with 336,770 more rows

Exercise 1.8 如果在select()中多次计入一个变量名，会发生什么情况？

select()函数将会忽略重复出现的变量，只选出一列，同时也不会报错：

flights %>% select(year, month, day, year, month, day)
#> # A tibble: 336,776 x 3
#>    year month   day
#>   <int> <int> <int>
#> 1  2013     1     1
#> 2  2013     1     1
#> 3  2013     1     1
#> 4  2013     1     1
#> 5  2013     1     1
#> 6  2013     1     1
#> # ... with 336,770 more rows

##mutate()

除了选择现有的列，经常还需要添加新列。新列是现有列的函数，这就是 mutate() 函数的作用。
mutate() 总是将新列添加在最后,格式为 新列名= 表达式。为了便于观察它的效果，我们需要先创建一个更狭窄的数据集，以便能看到新变量。
例如，我们希望创建两个新列 gain 和 hours：gain 表示飞机在飞行过程中弥补的延误时间 (gain = arr_delay - dep_delay)，hours 则是把飞行时间换算成小时 (hours = air_time / 60)

(flights_gain <- flights %>% 
   select(ends_with("delay"),air_time) %>% 
   mutate(gain = arr_delay - dep_delay,
            hours = air_time / 60))
#> # A tibble: 336,776 x 5
#>   dep_delay arr_delay air_time  gain hours
#>       <dbl>     <dbl>    <dbl> <dbl> <dbl>
#> 1         2        11      227     9  3.78
#> 2         4        20      227    16  3.78
#> 3         2        33      160    31  2.67
#> 4        -1       -18      183   -17  3.05
#> 5        -6       -25      116   -19  1.93
#> 6        -4        12      150    16  2.5 
#> # ... with 336,770 more rows

一旦新列被创建，就可以立即使用。例如，可能想知道对gain做时间上的平均：

flights_gain %>% mutate(gain_per_hour = gain / hours)
#> # A tibble: 336,776 x 6
#>   dep_delay arr_delay air_time  gain hours gain_per_hour
#>       <dbl>     <dbl>    <dbl> <dbl> <dbl>         <dbl>
#> 1         2        11      227     9  3.78          2.38
#> 2         4        20      227    16  3.78          4.23
#> 3         2        33      160    31  2.67         11.6 
#> 4        -1       -18      183   -17  3.05         -5.57
#> 5        -6       -25      116   -19  1.93         -9.83
#> 6        -4        12      150    16  2.5           6.4 
#> # ... with 336,770 more rows

以上的数据转换也可以通过mutate()一次完成：

flights %>% 
  select(ends_with("delay"),air_time) %>% 
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain/hours)
#> # A tibble: 336,776 x 6
#>   dep_delay arr_delay air_time  gain hours gain_per_hour
#>       <dbl>     <dbl>    <dbl> <dbl> <dbl>         <dbl>
#> 1         2        11      227     9  3.78          2.38
#> 2         4        20      227    16  3.78          4.23
#> 3         2        33      160    31  2.67         11.6 
#> 4        -1       -18      183   -17  3.05         -5.57
#> 5        -6       -25      116   -19  1.93         -9.83
#> 6        -4        12      150    16  2.5           6.4 
#> # ... with 336,770 more rows

如果只想保留新变量，可以使用transmute()：

flights %>% 
  select(ends_with("delay"),air_time) %>% 
  transmute(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain/hours)
#> # A tibble: 336,776 x 3
#>    gain hours gain_per_hour
#>   <dbl> <dbl>         <dbl>
#> 1     9  3.78          2.38
#> 2    16  3.78          4.23
#> 3    31  2.67         11.6 
#> 4   -17  3.05         -5.57
#> 5   -19  1.93         -9.83
#> 6    16  2.5           6.4 
#> # ... with 336,770 more rows

1.3.2 常用创建函数

有多种函数可以帮助mutate()创建新变量。比较重要的一点是，这些函数必须是向量化的：它能接受一个向量作为输入，并返回一个向量作为输出，而且输入和输出向量长度相等。下面介绍一些比较常用的函数。

**算术运算符 +、-、*、/、^**
它们都是向量化的，使用所谓的“循环法则(recycling rules)”。如果一个参数比另一个参数短，那么前者会自动扩展到相同的长度。当某个参数是单个数值时，这种方式是最有用的，如air_time * 60 或者 hours * 60 + minute等。
算术运算符的另一个用途是与我们后面将很快学到的聚集函数结合起来使用。例如,x / sum(x)可以计算出x的各个分量在总数中的比例，y - mean(y)计算出y的各个分量与均值之间的差异。

模运算符 %/% 和 %%
%/%（整除）和%%（求余）满足 x == y * (x %/% y) + (x %% y)，这两个运算符在Python中分别是 // 和 % 。
模运算非常好用，因为它可以拆分整数。例如，在flights数据集中，可以根据dep_time计算出 hour 和 minute：

# dep_time stores clock time as an integer of the form HMM / HHMM
# (e.g. 517 means 5:17), so integer division and remainder by 100 split it
# into hours and minutes. air_time is a duration in minutes and must not be
# split this way.
flights %>%
  transmute(
    hour = dep_time %/% 100,
    minute = dep_time %% 100
  )
#> # A tibble: 336,776 x 2
#>    hour minute
#>   <dbl>  <dbl>
#> 1     5     17
#> 2     5     33
#> 3     5     42
#> 4     5     44
#> 5     5     54
#> 6     5     54
#> # ... with 336,770 more rows

对数函数 log()/log2()/log10()
在处理取值范围变化多个数量级的数据时，对数变换很有用。其他条件相同的情况下，更推荐使用log2()函数，因为它的解释很容易，对数变换后的变量每增加一个单位，意味着原始变量加倍；减少一个单位，则原始数据变为原来的一半。

偏移函数
lead()和lag()函数分别将一个向量向前或向后移动指定的单位：

x <- 1:10
## 将x向前移动2个单位
lead(x,n=2)
#>  [1]  3  4  5  6  7  8  9 10 NA NA
## 将x向后移动一个单位（默认n=1）
lag(x)
#>  [1] NA  1  2  3  4  5  6  7  8  9

累加和滚动聚合
R的基础包提供了计算累加和、累加积、累加最大值和累加最小值的函数：cumsum()、cumprod()、cummax()、cummin()；dplyr包还提供了cummean()函数以计算累积平均值。

x <- 1:10
cumsum(x)
#>  [1]  1  3  6 10 15 21 28 36 45 55
cumprod(x)
#>  [1]       1       2       6      24     120     720    5040   40320  362880
#> [10] 3628800
cummax(x)
#>  [1]  1  2  3  4  5  6  7  8  9 10
cummin(x)
#>  [1] 1 1 1 1 1 1 1 1 1 1
cummean(x)
#>  [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5

逻辑比较 <、<=、>、>=、==、!=
如果要进行一系列复杂的逻辑运算，最好将中间结果保存在新变量中，这样就可以检查每一步是否都符合预期。

排秩
排秩函数有很多，从min_rank()开始，它可以完成最常用的排秩任务。默认的排秩方式是，最小的值获得最前面的秩（升序），使用desc(x)可以让最大的值获得前面的名次,NA值对应的秩也是NA：

x <- c(1,2,2,NA,3,4)
min_rank(x)
#> [1]  1  2  2 NA  4  5
min_rank(desc(x))
#> [1]  5  3  3 NA  2  1

min_rank()函数把相同值赋予相同的秩，如果有n个值秩相同为x，则下一个值的秩会直接从x+n开始。如果min_rank()无法满足需要，可以看一下它的一些变体：

row_number(),相同值不同秩

x <- c(1,2,2,NA,3,4)
min_rank(x)
#> [1]  1  2  2 NA  4  5
row_number(x)
#> [1]  1  2  3 NA  4  5

dense_rank：相同值的秩相同，但下一个值的秩不会跳转

x <- c(1,2,2,NA,3,4)
min_rank(x)
#> [1]  1  2  2 NA  4  5
row_number(x)
#> [1]  1  2  3 NA  4  5
dense_rank(x)
#> [1]  1  2  2 NA  3  4

percent_rank(): 将秩按照比例压缩为[0,1]的值

x <- c(1,2,2,NA,3,4)
percent_rank(x)
#> [1] 0.00 0.25 0.25   NA 0.75 1.00

ntile()：breaks the input vector into n buckets.

x <- c(1,3,4,5,6,8)
ntile(x,n=2)
#> [1] 1 1 1 2 2 2
ntile(x,n=3)
#> [1] 1 1 2 2 3 3

1.3.3 Exercises

Exercise 1.9 虽然现在的dep_time和sched_dep_time变量方便阅读，但不适合计算，因为它们实际上并不是连续型数值。将它们转换为一种更方便的表示形式，即从 0 点开始的分钟数

# 观察两个变量的存储方式
flights %>% 
  select(dep_time, sched_dep_time)
#> # A tibble: 336,776 x 2
#>   dep_time sched_dep_time
#>      <int>          <int>
#> 1      517            515
#> 2      533            529
#> 3      542            540
#> 4      544            545
#> 5      554            600
#> 6      554            558
#> # ... with 336,770 more rows

xyz 表示 x 点 yz 分，则总分钟数为x %/% 100 * 60 + x %% 100 ; 但有一个问题是，由于 0 点是用2400代表的，经过这样的转换它变为 1440，我们希望它变为 0，所以在外层再套一个%% 1440

flights %>% transmute(
  dep_time_mins = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440, 
  sched_dep_time_mins = (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440)
#> # A tibble: 336,776 x 2
#>   dep_time_mins sched_dep_time_mins
#>           <dbl>               <dbl>
#> 1           317                 315
#> 2           333                 329
#> 3           342                 340
#> 4           344                 345
#> 5           354                 360
#> 6           354                 358
#> # ... with 336,770 more rows

比较dep_time、sched_dep_time 和 dep_delay，这三者应该是何种关系？

flights_deptime <-
  mutate(flights,
    dep_time_min = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
    sched_dep_time_min = (sched_dep_time %/% 100 * 60 +
                          sched_dep_time %% 100) %% 1440,
    dep_delay_diff = dep_delay - dep_time_min + sched_dep_time_min
  )
filter(flights_deptime, dep_delay_diff != 0) %>% select(dep_delay_diff)
#> # A tibble: 1,236 x 1
#>   dep_delay_diff
#>            <dbl>
#> 1           1440
#> 2           1440
#> 3           1440
#> 4           1440
#> 5           1440
#> 6           1440
#> # ... with 1,230 more rows

如上所示，经过分钟的转换后，有1236行的dep_delay 不等于 dep_time - sched_dep_time. 有趣的是，这些差值全部等于1440。
> the discrepancies could be because a flight was scheduled to depart before midnight, but was delayed after midnight. All of these discrepancies are exactly equal to 1440 (24 hours), and the flights with these discrepancies were scheduled to depart later in the day.

使用排秩函数找出10个出发延误时间最长的航班

# min_rank() gives the smallest value rank 1 and arrange() sorts in ascending
# order by default, so dep_delay is wrapped in desc() to rank the longest
# delay first; filter() then keeps only ranks 1-10 (rows whose dep_delay is
# NA get an NA rank and are dropped).
flights %>%
  mutate(delay_rank = min_rank(desc(dep_delay))) %>%
  filter(delay_rank <= 10) %>%
  arrange(delay_rank) %>%
  select(year, month, day, dep_delay, delay_rank)
#> # A tibble: 10 x 5
#>    year month   day dep_delay delay_rank
#>   <int> <int> <int>     <dbl>      <int>
#> 1  2013     1     9      1301          1
#> 2  2013     6    15      1137          2
#> 3  2013     1    10      1126          3
#> 4  2013     9    20      1014          4
#> 5  2013     7    22      1005          5
#> 6  2013     4    10       960          6
#> # ... with 4 more rows

1:3 + 1:10会返回什么？为什么？

1:3 + 1:10
#>  [1]  2  4  6  5  7  9  8 10 12 11

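当两个向量长度不同时，较短的向量会被循环使用，因此 1:3 + 1:10 等价于 c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10)。由于 10 不是 3 的整数倍，R 在返回结果的同时还会给出一条警告（longer object length is not a multiple of shorter object length）。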

c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10)
#>  [1]  2  4  6  5  7  9  8 10 12 11