Chapter 5 Data transformation
Author: yueyanw
Reviewer: chunjiw at 11/3/2017
library(tidyverse)
library(nycflights13)
5.1
5.2
5.2.1
5.2.2
5.2.3
5.2.4 Exercises
- Find all flights that
Had an arrival delay of two or more hours
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 811 630 101 1047 ## 2 2013 1 1 848 1835 853 1001 ## 3 2013 1 1 957 733 144 1056 ## 4 2013 1 1 1114 900 134 1447 ## 5 2013 1 1 1505 1310 115 1638 ## 6 2013 1 1 1525 1340 105 1831 ## 7 2013 1 1 1549 1445 64 1912 ## 8 2013 1 1 1558 1359 119 1718 ## 9 2013 1 1 1732 1630 62 2028 ## 10 2013 1 1 1803 1620 103 2008 ## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Flew to Houston (
IAH
orHOU
)filter(flights, dest %in% c('IAH', 'HOU'))
## # A tibble: 9,313 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 623 627 -4 933 ## 4 2013 1 1 728 732 -4 1041 ## 5 2013 1 1 739 739 0 1104 ## 6 2013 1 1 908 908 0 1228 ## 7 2013 1 1 1028 1026 2 1350 ## 8 2013 1 1 1044 1045 -1 1352 ## 9 2013 1 1 1114 900 134 1447 ## 10 2013 1 1 1205 1200 5 1503 ## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
- Were operated by United, American, or Delta
- Departed in summer (July, August, and September)
- Arrived more than two hours late, but didn’t leave late
- Were delayed by at least an hour, but made up over 30 minutes in flight
Departed between midnight and 6am (inclusive)
5.3
5.3.1 Exercises
- How could you use
arrange()
to sort all missing values to the start? (Hint: useis.na()
).
df <- tibble(x = c(5, 2, NA))
arrange(df, !is.na(x), x)
## # A tibble: 3 x 1
## x
## <dbl>
## 1 NA
## 2 2
## 3 5
5.4
5.5
5.6
5.6.1
5.6.2
5.6.3
5.6.4
5.6.5
5.6.6
5.6.7 Exercises
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>% count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
not_cancelled %>% group_by(tailnum) %>% summarize(n = sum(distance))
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows