Chapter 7 Relational Data with dyplyr
Topics covered:
- left_join keeps all observations in x;
- right join keeps all observations in y;
- full join keeps all observations in x and y;
- inner join keeps observations in both x and y;
- semi_join keeps all observations in x that have a match in y;
- anti_join drops all observations in x that have a match in y.
7.1 join
#primary key vs. unit of analysis
#check whether there's observation whose n is greater than one
names(planes)
## [1] "tailnum" "year" "type" "manufacturer" "model" "engines" "seats"
## [8] "speed" "engine"
(planes %>%
count(tailnum) %>%
filter(n>1))
## # A tibble: 0 x 2
## # … with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n>1)
## # A tibble: 3 x 6
## year month day hour origin n
## <int> <int> <int> <int> <chr> <int>
## 1 2013 11 3 1 EWR 2
## 2 2013 11 3 1 JFK 2
## 3 2013 11 3 1 LGA 2
flights %>%
count(year, month, day, flight) %>%
filter(n>1)
## # A tibble: 29,768 x 5
## year month day flight n
## <int> <int> <int> <int> <int>
## 1 2013 1 1 1 2
## 2 2013 1 1 3 2
## 3 2013 1 1 4 2
## 4 2013 1 1 11 3
## 5 2013 1 1 15 2
## 6 2013 1 1 21 2
## 7 2013 1 1 27 4
## 8 2013 1 1 31 2
## 9 2013 1 1 32 2
## 10 2013 1 1 35 2
## # … with 29,758 more rows
flights %>%
count(year, month, day, tailnum) %>%
filter(n>1)
## # A tibble: 64,928 x 5
## year month day tailnum n
## <int> <int> <int> <chr> <int>
## 1 2013 1 1 N0EGMQ 2
## 2 2013 1 1 N11189 2
## 3 2013 1 1 N11536 2
## 4 2013 1 1 N11544 3
## 5 2013 1 1 N11551 2
## 6 2013 1 1 N12540 2
## 7 2013 1 1 N12567 2
## 8 2013 1 1 N13123 2
## 9 2013 1 1 N13538 3
## 10 2013 1 1 N13566 3
## # … with 64,918 more rows
#mutating joins
names(flights)
## [1] "year" "month" "day" "dep_time" "sched_dep_time" "dep_delay"
## [7] "arr_time" "sched_arr_time" "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance" "hour" "minute"
## [19] "time_hour"
flights2 <-
flights %>%
select(year: day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
#add/merge the full airline names
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by="carrier")
## # A tibble: 336,776 x 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
#by=c("a"="b", "c"="d")
#use mutate to realize left_join
flights2 %>%
select(-origin, -dest) %>%
mutate(name=airlines$name[match(carrier, airlines$carrier)])
## # A tibble: 336,776 x 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
#different joins
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
x
## # A tibble: 3 x 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
## 3 3 x3
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
y
## # A tibble: 3 x 2
## key val_y
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
## 3 4 y3
#inner_join returns matched rows
x %>%
left_join(y, by="key")
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
#duplicate keys
#one-to-many relationship:
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
x
## # A tibble: 4 x 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
## 3 2 x3
## 4 1 x4
y<- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
y
## # A tibble: 2 x 2
## key val_y
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
left_join(x,y,by="key")
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x3 y2
## 4 1 x4 y1
#many-to-many relationship returns the Cartesian product:
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
x
## # A tibble: 4 x 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
## 3 2 x3
## 4 3 x4
y <- tribble(
~key, ~val_x,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
y
## # A tibble: 4 x 2
## key val_x
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
## 3 2 y3
## 4 3 y4
left_join(x,y,by="key")
## # A tibble: 6 x 3
## key val_x.x val_x.y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x2 y3
## 4 2 x3 y2
## 5 2 x3 y3
## 6 3 x4 y4
#defining the key columns
#by default, use all variables that appear in both tables (year, month, day, hour, and origin)
flights2 %>%
left_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 x 18
## year month day hour origin dest tailnum carrier temp dewp humid wind_dir wind_speed wind_gust precip pressure
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4 260 12.7 NA 0 1012.
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8 250 15.0 21.9 0 1011.
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6 260 15.0 NA 0 1012.
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6 260 15.0 NA 0 1012.
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8 260 16.1 23.0 0 1012.
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4 260 12.7 NA 0 1012.
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2 240 11.5 NA 0 1012.
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8 260 16.1 23.0 0 1012.
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3 260 13.8 NA 0 1013.
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8 260 16.1 23.0 0 1012.
## # … with 336,766 more rows, and 2 more variables: visib <dbl>, time_hour <dttm>
flights2 %>%
left_join(planes, by="tailnum")
## # A tibble: 336,776 x 16
## year.x month day hour origin dest tailnum carrier year.y type manufacturer model engines seats speed engine
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixed win… BOEING 737-8… 2 149 NA Turbo…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixed win… BOEING 737-8… 2 149 NA Turbo…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixed win… BOEING 757-2… 2 178 NA Turbo…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixed win… AIRBUS A320-… 2 200 NA Turbo…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixed win… BOEING 757-2… 2 178 NA Turbo…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixed win… BOEING 737-9… 2 191 NA Turbo…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixed win… AIRBUS INDUS… A320-… 2 200 NA Turbo…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixed win… CANADAIR CL-60… 2 55 NA Turbo…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixed win… AIRBUS A320-… 2 200 NA Turbo…
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA <NA> <NA> <NA> NA NA NA <NA>
## # … with 336,766 more rows
airports
## # A tibble: 1,458 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_York
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_York
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_York
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/New_York
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_York
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/New_York
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/New_York
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_Angeles
## # … with 1,448 more rows
flights2 %>%
left_join(airports, c("dest"="faa"))
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon alt tz dst tzone
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA George Bush Interconti… 30.0 -95.3 97 -6 A America/Chi…
## 2 2013 1 1 5 LGA IAH N24211 UA George Bush Interconti… 30.0 -95.3 97 -6 A America/Chi…
## 3 2013 1 1 5 JFK MIA N619AA AA Miami Intl 25.8 -80.3 8 -5 A America/New…
## 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA NA NA <NA> <NA>
## 5 2013 1 1 6 LGA ATL N668DN DL Hartsfield Jackson Atl… 33.6 -84.4 1026 -5 A America/New…
## 6 2013 1 1 5 EWR ORD N39463 UA Chicago Ohare Intl 42.0 -87.9 668 -6 A America/Chi…
## 7 2013 1 1 6 EWR FLL N516JB B6 Fort Lauderdale Hollyw… 26.1 -80.2 9 -5 A America/New…
## 8 2013 1 1 6 LGA IAD N829AS EV Washington Dulles Intl 38.9 -77.5 313 -5 A America/New…
## 9 2013 1 1 6 JFK MCO N593JB B6 Orlando Intl 28.4 -81.3 96 -5 A America/New…
## 10 2013 1 1 6 LGA ORD N3ALAA AA Chicago Ohare Intl 42.0 -87.9 668 -6 A America/Chi…
## # … with 336,766 more rows
flights2 %>%
left_join(airports, c("origin"="faa"))
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon alt tz dst tzone
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA Newark Liberty Intl 40.7 -74.2 18 -5 A America/New_York
## 2 2013 1 1 5 LGA IAH N24211 UA La Guardia 40.8 -73.9 22 -5 A America/New_York
## 3 2013 1 1 5 JFK MIA N619AA AA John F Kennedy Intl 40.6 -73.8 13 -5 A America/New_York
## 4 2013 1 1 5 JFK BQN N804JB B6 John F Kennedy Intl 40.6 -73.8 13 -5 A America/New_York
## 5 2013 1 1 6 LGA ATL N668DN DL La Guardia 40.8 -73.9 22 -5 A America/New_York
## 6 2013 1 1 5 EWR ORD N39463 UA Newark Liberty Intl 40.7 -74.2 18 -5 A America/New_York
## 7 2013 1 1 6 EWR FLL N516JB B6 Newark Liberty Intl 40.7 -74.2 18 -5 A America/New_York
## 8 2013 1 1 6 LGA IAD N829AS EV La Guardia 40.8 -73.9 22 -5 A America/New_York
## 9 2013 1 1 6 JFK MCO N593JB B6 John F Kennedy Intl 40.6 -73.8 13 -5 A America/New_York
## 10 2013 1 1 6 LGA ORD N3ALAA AA La Guardia 40.8 -73.9 22 -5 A America/New_York
## # … with 336,766 more rows
#filtering joins
top_dest <- flights %>%
count(dest, sort = T) %>%
head(10)
top_dest
## # A tibble: 10 x 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
flights %>%
filter(dest %in% top_dest$dest)
## # A tibble: 141,145 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 2 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 3 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 4 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 5 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 6 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## 7 2013 1 1 558 600 -2 924 917 7 UA 194 N29129 JFK
## 8 2013 1 1 558 600 -2 923 937 -14 UA 1124 N53441 EWR
## 9 2013 1 1 559 559 0 702 706 -4 B6 1806 N708JB JFK
## 10 2013 1 1 600 600 0 851 858 -7 B6 371 N595JB LGA
## # … with 141,135 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
flights %>%
semi_join(top_dest)
## Joining, by = "dest"
## # A tibble: 141,145 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 2 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 3 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 4 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 5 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 6 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## 7 2013 1 1 558 600 -2 924 917 7 UA 194 N29129 JFK
## 8 2013 1 1 558 600 -2 923 937 -14 UA 1124 N53441 EWR
## 9 2013 1 1 559 559 0 702 706 -4 B6 1806 N708JB JFK
## 10 2013 1 1 600 600 0 851 858 -7 B6 371 N595JB LGA
## # … with 141,135 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
flights %>%
anti_join(planes, by="tailnum")%>%
count(tailnum, sort = T)
## # A tibble: 722 x 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # … with 712 more rows