Chapter 7 Relational Data with dyplyr

Topics covered:

  • left_join keeps all observations in x;
  • right join keeps all observations in y;
  • full join keeps all observations in x and y;
  • inner join keeps observations in both x and y;
  • semi_join keeps all observations in x that have a match in y;
  • anti_join drops all observations in x that have a match in y.

7.1 join

#primary key vs. unit of analysis
#check whether there's observation whose n is greater than one
names(planes)
## [1] "tailnum"      "year"         "type"         "manufacturer" "model"        "engines"      "seats"       
## [8] "speed"        "engine"
(planes %>%
  count(tailnum) %>%
  filter(n>1))
## # A tibble: 0 x 2
## # … with 2 variables: tailnum <chr>, n <int>
weather %>%
  count(year, month, day, hour, origin) %>%
  filter(n>1)
## # A tibble: 3 x 6
##    year month   day  hour origin     n
##   <int> <int> <int> <int> <chr>  <int>
## 1  2013    11     3     1 EWR        2
## 2  2013    11     3     1 JFK        2
## 3  2013    11     3     1 LGA        2
flights %>%
  count(year, month, day, flight) %>%
  filter(n>1)
## # A tibble: 29,768 x 5
##     year month   day flight     n
##    <int> <int> <int>  <int> <int>
##  1  2013     1     1      1     2
##  2  2013     1     1      3     2
##  3  2013     1     1      4     2
##  4  2013     1     1     11     3
##  5  2013     1     1     15     2
##  6  2013     1     1     21     2
##  7  2013     1     1     27     4
##  8  2013     1     1     31     2
##  9  2013     1     1     32     2
## 10  2013     1     1     35     2
## # … with 29,758 more rows
flights %>%
  count(year, month, day, tailnum) %>%
  filter(n>1)
## # A tibble: 64,928 x 5
##     year month   day tailnum     n
##    <int> <int> <int> <chr>   <int>
##  1  2013     1     1 N0EGMQ      2
##  2  2013     1     1 N11189      2
##  3  2013     1     1 N11536      2
##  4  2013     1     1 N11544      3
##  5  2013     1     1 N11551      2
##  6  2013     1     1 N12540      2
##  7  2013     1     1 N12567      2
##  8  2013     1     1 N13123      2
##  9  2013     1     1 N13538      3
## 10  2013     1     1 N13566      3
## # … with 64,918 more rows
#mutating joins
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"       "sched_dep_time" "dep_delay"     
##  [7] "arr_time"       "sched_arr_time" "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"       "hour"           "minute"        
## [19] "time_hour"
flights2 <-
  flights %>%
  select(year: day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 x 8
##     year month   day  hour origin dest  tailnum carrier
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
##  1  2013     1     1     5 EWR    IAH   N14228  UA     
##  2  2013     1     1     5 LGA    IAH   N24211  UA     
##  3  2013     1     1     5 JFK    MIA   N619AA  AA     
##  4  2013     1     1     5 JFK    BQN   N804JB  B6     
##  5  2013     1     1     6 LGA    ATL   N668DN  DL     
##  6  2013     1     1     5 EWR    ORD   N39463  UA     
##  7  2013     1     1     6 EWR    FLL   N516JB  B6     
##  8  2013     1     1     6 LGA    IAD   N829AS  EV     
##  9  2013     1     1     6 JFK    MCO   N593JB  B6     
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA     
## # … with 336,766 more rows
#add/merge the full airline names
flights2 %>%
  select(-origin, -dest) %>%
  left_join(airlines, by="carrier")
## # A tibble: 336,776 x 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # … with 336,766 more rows
#by=c("a"="b", "c"="d")

#use mutate to realize left_join
flights2 %>%
  select(-origin, -dest) %>%
  mutate(name=airlines$name[match(carrier, airlines$carrier)])
## # A tibble: 336,776 x 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # … with 336,766 more rows
#different joins
x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  3, "x3"
)
x
## # A tibble: 3 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     3 x3
y <- tribble(
  ~key, ~val_y,
  1, "y1",
  2, "y2",
  4, "y3"
)
y
## # A tibble: 3 x 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2   
## 3     4 y3
#inner_join returns matched rows
x %>%
  left_join(y, by="key")
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>
#duplicate keys
#one-to-many relationship:
x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  2, "x3",
  1, "x4"
)
x
## # A tibble: 4 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     2 x3   
## 4     1 x4
y<- tribble(
  ~key, ~val_y,
  1, "y1",
  2, "y2"
)
y
## # A tibble: 2 x 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2
left_join(x,y,by="key")
## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x3    y2   
## 4     1 x4    y1
#many-to-many relationship returns the Cartesian product:
x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  2, "x3",
  3, "x4"
)
x
## # A tibble: 4 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     2 x3   
## 4     3 x4
y <- tribble(
  ~key, ~val_x,
  1, "y1",
  2, "y2",
  2, "y3",
  3, "y4"
)
y
## # A tibble: 4 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2   
## 3     2 y3   
## 4     3 y4
left_join(x,y,by="key")
## # A tibble: 6 x 3
##     key val_x.x val_x.y
##   <dbl> <chr>   <chr>  
## 1     1 x1      y1     
## 2     2 x2      y2     
## 3     2 x2      y3     
## 4     2 x3      y2     
## 5     2 x3      y3     
## 6     3 x4      y4
#defining the key columns
#by default, use all variables that appear in both tables (year, month, day, hour, and origin)
flights2 %>%
  left_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 x 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid wind_dir wind_speed wind_gust precip pressure
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>  <dbl>    <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4      260       12.7      NA        0    1012.
##  2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8      250       15.0      21.9      0    1011.
##  3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6      260       15.0      NA        0    1012.
##  4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6      260       15.0      NA        0    1012.
##  5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8      260       16.1      23.0      0    1012.
##  6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4      260       12.7      NA        0    1012.
##  7  2013     1     1     6 EWR    FLL   N516JB  B6       37.9  28.0  67.2      240       11.5      NA        0    1012.
##  8  2013     1     1     6 LGA    IAD   N829AS  EV       39.9  25.0  54.8      260       16.1      23.0      0    1012.
##  9  2013     1     1     6 JFK    MCO   N593JB  B6       37.9  27.0  64.3      260       13.8      NA        0    1013.
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA       39.9  25.0  54.8      260       16.1      23.0      0    1012.
## # … with 336,766 more rows, and 2 more variables: visib <dbl>, time_hour <dttm>
flights2 %>%
  left_join(planes, by="tailnum")
## # A tibble: 336,776 x 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type       manufacturer  model  engines seats speed engine
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>      <chr>         <chr>    <int> <int> <int> <chr> 
##  1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed win… BOEING        737-8…       2   149    NA Turbo…
##  2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed win… BOEING        737-8…       2   149    NA Turbo…
##  3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed win… BOEING        757-2…       2   178    NA Turbo…
##  4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed win… AIRBUS        A320-…       2   200    NA Turbo…
##  5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed win… BOEING        757-2…       2   178    NA Turbo…
##  6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixed win… BOEING        737-9…       2   191    NA Turbo…
##  7   2013     1     1     6 EWR    FLL   N516JB  B6        2000 Fixed win… AIRBUS INDUS… A320-…       2   200    NA Turbo…
##  8   2013     1     1     6 LGA    IAD   N829AS  EV        1998 Fixed win… CANADAIR      CL-60…       2    55    NA Turbo…
##  9   2013     1     1     6 JFK    MCO   N593JB  B6        2004 Fixed win… AIRBUS        A320-…       2   200    NA Turbo…
## 10   2013     1     1     6 LGA    ORD   N3ALAA  AA          NA <NA>       <NA>          <NA>        NA    NA    NA <NA>  
## # … with 336,766 more rows
airports
## # A tibble: 1,458 x 8
##    faa   name                             lat    lon   alt    tz dst   tzone              
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>              
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/New_York   
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/Chicago    
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/Chicago    
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/New_York   
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/New_York   
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/New_York   
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/New_York   
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/New_York   
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/New_York   
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/Los_Angeles
## # … with 1,448 more rows
flights2 %>%
  left_join(airports, c("dest"="faa"))
## # A tibble: 336,776 x 15
##     year month   day  hour origin dest  tailnum carrier name                      lat   lon   alt    tz dst   tzone       
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                   <dbl> <dbl> <dbl> <dbl> <chr> <chr>       
##  1  2013     1     1     5 EWR    IAH   N14228  UA      George Bush Interconti…  30.0 -95.3    97    -6 A     America/Chi…
##  2  2013     1     1     5 LGA    IAH   N24211  UA      George Bush Interconti…  30.0 -95.3    97    -6 A     America/Chi…
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami Intl               25.8 -80.3     8    -5 A     America/New…
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>                     NA    NA      NA    NA <NA>  <NA>        
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Hartsfield Jackson Atl…  33.6 -84.4  1026    -5 A     America/New…
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Chicago Ohare Intl       42.0 -87.9   668    -6 A     America/Chi…
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Fort Lauderdale Hollyw…  26.1 -80.2     9    -5 A     America/New…
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      Washington Dulles Intl   38.9 -77.5   313    -5 A     America/New…
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      Orlando Intl             28.4 -81.3    96    -5 A     America/New…
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      Chicago Ohare Intl       42.0 -87.9   668    -6 A     America/Chi…
## # … with 336,766 more rows
flights2 %>%
  left_join(airports, c("origin"="faa"))
## # A tibble: 336,776 x 15
##     year month   day  hour origin dest  tailnum carrier name                  lat   lon   alt    tz dst   tzone           
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>               <dbl> <dbl> <dbl> <dbl> <chr> <chr>           
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Newark Liberty Intl  40.7 -74.2    18    -5 A     America/New_York
##  2  2013     1     1     5 LGA    IAH   N24211  UA      La Guardia           40.8 -73.9    22    -5 A     America/New_York
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      John F Kennedy Intl  40.6 -73.8    13    -5 A     America/New_York
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      John F Kennedy Intl  40.6 -73.8    13    -5 A     America/New_York
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      La Guardia           40.8 -73.9    22    -5 A     America/New_York
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Newark Liberty Intl  40.7 -74.2    18    -5 A     America/New_York
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Newark Liberty Intl  40.7 -74.2    18    -5 A     America/New_York
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      La Guardia           40.8 -73.9    22    -5 A     America/New_York
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      John F Kennedy Intl  40.6 -73.8    13    -5 A     America/New_York
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      La Guardia           40.8 -73.9    22    -5 A     America/New_York
## # … with 336,766 more rows
#filtering joins
top_dest <- flights %>%
  count(dest, sort = T) %>%
  head(10)
top_dest
## # A tibble: 10 x 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705
flights %>%
  filter(dest %in% top_dest$dest)
## # A tibble: 141,145 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr> 
##  1  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK   
##  2  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA   
##  3  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR   
##  4  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR   
##  5  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK   
##  6  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA   
##  7  2013     1     1      558            600        -2      924            917         7 UA         194 N29129  JFK   
##  8  2013     1     1      558            600        -2      923            937       -14 UA        1124 N53441  EWR   
##  9  2013     1     1      559            559         0      702            706        -4 B6        1806 N708JB  JFK   
## 10  2013     1     1      600            600         0      851            858        -7 B6         371 N595JB  LGA   
## # … with 141,135 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
flights %>%
  semi_join(top_dest)
## Joining, by = "dest"
## # A tibble: 141,145 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr> 
##  1  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK   
##  2  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA   
##  3  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR   
##  4  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR   
##  5  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK   
##  6  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA   
##  7  2013     1     1      558            600        -2      924            917         7 UA         194 N29129  JFK   
##  8  2013     1     1      558            600        -2      923            937       -14 UA        1124 N53441  EWR   
##  9  2013     1     1      559            559         0      702            706        -4 B6        1806 N708JB  JFK   
## 10  2013     1     1      600            600         0      851            858        -7 B6         371 N595JB  LGA   
## # … with 141,135 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
flights %>%
  anti_join(planes, by="tailnum")%>%
  count(tailnum, sort = T)
## # A tibble: 722 x 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # … with 712 more rows