Chapter 22 Joins & Merging

What You’ll Learn:

  • Types of joins (inner, left, right, full)
  • Join keys and matching
  • Common join errors
  • Multiple key joins
  • Handling duplicates
  • Anti and semi joins

Key Errors Covered: 20+ join errors

Difficulty: ⭐⭐⭐ Advanced

22.1 Introduction

Joining combines data from multiple sources:

library(dplyr)

# Two related datasets
customers <- tibble(
  id = 1:3,
  name = c("Alice", "Bob", "Charlie")
)

orders <- tibble(
  order_id = 1:4,
  customer_id = c(1, 1, 2, 4),
  amount = c(100, 150, 200, 75)
)

# Join them
customers %>%
  left_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 4 × 4
#>      id name    order_id amount
#>   <dbl> <chr>      <int>  <dbl>
#> 1     1 Alice          1    100
#> 2     1 Alice          2    150
#> 3     2 Bob            3    200
#> 4     3 Charlie       NA     NA

But joins have many pitfalls. Let’s master them!

22.2 Join Types

💡 Key Insight: Four Main Join Types

# Sample data
band_members
#> # A tibble: 3 × 2
#>   name  band   
#>   <chr> <chr>  
#> 1 Mick  Stones 
#> 2 John  Beatles
#> 3 Paul  Beatles
band_instruments
#> # A tibble: 3 × 2
#>   name  plays 
#>   <chr> <chr> 
#> 1 John  guitar
#> 2 Paul  bass  
#> 3 Keith guitar

# inner_join: Only matching rows
band_members %>%
  inner_join(band_instruments, by = "name")
#> # A tibble: 2 × 3
#>   name  band    plays 
#>   <chr> <chr>   <chr> 
#> 1 John  Beatles guitar
#> 2 Paul  Beatles bass

# left_join: All from left, matching from right
band_members %>%
  left_join(band_instruments, by = "name")
#> # A tibble: 3 × 3
#>   name  band    plays 
#>   <chr> <chr>   <chr> 
#> 1 Mick  Stones  <NA>  
#> 2 John  Beatles guitar
#> 3 Paul  Beatles bass

# right_join: All from right, matching from left
band_members %>%
  right_join(band_instruments, by = "name")
#> # A tibble: 3 × 3
#>   name  band    plays 
#>   <chr> <chr>   <chr> 
#> 1 John  Beatles guitar
#> 2 Paul  Beatles bass  
#> 3 Keith <NA>    guitar

# full_join: All rows from both
band_members %>%
  full_join(band_instruments, by = "name")
#> # A tibble: 4 × 3
#>   name  band    plays 
#>   <chr> <chr>   <chr> 
#> 1 Mick  Stones  <NA>  
#> 2 John  Beatles guitar
#> 3 Paul  Beatles bass  
#> 4 Keith <NA>    guitar

Visual representation: - inner_join: ∩ (intersection) - left_join: ← (left + intersection) - right_join: → (right + intersection) - full_join: ∪ (union, all rows)

22.3 Error #1: by must be supplied

⭐ BEGINNER 📋 ARGS

22.3.1 The Error

df1 <- tibble(id = 1:3, x = c("a", "b", "c"))
df2 <- tibble(identifier = 1:3, y = c("d", "e", "f"))

df1 %>%
  left_join(df2)
#> Error in `left_join()`:
#> ! `by` must be supplied when `x` and `y` have no common variables.
#> ℹ Use `cross_join()` to perform a cross-join.

🔴 ERROR (dplyr >= 1.1.0)

Error in left_join():
! `by` must be supplied when `x` and `y` have no common variables.

22.3.2 What It Means

The two datasets have no columns with the same name, so dplyr doesn’t know how to join them.

22.3.3 Common Causes

# Different column names for same concept
customers <- tibble(customer_id = 1:3, name = c("A", "B", "C"))
orders <- tibble(cust_id = c(1, 1, 2), amount = c(100, 150, 200))

customers %>%
  left_join(orders)  # No common columns!
#> Error in `left_join()`:
#> ! `by` must be supplied when `x` and `y` have no common variables.
#> ℹ Use `cross_join()` to perform a cross-join.

# Typo in column name
df1 <- tibble(id = 1:3, x = "a")
df2 <- tibble(ID = 1:3, y = "b")  # Different case!

df1 %>%
  left_join(df2)
#> Error in `left_join()`:
#> ! `by` must be supplied when `x` and `y` have no common variables.
#> ℹ Use `cross_join()` to perform a cross-join.

22.3.4 Solutions

SOLUTION 1: Specify Join Keys

# Use by argument with named vector
customers <- tibble(customer_id = 1:3, name = c("A", "B", "C"))
orders <- tibble(cust_id = c(1, 1, 2), amount = c(100, 150, 200))

customers %>%
  left_join(orders, by = c("customer_id" = "cust_id"))
#> # A tibble: 4 × 3
#>   customer_id name  amount
#>         <dbl> <chr>  <dbl>
#> 1           1 A        100
#> 2           1 A        150
#> 3           2 B        200
#> 4           3 C         NA

# Multiple keys
df1 <- tibble(id = 1:3, type = "A", x = 1:3)
df2 <- tibble(id = 1:3, type = "A", y = 4:6)

df1 %>%
  left_join(df2, by = c("id", "type"))
#> # A tibble: 3 × 4
#>      id type      x     y
#>   <int> <chr> <int> <int>
#> 1     1 A         1     4
#> 2     2 A         2     5
#> 3     3 A         3     6

# Or with different names
df1 <- tibble(id1 = 1:3, type1 = "A", x = 1:3)
df2 <- tibble(id2 = 1:3, type2 = "A", y = 4:6)

df1 %>%
  left_join(df2, by = c("id1" = "id2", "type1" = "type2"))
#> # A tibble: 3 × 4
#>     id1 type1     x     y
#>   <int> <chr> <int> <int>
#> 1     1 A         1     4
#> 2     2 A         2     5
#> 3     3 A         3     6

SOLUTION 2: Rename Columns First

customers <- tibble(customer_id = 1:3, name = c("A", "B", "C"))
orders <- tibble(cust_id = c(1, 1, 2), amount = c(100, 150, 200))

# Rename to match
orders_renamed <- orders %>%
  rename(customer_id = cust_id)

customers %>%
  left_join(orders_renamed, by = "customer_id")
#> # A tibble: 4 × 3
#>   customer_id name  amount
#>         <dbl> <chr>  <dbl>
#> 1           1 A        100
#> 2           1 A        150
#> 3           2 B        200
#> 4           3 C         NA

SOLUTION 3: Use join_by() (dplyr >= 1.1.0)

# Modern syntax (if available)
if (packageVersion("dplyr") >= "1.1.0") {
  customers %>%
    left_join(orders, join_by(customer_id == cust_id))
}
#> # A tibble: 4 × 3
#>   customer_id name  amount
#>         <dbl> <chr>  <dbl>
#> 1           1 A        100
#> 2           1 A        150
#> 3           2 B        200
#> 4           3 C         NA

22.4 Error #2: Duplicate keys creating many rows

⭐⭐ INTERMEDIATE 🧠 LOGIC

22.4.1 The Problem

# Data with duplicates
customers <- tibble(
  id = c(1, 1, 2),  # Duplicate id 1!
  name = c("Alice", "Alice", "Bob")
)

orders <- tibble(
  customer_id = c(1, 1, 2),  # Duplicate id 1!
  amount = c(100, 150, 200)
)

# Join creates cartesian product
result <- customers %>%
  left_join(orders, by = c("id" = "customer_id"))
#> Warning in left_join(., orders, by = c(id = "customer_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.

result
#> # A tibble: 5 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 Alice    100
#> 2     1 Alice    150
#> 3     1 Alice    100
#> 4     1 Alice    150
#> 5     2 Bob      200
nrow(result)  # 5 rows from 3 + 3!
#> [1] 5

22.4.2 What Happened

When both tables have duplicate keys, you get all combinations (cartesian product).

22.4.3 Common Causes

# Many-to-many relationship
students <- tibble(
  student_id = c(1, 1, 2, 2),
  course = c("Math", "English", "Math", "Science")
)

grades <- tibble(
  student_id = c(1, 1, 2, 2),
  grade = c("A", "B", "A", "B")
)

# Which grade goes with which course?
students %>%
  left_join(grades, by = "student_id")  # Ambiguous!
#> Warning in left_join(., grades, by = "student_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.
#> # A tibble: 8 × 3
#>   student_id course  grade
#>        <dbl> <chr>   <chr>
#> 1          1 Math    A    
#> 2          1 Math    B    
#> 3          1 English A    
#> 4          1 English B    
#> 5          2 Math    A    
#> 6          2 Math    B    
#> 7          2 Science A    
#> 8          2 Science B

22.4.4 Solutions

SOLUTION 1: Remove Duplicates Before Join

customers <- tibble(
  id = c(1, 1, 2),
  name = c("Alice", "Alice", "Bob")
)

# Keep unique customers
customers_unique <- customers %>%
  distinct(id, .keep_all = TRUE)

customers_unique %>%
  left_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 3 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 Alice    100
#> 2     1 Alice    150
#> 3     2 Bob      200

SOLUTION 2: Check for Duplicates

check_join_keys <- function(data, key_cols) {
  dups <- data %>%
    group_by(across(all_of(key_cols))) %>%
    filter(n() > 1) %>%
    ungroup()
  
  if (nrow(dups) > 0) {
    warning("Duplicate keys found:\n")
    print(dups)
    return(FALSE)
  }
  
  message("No duplicate keys")
  return(TRUE)
}

check_join_keys(customers, "id")
#> Warning in check_join_keys(customers, "id"): Duplicate keys found:
#> # A tibble: 2 × 2
#>      id name 
#>   <dbl> <chr>
#> 1     1 Alice
#> 2     1 Alice
#> [1] FALSE
check_join_keys(customers_unique, "id")
#> No duplicate keys
#> [1] TRUE

SOLUTION 3: Aggregate Before Join

# If duplicates make sense (e.g., multiple orders)
orders_summary <- orders %>%
  group_by(customer_id) %>%
  summarize(
    total_orders = n(),
    total_amount = sum(amount),
    .groups = "drop"
  )

customers_unique %>%
  left_join(orders_summary, by = c("id" = "customer_id"))
#> # A tibble: 2 × 4
#>      id name  total_orders total_amount
#>   <dbl> <chr>        <int>        <dbl>
#> 1     1 Alice            2          250
#> 2     2 Bob              1          200

SOLUTION 4: Use relationship argument (dplyr >= 1.1.0)

# Specify expected relationship
if (packageVersion("dplyr") >= "1.1.0") {
  # This will error if relationship is wrong
  # customers %>%
  #   left_join(orders, 
  #            by = c("id" = "customer_id"),
  #            relationship = "one-to-one")
  
  # Specify many-to-one
  customers %>%
    left_join(orders, 
             by = c("id" = "customer_id"),
             relationship = "many-to-one")
}
#> Error in `left_join()`:
#> ! Each row in `x` must match at most 1 row in `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.

22.5 Multiple Key Joins

💡 Key Insight: Joining on Multiple Columns

# Data with composite keys
sales <- tibble(
  store = c("A", "A", "B", "B"),
  product = c("X", "Y", "X", "Y"),
  quantity = c(10, 20, 15, 25)
)

prices <- tibble(
  store = c("A", "A", "B", "B"),
  product = c("X", "Y", "X", "Y"),
  price = c(5.00, 7.50, 4.50, 8.00)
)

# Join on both store and product
sales %>%
  left_join(prices, by = c("store", "product")) %>%
  mutate(revenue = quantity * price)
#> # A tibble: 4 × 5
#>   store product quantity price revenue
#>   <chr> <chr>      <dbl> <dbl>   <dbl>
#> 1 A     X             10   5      50  
#> 2 A     Y             20   7.5   150  
#> 3 B     X             15   4.5    67.5
#> 4 B     Y             25   8     200

# With different names
prices2 <- tibble(
  location = c("A", "A", "B", "B"),
  item = c("X", "Y", "X", "Y"),
  price = c(5.00, 7.50, 4.50, 8.00)
)

sales %>%
  left_join(prices2, by = c("store" = "location", "product" = "item"))
#> # A tibble: 4 × 4
#>   store product quantity price
#>   <chr> <chr>      <dbl> <dbl>
#> 1 A     X             10   5  
#> 2 A     Y             20   7.5
#> 3 B     X             15   4.5
#> 4 B     Y             25   8

22.6 Error #3: Unexpected NA values after join

⭐⭐ INTERMEDIATE 🧠 LOGIC

22.6.1 The Problem

customers <- tibble(
  id = 1:4,
  name = c("Alice", "Bob", "Charlie", "David")
)

orders <- tibble(
  customer_id = c(1, 1, 2, 5),  # Customer 5 doesn't exist!
  amount = c(100, 150, 200, 75)
)

# Left join
result <- customers %>%
  left_join(orders, by = c("id" = "customer_id"))

result
#> # A tibble: 5 × 3
#>      id name    amount
#>   <dbl> <chr>    <dbl>
#> 1     1 Alice      100
#> 2     1 Alice      150
#> 3     2 Bob        200
#> 4     3 Charlie     NA
#> 5     4 David       NA
# Charlie and David have no orders (NA in amount)

22.6.2 What Happened

Left join keeps all rows from left table. Unmatched rows get NA in right table columns.

22.6.3 Understanding NA Sources

# Right join - unmatched from right get NA in left columns
result_right <- customers %>%
  right_join(orders, by = c("id" = "customer_id"))

result_right
#> # A tibble: 4 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 Alice    100
#> 2     1 Alice    150
#> 3     2 Bob      200
#> 4     5 <NA>      75
# Customer 5 order has NA in name

# Inner join - no NAs (only matching rows)
result_inner <- customers %>%
  inner_join(orders, by = c("id" = "customer_id"))

result_inner
#> # A tibble: 3 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 Alice    100
#> 2     1 Alice    150
#> 3     2 Bob      200

22.6.4 Solutions

SOLUTION 1: Choose Right Join Type

# If you want only matching rows
customers %>%
  inner_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 3 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 Alice    100
#> 2     1 Alice    150
#> 3     2 Bob      200

# If you want all customers (even without orders)
customers %>%
  left_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 5 × 3
#>      id name    amount
#>   <dbl> <chr>    <dbl>
#> 1     1 Alice      100
#> 2     1 Alice      150
#> 3     2 Bob        200
#> 4     3 Charlie     NA
#> 5     4 David       NA

# If you want all orders (even with unknown customers)
customers %>%
  right_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 4 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 Alice    100
#> 2     1 Alice    150
#> 3     2 Bob      200
#> 4     5 <NA>      75

SOLUTION 2: Fill NAs After Join

customers %>%
  left_join(orders, by = c("id" = "customer_id")) %>%
  mutate(
    amount = replace_na(amount, 0),
    order_count = if_else(is.na(amount), 0L, 1L)
  )
#> # A tibble: 5 × 4
#>      id name    amount order_count
#>   <dbl> <chr>    <dbl>       <int>
#> 1     1 Alice      100           1
#> 2     1 Alice      150           1
#> 3     2 Bob        200           1
#> 4     3 Charlie      0           1
#> 5     4 David        0           1

# Or with tidyr
library(tidyr)
customers %>%
  left_join(orders, by = c("id" = "customer_id")) %>%
  replace_na(list(amount = 0))
#> # A tibble: 5 × 3
#>      id name    amount
#>   <dbl> <chr>    <dbl>
#> 1     1 Alice      100
#> 2     1 Alice      150
#> 3     2 Bob        200
#> 4     3 Charlie      0
#> 5     4 David        0

SOLUTION 3: Check for Unmatched Rows

# Find unmatched customers
customers %>%
  anti_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 2 × 2
#>      id name   
#>   <int> <chr>  
#> 1     3 Charlie
#> 2     4 David

# Find unmatched orders
orders %>%
  anti_join(customers, by = c("customer_id" = "id"))
#> # A tibble: 1 × 2
#>   customer_id amount
#>         <dbl>  <dbl>
#> 1           5     75

22.7 Filtering Joins

💡 Key Insight: semi_join() and anti_join()

# semi_join: Keep rows in x that have match in y
# Like inner_join but doesn't add columns from y
band_members %>%
  semi_join(band_instruments, by = "name")
#> # A tibble: 2 × 2
#>   name  band   
#>   <chr> <chr>  
#> 1 John  Beatles
#> 2 Paul  Beatles

# anti_join: Keep rows in x that DON'T have match in y
band_members %>%
  anti_join(band_instruments, by = "name")
#> # A tibble: 1 × 2
#>   name  band  
#>   <chr> <chr> 
#> 1 Mick  Stones

# Practical use: find missing data
customers <- tibble(id = 1:5, name = letters[1:5])
orders <- tibble(customer_id = c(1, 2, 2, 3), amount = c(100, 150, 200, 75))

# Customers with no orders
customers %>%
  anti_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 2 × 2
#>      id name 
#>   <int> <chr>
#> 1     4 d    
#> 2     5 e

# Customers with orders
customers %>%
  semi_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 3 × 2
#>      id name 
#>   <int> <chr>
#> 1     1 a    
#> 2     2 b    
#> 3     3 c

22.8 Error #4: Column name conflicts

⭐ BEGINNER 🏷️ NAMING

22.8.1 The Problem

df1 <- tibble(
  id = 1:3,
  value = c(10, 20, 30)
)

df2 <- tibble(
  id = 1:3,
  value = c(100, 200, 300)
)

# Both have 'value' column
result <- df1 %>%
  left_join(df2, by = "id")

result
#> # A tibble: 3 × 3
#>      id value.x value.y
#>   <int>   <dbl>   <dbl>
#> 1     1      10     100
#> 2     2      20     200
#> 3     3      30     300
# Creates value.x and value.y

22.8.2 What Happened

Columns with same name (not in join key) get suffixes.

22.8.3 Solutions

SOLUTION 1: Use suffix argument

df1 %>%
  left_join(df2, by = "id", suffix = c("_left", "_right"))
#> # A tibble: 3 × 3
#>      id value_left value_right
#>   <int>      <dbl>       <dbl>
#> 1     1         10         100
#> 2     2         20         200
#> 3     3         30         300

# Or more descriptive
df1 %>%
  left_join(df2, by = "id", suffix = c("_original", "_new"))
#> # A tibble: 3 × 3
#>      id value_original value_new
#>   <int>          <dbl>     <dbl>
#> 1     1             10       100
#> 2     2             20       200
#> 3     3             30       300

SOLUTION 2: Rename Before Join

df1 %>%
  rename(value1 = value) %>%
  left_join(
    df2 %>% rename(value2 = value),
    by = "id"
  )
#> # A tibble: 3 × 3
#>      id value1 value2
#>   <int>  <dbl>  <dbl>
#> 1     1     10    100
#> 2     2     20    200
#> 3     3     30    300

SOLUTION 3: Select Columns First

# Only keep needed columns from each
df1 %>%
  left_join(
    df2 %>% select(id, value2 = value),
    by = "id"
  )
#> Error in select(., id, value2 = value): unused arguments (id, value2 = value)

22.9 Common Join Patterns

🎯 Best Practice: Common Join Patterns

# 1. Lookup table (add descriptions)
products <- tibble(
  product_id = 1:3,
  quantity = c(10, 20, 15)
)

product_info <- tibble(
  product_id = 1:3,
  name = c("Widget", "Gadget", "Doohickey"),
  price = c(9.99, 19.99, 14.99)
)

products %>%
  left_join(product_info, by = "product_id")
#> # A tibble: 3 × 4
#>   product_id quantity name      price
#>        <int>    <dbl> <chr>     <dbl>
#> 1          1       10 Widget     9.99
#> 2          2       20 Gadget    20.0 
#> 3          3       15 Doohickey 15.0

# 2. Aggregate then join
orders <- tibble(
  customer_id = c(1, 1, 2, 2, 3),
  amount = c(100, 150, 200, 75, 300)
)

customers <- tibble(
  id = 1:3,
  name = c("Alice", "Bob", "Charlie")
)

order_summary <- orders %>%
  group_by(customer_id) %>%
  summarize(
    total_orders = n(),
    total_spent = sum(amount),
    avg_order = mean(amount),
    .groups = "drop"
  )

customers %>%
  left_join(order_summary, by = c("id" = "customer_id"))
#> # A tibble: 3 × 5
#>      id name    total_orders total_spent avg_order
#>   <dbl> <chr>          <int>       <dbl>     <dbl>
#> 1     1 Alice              2         250      125 
#> 2     2 Bob                2         275      138.
#> 3     3 Charlie            1         300      300

# 3. Multiple joins
students <- tibble(id = 1:3, name = c("A", "B", "C"))
math_grades <- tibble(student_id = 1:3, math = c(85, 90, 78))
english_grades <- tibble(student_id = 1:3, english = c(88, 92, 85))

students %>%
  left_join(math_grades, by = c("id" = "student_id")) %>%
  left_join(english_grades, by = c("id" = "student_id"))
#> # A tibble: 3 × 4
#>      id name   math english
#>   <int> <chr> <dbl>   <dbl>
#> 1     1 A        85      88
#> 2     2 B        90      92
#> 3     3 C        78      85

# 4. Self join
employees <- tibble(
  id = 1:4,
  name = c("Alice", "Bob", "Charlie", "David"),
  manager_id = c(NA, 1, 1, 2)
)

employees %>%
  left_join(
    employees %>% select(id, manager_name = name),
    by = c("manager_id" = "id")
  )
#> Error in select(., id, manager_name = name): unused arguments (id, manager_name = name)

# 5. Rolling join (closest match)
events <- tibble(
  time = c(1, 3, 5, 7),
  event = c("A", "B", "C", "D")
)

measurements <- tibble(
  time = c(0, 2, 4, 6, 8),
  value = c(10, 20, 30, 40, 50)
)

# Nearest match (simplified example)
events %>%
  left_join(measurements, by = "time")  # Only exact matches
#> # A tibble: 4 × 3
#>    time event value
#>   <dbl> <chr> <dbl>
#> 1     1 A        NA
#> 2     3 B        NA
#> 3     5 C        NA
#> 4     7 D        NA

22.10 Checking Join Results

🎯 Best Practice: Validate Joins

validate_join <- function(original, joined, join_type = "left") {
  cat("=== Join Validation ===\n\n")
  
  # Row count check
  cat("Original rows:", nrow(original), "\n")
  cat("Joined rows:", nrow(joined), "\n")
  
  if (join_type == "inner") {
    if (nrow(joined) > nrow(original)) {
      warning("Inner join resulted in MORE rows - duplicates in join key?")
    }
  } else if (join_type == "left") {
    if (nrow(joined) < nrow(original)) {
      warning("Left join resulted in FEWER rows - this shouldn't happen!")
    } else if (nrow(joined) > nrow(original)) {
      warning("Left join resulted in MORE rows - duplicates in right table?")
    }
  }
  
  # NA check
  na_cols <- names(joined)[colSums(is.na(joined)) > 0]
  if (length(na_cols) > 0) {
    cat("\nColumns with NAs:\n")
    for (col in na_cols) {
      n_na <- sum(is.na(joined[[col]]))
      cat("  ", col, ":", n_na, "NAs\n")
    }
  }
  
  # Duplicate check in key columns
  cat("\n")
}

# Test
customers <- tibble(id = 1:3, name = c("A", "B", "C"))
orders <- tibble(customer_id = c(1, 1, 2, 4), amount = c(100, 150, 200, 75))

result <- customers %>%
  left_join(orders, by = c("id" = "customer_id"))

validate_join(customers, result, "left")
#> === Join Validation ===
#> 
#> Original rows: 3 
#> Joined rows: 4
#> Warning in validate_join(customers, result, "left"): Left join resulted in MORE
#> rows - duplicates in right table?
#> 
#> Columns with NAs:
#>    amount : 1 NAs

22.11 Summary

Key Takeaways:

  1. Four main joins - inner, left, right, full
  2. Specify by argument - When column names differ
  3. Watch for duplicates - Can create many rows
  4. Left join creates NAs - For unmatched rows
  5. Use filtering joins - semi_join(), anti_join()
  6. Check join results - Validate row counts and NAs
  7. Name conflicts - Use suffix or rename

Quick Reference:

Error Cause Fix
by must be supplied No common columns Use by = c("x" = "y")
Too many rows Duplicate keys Check for duplicates, aggregate
Unexpected NAs Unmatched rows Use right join type, fill NAs
Column conflicts Same name, not in key Use suffix argument

Join Types:

# Mutating joins (add columns)
inner_join(x, y, by = "key")  # Only matching
left_join(x, y, by = "key")   # All x, matching y
right_join(x, y, by = "key")  # Matching x, all y
full_join(x, y, by = "key")   # All rows

# Filtering joins (filter rows)
semi_join(x, y, by = "key")   # x rows that match y
anti_join(x, y, by = "key")   # x rows that don't match y

# Different column names
left_join(x, y, by = c("x_id" = "y_id"))

# Multiple keys
left_join(x, y, by = c("id", "type"))

# Handle name conflicts
left_join(x, y, by = "id", suffix = c("_x", "_y"))

Best Practices:

# ✅ Good
left_join(x, y, by = "key")              # Explicit by
Check for duplicates before join          # Avoid cartesian
Validate row counts after join            # Catch issues
Use appropriate join type                 # inner vs left vs full

# ❌ Avoid
left_join(x, y)  # No by (relies on names)
Join without checking duplicates          # Surprise row explosion
Ignore NAs after join                     # May indicate issues
Always use left_join                      # Sometimes inner is right

22.12 Exercises

📝 Exercise 1: Basic Joins

Given:

products <- tibble(id = 1:5, name = c("A", "B", "C", "D", "E"))
sales <- tibble(product_id = c(1, 2, 2, 3, 6), amount = c(100, 150, 200, 75, 50))
  1. Inner join - only products with sales
  2. Left join - all products, add sales
  3. Find products with no sales
  4. Find sales for unknown products

📝 Exercise 2: Duplicate Handling

Given data with duplicates:

customers <- tibble(id = c(1, 1, 2, 3), name = c("A", "A", "B", "C"))
orders <- tibble(cust_id = c(1, 2, 2, 3), amount = c(100, 150, 200, 75))
  1. Identify the duplicates
  2. Remove them appropriately
  3. Perform safe join
  4. Validate result

📝 Exercise 3: Safe Join Function

Write safe_join(x, y, by, type) that: 1. Checks for duplicate keys 2. Validates join will work 3. Performs join 4. Checks result makes sense 5. Returns result with metadata

📝 Exercise 4: Multiple Table Join

Join three tables efficiently:

students <- tibble(id = 1:3, name = c("A", "B", "C"))
courses <- tibble(student_id = c(1, 1, 2, 3), course = c("Math", "Eng", "Math", "Sci"))
grades <- tibble(student_id = c(1, 1, 2, 3), course = c("Math", "Eng", "Math", "Sci"), 
                grade = c(85, 90, 78, 92))

22.13 Exercise Answers

Click to see answers

Exercise 1:

products <- tibble(id = 1:5, name = c("A", "B", "C", "D", "E"))
sales <- tibble(product_id = c(1, 2, 2, 3, 6), amount = c(100, 150, 200, 75, 50))

# 1. Inner join - only products with sales
products_with_sales <- products %>%
  inner_join(sales, by = c("id" = "product_id"))
products_with_sales
#> # A tibble: 4 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 A        100
#> 2     2 B        150
#> 3     2 B        200
#> 4     3 C         75

# 2. Left join - all products, add sales
all_products <- products %>%
  left_join(sales, by = c("id" = "product_id"))
all_products
#> # A tibble: 6 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 A        100
#> 2     2 B        150
#> 3     2 B        200
#> 4     3 C         75
#> 5     4 D         NA
#> 6     5 E         NA

# 3. Find products with no sales
no_sales <- products %>%
  anti_join(sales, by = c("id" = "product_id"))
no_sales
#> # A tibble: 2 × 2
#>      id name 
#>   <int> <chr>
#> 1     4 D    
#> 2     5 E

# 4. Find sales for unknown products
unknown_products <- sales %>%
  anti_join(products, by = c("product_id" = "id"))
unknown_products
#> # A tibble: 1 × 2
#>   product_id amount
#>        <dbl>  <dbl>
#> 1          6     50

Exercise 2:

customers <- tibble(id = c(1, 1, 2, 3), name = c("A", "A", "B", "C"))
orders <- tibble(cust_id = c(1, 2, 2, 3), amount = c(100, 150, 200, 75))

# 1. Identify duplicates
cat("Duplicate customers:\n")
#> Duplicate customers:
customers %>%
  group_by(id) %>%
  filter(n() > 1)
#> # A tibble: 2 × 2
#> # Groups:   id [1]
#>      id name 
#>   <dbl> <chr>
#> 1     1 A    
#> 2     1 A

cat("\nDuplicate orders:\n")
#> 
#> Duplicate orders:
orders %>%
  group_by(cust_id) %>%
  filter(n() > 1)
#> # A tibble: 2 × 2
#> # Groups:   cust_id [1]
#>   cust_id amount
#>     <dbl>  <dbl>
#> 1       2    150
#> 2       2    200

# 2. Remove duplicates
customers_unique <- customers %>%
  distinct(id, .keep_all = TRUE)

# Orders: multiple per customer is expected, so aggregate
orders_summary <- orders %>%
  group_by(cust_id) %>%
  summarize(
    n_orders = n(),
    total_amount = sum(amount),
    avg_amount = mean(amount),
    .groups = "drop"
  )

# 3. Perform safe join
result <- customers_unique %>%
  left_join(orders_summary, by = c("id" = "cust_id"))

# 4. Validate
cat("\nValidation:\n")
#> 
#> Validation:
cat("Original customers:", nrow(customers_unique), "\n")
#> Original customers: 3
cat("Result rows:", nrow(result), "\n")
#> Result rows: 3
cat("Should be equal:", nrow(customers_unique) == nrow(result), "\n")
#> Should be equal: TRUE

result
#> # A tibble: 3 × 5
#>      id name  n_orders total_amount avg_amount
#>   <dbl> <chr>    <int>        <dbl>      <dbl>
#> 1     1 A            1          100        100
#> 2     2 B            2          350        175
#> 3     3 C            1           75         75

Exercise 3:

safe_join <- function(x, y, by, type = "left") {
  # Validate inputs
  if (!type %in% c("inner", "left", "right", "full")) {
    stop("type must be inner, left, right, or full")
  }
  
  # Extract key columns
  if (is.character(by)) {
    x_keys <- if (is.null(names(by))) by else names(by)
    y_keys <- if (is.null(names(by))) by else unname(by)
  } else {
    stop("by must be a character vector")
  }
  
  # Check keys exist
  if (!all(x_keys %in% names(x))) {
    stop("Some keys not found in x: ", 
         paste(setdiff(x_keys, names(x)), collapse = ", "))
  }
  if (!all(y_keys %in% names(y))) {
    stop("Some keys not found in y: ", 
         paste(setdiff(y_keys, names(y)), collapse = ", "))
  }
  
  # Check for duplicates
  x_dups <- x %>%
    group_by(across(all_of(x_keys))) %>%
    filter(n() > 1) %>%
    nrow()
  
  y_dups <- y %>%
    group_by(across(all_of(y_keys))) %>%
    filter(n() > 1) %>%
    nrow()
  
  if (x_dups > 0) {
    warning("x has ", x_dups, " duplicate key rows")
  }
  if (y_dups > 0) {
    warning("y has ", y_dups, " duplicate key rows")
  }
  
  # Perform join
  result <- switch(type,
    inner = inner_join(x, y, by = by),
    left = left_join(x, y, by = by),
    right = right_join(x, y, by = by),
    full = full_join(x, y, by = by)
  )
  
  # Validate result
  metadata <- list(
    type = type,
    x_rows = nrow(x),
    y_rows = nrow(y),
    result_rows = nrow(result),
    x_duplicates = x_dups,
    y_duplicates = y_dups,
    na_columns = names(result)[colSums(is.na(result)) > 0]
  )
  
  # Check expectations
  if (type == "left" && nrow(result) < nrow(x)) {
    warning("Left join resulted in fewer rows than x!")
  }
  
  if (type == "inner" && nrow(result) > min(nrow(x), nrow(y))) {
    warning("Inner join resulted in more rows than either input - check for duplicates")
  }
  
  # Return with metadata
  attr(result, "join_metadata") <- metadata
  result
}

# Test
customers <- tibble(id = 1:3, name = c("A", "B", "C"))
orders <- tibble(customer_id = c(1, 1, 2, 4), amount = c(100, 150, 200, 75))

result <- safe_join(customers, orders, by = c("id" = "customer_id"), type = "left")
#> Warning in safe_join(customers, orders, by = c(id = "customer_id"), type =
#> "left"): y has 2 duplicate key rows
result
#> # A tibble: 4 × 3
#>      id name  amount
#>   <dbl> <chr>  <dbl>
#> 1     1 A        100
#> 2     1 A        150
#> 3     2 B        200
#> 4     3 C         NA
attr(result, "join_metadata")
#> $type
#> [1] "left"
#> 
#> $x_rows
#> [1] 3
#> 
#> $y_rows
#> [1] 4
#> 
#> $result_rows
#> [1] 4
#> 
#> $x_duplicates
#> [1] 0
#> 
#> $y_duplicates
#> [1] 2
#> 
#> $na_columns
#> [1] "amount"

Exercise 4:

students <- tibble(id = 1:3, name = c("A", "B", "C"))
courses <- tibble(
  student_id = c(1, 1, 2, 3), 
  course = c("Math", "Eng", "Math", "Sci")
)
grades <- tibble(
  student_id = c(1, 1, 2, 3), 
  course = c("Math", "Eng", "Math", "Sci"), 
  grade = c(85, 90, 78, 92)
)

# Method 1: Chain joins
result1 <- students %>%
  left_join(courses, by = c("id" = "student_id")) %>%
  left_join(grades, by = c("id" = "student_id", "course" = "course"))
result1
#> # A tibble: 4 × 4
#>      id name  course grade
#>   <dbl> <chr> <chr>  <dbl>
#> 1     1 A     Math      85
#> 2     1 A     Eng       90
#> 3     2 B     Math      78
#> 4     3 C     Sci       92

# Method 2: Join courses and grades first
result2 <- students %>%
  left_join(
    courses %>%
      left_join(grades, by = c("student_id", "course")),
    by = c("id" = "student_id")
  )
result2
#> # A tibble: 4 × 4
#>      id name  course grade
#>   <dbl> <chr> <chr>  <dbl>
#> 1     1 A     Math      85
#> 2     1 A     Eng       90
#> 3     2 B     Math      78
#> 4     3 C     Sci       92

# Method 3: Using reduce (for many tables)
library(purrr)

tables <- list(
  students,
  courses,
  grades
)

keys <- list(
  c("id" = "student_id"),
  c("id" = "student_id", "course" = "course")
)

# This is complex - usually method 1 or 2 is better
# Showing for completeness

# Summary by student
summary_result <- students %>%
  left_join(
    courses %>%
      left_join(grades, by = c("student_id", "course")) %>%
      group_by(student_id) %>%
      summarize(
        n_courses = n(),
        avg_grade = mean(grade, na.rm = TRUE),
        courses = paste(course, collapse = ", "),
        .groups = "drop"
      ),
    by = c("id" = "student_id")
  )

summary_result
#> # A tibble: 3 × 5
#>      id name  n_courses avg_grade courses  
#>   <dbl> <chr>     <int>     <dbl> <chr>    
#> 1     1 A             2      87.5 Math, Eng
#> 2     2 B             1      78   Math     
#> 3     3 C             1      92   Sci