Chapter 22 Joins & Merging
What You’ll Learn:
- Types of joins (inner, left, right, full)
- Join keys and matching
- Common join errors
- Multiple key joins
- Handling duplicates
- Anti and semi joins
Key Errors Covered: 20+ join errors
Difficulty: ⭐⭐⭐ Advanced
22.1 Introduction
Joining combines data from multiple sources:
library(dplyr)
# Two related datasets
customers <- tibble(
id = 1:3,
name = c("Alice", "Bob", "Charlie")
)
orders <- tibble(
order_id = 1:4,
customer_id = c(1, 1, 2, 4),
amount = c(100, 150, 200, 75)
)
# Join them
customers %>%
left_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 4 × 4
#> id name order_id amount
#> <dbl> <chr> <int> <dbl>
#> 1 1 Alice 1 100
#> 2 1 Alice 2 150
#> 3 2 Bob 3 200
#> 4 3 Charlie NA NABut joins have many pitfalls. Let’s master them!
22.2 Join Types
💡 Key Insight: Four Main Join Types
# Sample data
band_members
#> # A tibble: 3 × 2
#> name band
#> <chr> <chr>
#> 1 Mick Stones
#> 2 John Beatles
#> 3 Paul Beatles
band_instruments
#> # A tibble: 3 × 2
#> name plays
#> <chr> <chr>
#> 1 John guitar
#> 2 Paul bass
#> 3 Keith guitar
# inner_join: Only matching rows
band_members %>%
inner_join(band_instruments, by = "name")
#> # A tibble: 2 × 3
#> name band plays
#> <chr> <chr> <chr>
#> 1 John Beatles guitar
#> 2 Paul Beatles bass
# left_join: All from left, matching from right
band_members %>%
left_join(band_instruments, by = "name")
#> # A tibble: 3 × 3
#> name band plays
#> <chr> <chr> <chr>
#> 1 Mick Stones <NA>
#> 2 John Beatles guitar
#> 3 Paul Beatles bass
# right_join: All from right, matching from left
band_members %>%
right_join(band_instruments, by = "name")
#> # A tibble: 3 × 3
#> name band plays
#> <chr> <chr> <chr>
#> 1 John Beatles guitar
#> 2 Paul Beatles bass
#> 3 Keith <NA> guitar
# full_join: All rows from both
band_members %>%
full_join(band_instruments, by = "name")
#> # A tibble: 4 × 3
#> name band plays
#> <chr> <chr> <chr>
#> 1 Mick Stones <NA>
#> 2 John Beatles guitar
#> 3 Paul Beatles bass
#> 4 Keith <NA> guitarVisual representation:
- inner_join: ∩ (intersection)
- left_join: ← (left + intersection)
- right_join: → (right + intersection)
- full_join: ∪ (union, all rows)
22.3 Error #1: by must be supplied
⭐ BEGINNER 📋 ARGS
22.3.1 The Error
df1 <- tibble(id = 1:3, x = c("a", "b", "c"))
df2 <- tibble(identifier = 1:3, y = c("d", "e", "f"))
df1 %>%
left_join(df2)
#> Error in `left_join()`:
#> ! `by` must be supplied when `x` and `y` have no common variables.
#> ℹ Use `cross_join()` to perform a cross-join.🔴 ERROR (dplyr >= 1.1.0)
Error in left_join():
! `by` must be supplied when `x` and `y` have no common variables.
22.3.2 What It Means
The two datasets have no columns with the same name, so dplyr doesn’t know how to join them.
22.3.3 Common Causes
# Different column names for same concept
customers <- tibble(customer_id = 1:3, name = c("A", "B", "C"))
orders <- tibble(cust_id = c(1, 1, 2), amount = c(100, 150, 200))
customers %>%
left_join(orders) # No common columns!
#> Error in `left_join()`:
#> ! `by` must be supplied when `x` and `y` have no common variables.
#> ℹ Use `cross_join()` to perform a cross-join.
# Typo in column name
df1 <- tibble(id = 1:3, x = "a")
df2 <- tibble(ID = 1:3, y = "b") # Different case!
df1 %>%
left_join(df2)
#> Error in `left_join()`:
#> ! `by` must be supplied when `x` and `y` have no common variables.
#> ℹ Use `cross_join()` to perform a cross-join.22.3.4 Solutions
✅ SOLUTION 1: Specify Join Keys
# Use by argument with named vector
customers <- tibble(customer_id = 1:3, name = c("A", "B", "C"))
orders <- tibble(cust_id = c(1, 1, 2), amount = c(100, 150, 200))
customers %>%
left_join(orders, by = c("customer_id" = "cust_id"))
#> # A tibble: 4 × 3
#> customer_id name amount
#> <dbl> <chr> <dbl>
#> 1 1 A 100
#> 2 1 A 150
#> 3 2 B 200
#> 4 3 C NA
# Multiple keys
df1 <- tibble(id = 1:3, type = "A", x = 1:3)
df2 <- tibble(id = 1:3, type = "A", y = 4:6)
df1 %>%
left_join(df2, by = c("id", "type"))
#> # A tibble: 3 × 4
#> id type x y
#> <int> <chr> <int> <int>
#> 1 1 A 1 4
#> 2 2 A 2 5
#> 3 3 A 3 6
# Or with different names
df1 <- tibble(id1 = 1:3, type1 = "A", x = 1:3)
df2 <- tibble(id2 = 1:3, type2 = "A", y = 4:6)
df1 %>%
left_join(df2, by = c("id1" = "id2", "type1" = "type2"))
#> # A tibble: 3 × 4
#> id1 type1 x y
#> <int> <chr> <int> <int>
#> 1 1 A 1 4
#> 2 2 A 2 5
#> 3 3 A 3 6✅ SOLUTION 2: Rename Columns First
customers <- tibble(customer_id = 1:3, name = c("A", "B", "C"))
orders <- tibble(cust_id = c(1, 1, 2), amount = c(100, 150, 200))
# Rename to match
orders_renamed <- orders %>%
rename(customer_id = cust_id)
customers %>%
left_join(orders_renamed, by = "customer_id")
#> # A tibble: 4 × 3
#> customer_id name amount
#> <dbl> <chr> <dbl>
#> 1 1 A 100
#> 2 1 A 150
#> 3 2 B 200
#> 4 3 C NA✅ SOLUTION 3: Use join_by() (dplyr >= 1.1.0)
22.4 Error #2: Duplicate keys creating many rows
⭐⭐ INTERMEDIATE 🧠 LOGIC
22.4.1 The Problem
# Data with duplicates
customers <- tibble(
id = c(1, 1, 2), # Duplicate id 1!
name = c("Alice", "Alice", "Bob")
)
orders <- tibble(
customer_id = c(1, 1, 2), # Duplicate id 1!
amount = c(100, 150, 200)
)
# Join creates cartesian product
result <- customers %>%
left_join(orders, by = c("id" = "customer_id"))
#> Warning in left_join(., orders, by = c(id = "customer_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#> "many-to-many"` to silence this warning.
result
#> # A tibble: 5 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 1 Alice 100
#> 4 1 Alice 150
#> 5 2 Bob 200
nrow(result) # 5 rows from 3 + 3!
#> [1] 522.4.2 What Happened
When both tables have duplicate keys, you get all combinations (cartesian product).
22.4.3 Common Causes
# Many-to-many relationship
students <- tibble(
student_id = c(1, 1, 2, 2),
course = c("Math", "English", "Math", "Science")
)
grades <- tibble(
student_id = c(1, 1, 2, 2),
grade = c("A", "B", "A", "B")
)
# Which grade goes with which course?
students %>%
left_join(grades, by = "student_id") # Ambiguous!
#> Warning in left_join(., grades, by = "student_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#> "many-to-many"` to silence this warning.
#> # A tibble: 8 × 3
#> student_id course grade
#> <dbl> <chr> <chr>
#> 1 1 Math A
#> 2 1 Math B
#> 3 1 English A
#> 4 1 English B
#> 5 2 Math A
#> 6 2 Math B
#> 7 2 Science A
#> 8 2 Science B22.4.4 Solutions
✅ SOLUTION 1: Remove Duplicates Before Join
customers <- tibble(
id = c(1, 1, 2),
name = c("Alice", "Alice", "Bob")
)
# Keep unique customers
customers_unique <- customers %>%
distinct(id, .keep_all = TRUE)
customers_unique %>%
left_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 3 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200✅ SOLUTION 2: Check for Duplicates
check_join_keys <- function(data, key_cols) {
dups <- data %>%
group_by(across(all_of(key_cols))) %>%
filter(n() > 1) %>%
ungroup()
if (nrow(dups) > 0) {
warning("Duplicate keys found:\n")
print(dups)
return(FALSE)
}
message("No duplicate keys")
return(TRUE)
}
check_join_keys(customers, "id")
#> Warning in check_join_keys(customers, "id"): Duplicate keys found:
#> # A tibble: 2 × 2
#> id name
#> <dbl> <chr>
#> 1 1 Alice
#> 2 1 Alice
#> [1] FALSE
check_join_keys(customers_unique, "id")
#> No duplicate keys
#> [1] TRUE✅ SOLUTION 3: Aggregate Before Join
# If duplicates make sense (e.g., multiple orders)
orders_summary <- orders %>%
group_by(customer_id) %>%
summarize(
total_orders = n(),
total_amount = sum(amount),
.groups = "drop"
)
customers_unique %>%
left_join(orders_summary, by = c("id" = "customer_id"))
#> # A tibble: 2 × 4
#> id name total_orders total_amount
#> <dbl> <chr> <int> <dbl>
#> 1 1 Alice 2 250
#> 2 2 Bob 1 200✅ SOLUTION 4: Use relationship argument (dplyr >= 1.1.0)
# Specify expected relationship
if (packageVersion("dplyr") >= "1.1.0") {
# This will error if relationship is wrong
# customers %>%
# left_join(orders,
# by = c("id" = "customer_id"),
# relationship = "one-to-one")
# Specify many-to-one
customers %>%
left_join(orders,
by = c("id" = "customer_id"),
relationship = "many-to-one")
}
#> Error in `left_join()`:
#> ! Each row in `x` must match at most 1 row in `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.22.5 Multiple Key Joins
💡 Key Insight: Joining on Multiple Columns
# Data with composite keys
sales <- tibble(
store = c("A", "A", "B", "B"),
product = c("X", "Y", "X", "Y"),
quantity = c(10, 20, 15, 25)
)
prices <- tibble(
store = c("A", "A", "B", "B"),
product = c("X", "Y", "X", "Y"),
price = c(5.00, 7.50, 4.50, 8.00)
)
# Join on both store and product
sales %>%
left_join(prices, by = c("store", "product")) %>%
mutate(revenue = quantity * price)
#> # A tibble: 4 × 5
#> store product quantity price revenue
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 A X 10 5 50
#> 2 A Y 20 7.5 150
#> 3 B X 15 4.5 67.5
#> 4 B Y 25 8 200
# With different names
prices2 <- tibble(
location = c("A", "A", "B", "B"),
item = c("X", "Y", "X", "Y"),
price = c(5.00, 7.50, 4.50, 8.00)
)
sales %>%
left_join(prices2, by = c("store" = "location", "product" = "item"))
#> # A tibble: 4 × 4
#> store product quantity price
#> <chr> <chr> <dbl> <dbl>
#> 1 A X 10 5
#> 2 A Y 20 7.5
#> 3 B X 15 4.5
#> 4 B Y 25 822.6 Error #3: Unexpected NA values after join
⭐⭐ INTERMEDIATE 🧠 LOGIC
22.6.1 The Problem
customers <- tibble(
id = 1:4,
name = c("Alice", "Bob", "Charlie", "David")
)
orders <- tibble(
customer_id = c(1, 1, 2, 5), # Customer 5 doesn't exist!
amount = c(100, 150, 200, 75)
)
# Left join
result <- customers %>%
left_join(orders, by = c("id" = "customer_id"))
result
#> # A tibble: 5 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200
#> 4 3 Charlie NA
#> 5 4 David NA
# Charlie and David have no orders (NA in amount)22.6.2 What Happened
Left join keeps all rows from left table. Unmatched rows get NA in right table columns.
22.6.3 Understanding NA Sources
# Right join - unmatched from right get NA in left columns
result_right <- customers %>%
right_join(orders, by = c("id" = "customer_id"))
result_right
#> # A tibble: 4 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200
#> 4 5 <NA> 75
# Customer 5 order has NA in name
# Inner join - no NAs (only matching rows)
result_inner <- customers %>%
inner_join(orders, by = c("id" = "customer_id"))
result_inner
#> # A tibble: 3 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 20022.6.4 Solutions
✅ SOLUTION 1: Choose Right Join Type
# If you want only matching rows
customers %>%
inner_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 3 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200
# If you want all customers (even without orders)
customers %>%
left_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 5 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200
#> 4 3 Charlie NA
#> 5 4 David NA
# If you want all orders (even with unknown customers)
customers %>%
right_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 4 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200
#> 4 5 <NA> 75✅ SOLUTION 2: Fill NAs After Join
customers %>%
left_join(orders, by = c("id" = "customer_id")) %>%
mutate(
amount = replace_na(amount, 0),
order_count = if_else(is.na(amount), 0L, 1L)
)
#> # A tibble: 5 × 4
#> id name amount order_count
#> <dbl> <chr> <dbl> <int>
#> 1 1 Alice 100 1
#> 2 1 Alice 150 1
#> 3 2 Bob 200 1
#> 4 3 Charlie 0 1
#> 5 4 David 0 1
# Or with tidyr
library(tidyr)
customers %>%
left_join(orders, by = c("id" = "customer_id")) %>%
replace_na(list(amount = 0))
#> # A tibble: 5 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 Alice 100
#> 2 1 Alice 150
#> 3 2 Bob 200
#> 4 3 Charlie 0
#> 5 4 David 0✅ SOLUTION 3: Check for Unmatched Rows
# Find unmatched customers
customers %>%
anti_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 2 × 2
#> id name
#> <int> <chr>
#> 1 3 Charlie
#> 2 4 David
# Find unmatched orders
orders %>%
anti_join(customers, by = c("customer_id" = "id"))
#> # A tibble: 1 × 2
#> customer_id amount
#> <dbl> <dbl>
#> 1 5 7522.7 Filtering Joins
💡 Key Insight: semi_join() and anti_join()
# semi_join: Keep rows in x that have match in y
# Like inner_join but doesn't add columns from y
band_members %>%
semi_join(band_instruments, by = "name")
#> # A tibble: 2 × 2
#> name band
#> <chr> <chr>
#> 1 John Beatles
#> 2 Paul Beatles
# anti_join: Keep rows in x that DON'T have match in y
band_members %>%
anti_join(band_instruments, by = "name")
#> # A tibble: 1 × 2
#> name band
#> <chr> <chr>
#> 1 Mick Stones
# Practical use: find missing data
customers <- tibble(id = 1:5, name = letters[1:5])
orders <- tibble(customer_id = c(1, 2, 2, 3), amount = c(100, 150, 200, 75))
# Customers with no orders
customers %>%
anti_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 2 × 2
#> id name
#> <int> <chr>
#> 1 4 d
#> 2 5 e
# Customers with orders
customers %>%
semi_join(orders, by = c("id" = "customer_id"))
#> # A tibble: 3 × 2
#> id name
#> <int> <chr>
#> 1 1 a
#> 2 2 b
#> 3 3 c22.8 Error #4: Column name conflicts
⭐ BEGINNER 🏷️ NAMING
22.8.1 The Problem
df1 <- tibble(
id = 1:3,
value = c(10, 20, 30)
)
df2 <- tibble(
id = 1:3,
value = c(100, 200, 300)
)
# Both have 'value' column
result <- df1 %>%
left_join(df2, by = "id")
result
#> # A tibble: 3 × 3
#> id value.x value.y
#> <int> <dbl> <dbl>
#> 1 1 10 100
#> 2 2 20 200
#> 3 3 30 300
# Creates value.x and value.y22.8.3 Solutions
✅ SOLUTION 1: Use suffix argument
df1 %>%
left_join(df2, by = "id", suffix = c("_left", "_right"))
#> # A tibble: 3 × 3
#> id value_left value_right
#> <int> <dbl> <dbl>
#> 1 1 10 100
#> 2 2 20 200
#> 3 3 30 300
# Or more descriptive
df1 %>%
left_join(df2, by = "id", suffix = c("_original", "_new"))
#> # A tibble: 3 × 3
#> id value_original value_new
#> <int> <dbl> <dbl>
#> 1 1 10 100
#> 2 2 20 200
#> 3 3 30 300✅ SOLUTION 2: Rename Before Join
22.9 Common Join Patterns
🎯 Best Practice: Common Join Patterns
# 1. Lookup table (add descriptions)
products <- tibble(
product_id = 1:3,
quantity = c(10, 20, 15)
)
product_info <- tibble(
product_id = 1:3,
name = c("Widget", "Gadget", "Doohickey"),
price = c(9.99, 19.99, 14.99)
)
products %>%
left_join(product_info, by = "product_id")
#> # A tibble: 3 × 4
#> product_id quantity name price
#> <int> <dbl> <chr> <dbl>
#> 1 1 10 Widget 9.99
#> 2 2 20 Gadget 20.0
#> 3 3 15 Doohickey 15.0
# 2. Aggregate then join
orders <- tibble(
customer_id = c(1, 1, 2, 2, 3),
amount = c(100, 150, 200, 75, 300)
)
customers <- tibble(
id = 1:3,
name = c("Alice", "Bob", "Charlie")
)
order_summary <- orders %>%
group_by(customer_id) %>%
summarize(
total_orders = n(),
total_spent = sum(amount),
avg_order = mean(amount),
.groups = "drop"
)
customers %>%
left_join(order_summary, by = c("id" = "customer_id"))
#> # A tibble: 3 × 5
#> id name total_orders total_spent avg_order
#> <dbl> <chr> <int> <dbl> <dbl>
#> 1 1 Alice 2 250 125
#> 2 2 Bob 2 275 138.
#> 3 3 Charlie 1 300 300
# 3. Multiple joins
students <- tibble(id = 1:3, name = c("A", "B", "C"))
math_grades <- tibble(student_id = 1:3, math = c(85, 90, 78))
english_grades <- tibble(student_id = 1:3, english = c(88, 92, 85))
students %>%
left_join(math_grades, by = c("id" = "student_id")) %>%
left_join(english_grades, by = c("id" = "student_id"))
#> # A tibble: 3 × 4
#> id name math english
#> <int> <chr> <dbl> <dbl>
#> 1 1 A 85 88
#> 2 2 B 90 92
#> 3 3 C 78 85
# 4. Self join
employees <- tibble(
id = 1:4,
name = c("Alice", "Bob", "Charlie", "David"),
manager_id = c(NA, 1, 1, 2)
)
employees %>%
left_join(
employees %>% select(id, manager_name = name),
by = c("manager_id" = "id")
)
#> Error in select(., id, manager_name = name): unused arguments (id, manager_name = name)
# 5. Rolling join (closest match)
events <- tibble(
time = c(1, 3, 5, 7),
event = c("A", "B", "C", "D")
)
measurements <- tibble(
time = c(0, 2, 4, 6, 8),
value = c(10, 20, 30, 40, 50)
)
# Nearest match (simplified example)
events %>%
left_join(measurements, by = "time") # Only exact matches
#> # A tibble: 4 × 3
#> time event value
#> <dbl> <chr> <dbl>
#> 1 1 A NA
#> 2 3 B NA
#> 3 5 C NA
#> 4 7 D NA22.10 Checking Join Results
🎯 Best Practice: Validate Joins
validate_join <- function(original, joined, join_type = "left") {
cat("=== Join Validation ===\n\n")
# Row count check
cat("Original rows:", nrow(original), "\n")
cat("Joined rows:", nrow(joined), "\n")
if (join_type == "inner") {
if (nrow(joined) > nrow(original)) {
warning("Inner join resulted in MORE rows - duplicates in join key?")
}
} else if (join_type == "left") {
if (nrow(joined) < nrow(original)) {
warning("Left join resulted in FEWER rows - this shouldn't happen!")
} else if (nrow(joined) > nrow(original)) {
warning("Left join resulted in MORE rows - duplicates in right table?")
}
}
# NA check
na_cols <- names(joined)[colSums(is.na(joined)) > 0]
if (length(na_cols) > 0) {
cat("\nColumns with NAs:\n")
for (col in na_cols) {
n_na <- sum(is.na(joined[[col]]))
cat(" ", col, ":", n_na, "NAs\n")
}
}
# Duplicate check in key columns
cat("\n")
}
# Test
customers <- tibble(id = 1:3, name = c("A", "B", "C"))
orders <- tibble(customer_id = c(1, 1, 2, 4), amount = c(100, 150, 200, 75))
result <- customers %>%
left_join(orders, by = c("id" = "customer_id"))
validate_join(customers, result, "left")
#> === Join Validation ===
#>
#> Original rows: 3
#> Joined rows: 4
#> Warning in validate_join(customers, result, "left"): Left join resulted in MORE
#> rows - duplicates in right table?
#>
#> Columns with NAs:
#> amount : 1 NAs22.11 Summary
Key Takeaways:
- Four main joins - inner, left, right, full
- Specify by argument - When column names differ
- Watch for duplicates - Can create many rows
- Left join creates NAs - For unmatched rows
- Use filtering joins - semi_join(), anti_join()
- Check join results - Validate row counts and NAs
- Name conflicts - Use suffix or rename
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| by must be supplied | No common columns | Use by = c("x" = "y") |
| Too many rows | Duplicate keys | Check for duplicates, aggregate |
| Unexpected NAs | Unmatched rows | Use right join type, fill NAs |
| Column conflicts | Same name, not in key | Use suffix argument |
Join Types:
# Mutating joins (add columns)
inner_join(x, y, by = "key") # Only matching
left_join(x, y, by = "key") # All x, matching y
right_join(x, y, by = "key") # Matching x, all y
full_join(x, y, by = "key") # All rows
# Filtering joins (filter rows)
semi_join(x, y, by = "key") # x rows that match y
anti_join(x, y, by = "key") # x rows that don't match y
# Different column names
left_join(x, y, by = c("x_id" = "y_id"))
# Multiple keys
left_join(x, y, by = c("id", "type"))
# Handle name conflicts
left_join(x, y, by = "id", suffix = c("_x", "_y"))Best Practices:
# ✅ Good
left_join(x, y, by = "key") # Explicit by
Check for duplicates before join # Avoid cartesian
Validate row counts after join # Catch issues
Use appropriate join type # inner vs left vs full
# ❌ Avoid
left_join(x, y) # No by (relies on names)
Join without checking duplicates # Surprise row explosion
Ignore NAs after join # May indicate issues
Always use left_join # Sometimes inner is right22.12 Exercises
📝 Exercise 1: Basic Joins
Given:
products <- tibble(id = 1:5, name = c("A", "B", "C", "D", "E"))
sales <- tibble(product_id = c(1, 2, 2, 3, 6), amount = c(100, 150, 200, 75, 50))- Inner join - only products with sales
- Left join - all products, add sales
- Find products with no sales
- Find sales for unknown products
📝 Exercise 2: Duplicate Handling
Given data with duplicates:
customers <- tibble(id = c(1, 1, 2, 3), name = c("A", "A", "B", "C"))
orders <- tibble(cust_id = c(1, 2, 2, 3), amount = c(100, 150, 200, 75))- Identify the duplicates
- Remove them appropriately
- Perform safe join
- Validate result
📝 Exercise 3: Safe Join Function
Write safe_join(x, y, by, type) that:
1. Checks for duplicate keys
2. Validates join will work
3. Performs join
4. Checks result makes sense
5. Returns result with metadata
📝 Exercise 4: Multiple Table Join
Join three tables efficiently:
22.13 Exercise Answers
Click to see answers
Exercise 1:
products <- tibble(id = 1:5, name = c("A", "B", "C", "D", "E"))
sales <- tibble(product_id = c(1, 2, 2, 3, 6), amount = c(100, 150, 200, 75, 50))
# 1. Inner join - only products with sales
products_with_sales <- products %>%
inner_join(sales, by = c("id" = "product_id"))
products_with_sales
#> # A tibble: 4 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 A 100
#> 2 2 B 150
#> 3 2 B 200
#> 4 3 C 75
# 2. Left join - all products, add sales
all_products <- products %>%
left_join(sales, by = c("id" = "product_id"))
all_products
#> # A tibble: 6 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 A 100
#> 2 2 B 150
#> 3 2 B 200
#> 4 3 C 75
#> 5 4 D NA
#> 6 5 E NA
# 3. Find products with no sales
no_sales <- products %>%
anti_join(sales, by = c("id" = "product_id"))
no_sales
#> # A tibble: 2 × 2
#> id name
#> <int> <chr>
#> 1 4 D
#> 2 5 E
# 4. Find sales for unknown products
unknown_products <- sales %>%
anti_join(products, by = c("product_id" = "id"))
unknown_products
#> # A tibble: 1 × 2
#> product_id amount
#> <dbl> <dbl>
#> 1 6 50Exercise 2:
customers <- tibble(id = c(1, 1, 2, 3), name = c("A", "A", "B", "C"))
orders <- tibble(cust_id = c(1, 2, 2, 3), amount = c(100, 150, 200, 75))
# 1. Identify duplicates
cat("Duplicate customers:\n")
#> Duplicate customers:
customers %>%
group_by(id) %>%
filter(n() > 1)
#> # A tibble: 2 × 2
#> # Groups: id [1]
#> id name
#> <dbl> <chr>
#> 1 1 A
#> 2 1 A
cat("\nDuplicate orders:\n")
#>
#> Duplicate orders:
orders %>%
group_by(cust_id) %>%
filter(n() > 1)
#> # A tibble: 2 × 2
#> # Groups: cust_id [1]
#> cust_id amount
#> <dbl> <dbl>
#> 1 2 150
#> 2 2 200
# 2. Remove duplicates
customers_unique <- customers %>%
distinct(id, .keep_all = TRUE)
# Orders: multiple per customer is expected, so aggregate
orders_summary <- orders %>%
group_by(cust_id) %>%
summarize(
n_orders = n(),
total_amount = sum(amount),
avg_amount = mean(amount),
.groups = "drop"
)
# 3. Perform safe join
result <- customers_unique %>%
left_join(orders_summary, by = c("id" = "cust_id"))
# 4. Validate
cat("\nValidation:\n")
#>
#> Validation:
cat("Original customers:", nrow(customers_unique), "\n")
#> Original customers: 3
cat("Result rows:", nrow(result), "\n")
#> Result rows: 3
cat("Should be equal:", nrow(customers_unique) == nrow(result), "\n")
#> Should be equal: TRUE
result
#> # A tibble: 3 × 5
#> id name n_orders total_amount avg_amount
#> <dbl> <chr> <int> <dbl> <dbl>
#> 1 1 A 1 100 100
#> 2 2 B 2 350 175
#> 3 3 C 1 75 75Exercise 3:
safe_join <- function(x, y, by, type = "left") {
# Validate inputs
if (!type %in% c("inner", "left", "right", "full")) {
stop("type must be inner, left, right, or full")
}
# Extract key columns
if (is.character(by)) {
x_keys <- if (is.null(names(by))) by else names(by)
y_keys <- if (is.null(names(by))) by else unname(by)
} else {
stop("by must be a character vector")
}
# Check keys exist
if (!all(x_keys %in% names(x))) {
stop("Some keys not found in x: ",
paste(setdiff(x_keys, names(x)), collapse = ", "))
}
if (!all(y_keys %in% names(y))) {
stop("Some keys not found in y: ",
paste(setdiff(y_keys, names(y)), collapse = ", "))
}
# Check for duplicates
x_dups <- x %>%
group_by(across(all_of(x_keys))) %>%
filter(n() > 1) %>%
nrow()
y_dups <- y %>%
group_by(across(all_of(y_keys))) %>%
filter(n() > 1) %>%
nrow()
if (x_dups > 0) {
warning("x has ", x_dups, " duplicate key rows")
}
if (y_dups > 0) {
warning("y has ", y_dups, " duplicate key rows")
}
# Perform join
result <- switch(type,
inner = inner_join(x, y, by = by),
left = left_join(x, y, by = by),
right = right_join(x, y, by = by),
full = full_join(x, y, by = by)
)
# Validate result
metadata <- list(
type = type,
x_rows = nrow(x),
y_rows = nrow(y),
result_rows = nrow(result),
x_duplicates = x_dups,
y_duplicates = y_dups,
na_columns = names(result)[colSums(is.na(result)) > 0]
)
# Check expectations
if (type == "left" && nrow(result) < nrow(x)) {
warning("Left join resulted in fewer rows than x!")
}
if (type == "inner" && nrow(result) > min(nrow(x), nrow(y))) {
warning("Inner join resulted in more rows than either input - check for duplicates")
}
# Return with metadata
attr(result, "join_metadata") <- metadata
result
}
# Test
customers <- tibble(id = 1:3, name = c("A", "B", "C"))
orders <- tibble(customer_id = c(1, 1, 2, 4), amount = c(100, 150, 200, 75))
result <- safe_join(customers, orders, by = c("id" = "customer_id"), type = "left")
#> Warning in safe_join(customers, orders, by = c(id = "customer_id"), type =
#> "left"): y has 2 duplicate key rows
result
#> # A tibble: 4 × 3
#> id name amount
#> <dbl> <chr> <dbl>
#> 1 1 A 100
#> 2 1 A 150
#> 3 2 B 200
#> 4 3 C NA
attr(result, "join_metadata")
#> $type
#> [1] "left"
#>
#> $x_rows
#> [1] 3
#>
#> $y_rows
#> [1] 4
#>
#> $result_rows
#> [1] 4
#>
#> $x_duplicates
#> [1] 0
#>
#> $y_duplicates
#> [1] 2
#>
#> $na_columns
#> [1] "amount"Exercise 4:
students <- tibble(id = 1:3, name = c("A", "B", "C"))
courses <- tibble(
student_id = c(1, 1, 2, 3),
course = c("Math", "Eng", "Math", "Sci")
)
grades <- tibble(
student_id = c(1, 1, 2, 3),
course = c("Math", "Eng", "Math", "Sci"),
grade = c(85, 90, 78, 92)
)
# Method 1: Chain joins
result1 <- students %>%
left_join(courses, by = c("id" = "student_id")) %>%
left_join(grades, by = c("id" = "student_id", "course" = "course"))
result1
#> # A tibble: 4 × 4
#> id name course grade
#> <dbl> <chr> <chr> <dbl>
#> 1 1 A Math 85
#> 2 1 A Eng 90
#> 3 2 B Math 78
#> 4 3 C Sci 92
# Method 2: Join courses and grades first
result2 <- students %>%
left_join(
courses %>%
left_join(grades, by = c("student_id", "course")),
by = c("id" = "student_id")
)
result2
#> # A tibble: 4 × 4
#> id name course grade
#> <dbl> <chr> <chr> <dbl>
#> 1 1 A Math 85
#> 2 1 A Eng 90
#> 3 2 B Math 78
#> 4 3 C Sci 92
# Method 3: Using reduce (for many tables)
library(purrr)
tables <- list(
students,
courses,
grades
)
keys <- list(
c("id" = "student_id"),
c("id" = "student_id", "course" = "course")
)
# This is complex - usually method 1 or 2 is better
# Showing for completeness
# Summary by student
summary_result <- students %>%
left_join(
courses %>%
left_join(grades, by = c("student_id", "course")) %>%
group_by(student_id) %>%
summarize(
n_courses = n(),
avg_grade = mean(grade, na.rm = TRUE),
courses = paste(course, collapse = ", "),
.groups = "drop"
),
by = c("id" = "student_id")
)
summary_result
#> # A tibble: 3 × 5
#> id name n_courses avg_grade courses
#> <dbl> <chr> <int> <dbl> <chr>
#> 1 1 A 2 87.5 Math, Eng
#> 2 2 B 1 78 Math
#> 3 3 C 1 92 Sci