Chapter 9 Data Frame Construction
What You’ll Learn:
- How data frames are constructed
- Row and column length requirements
- Type coercion during construction
- Row names and their pitfalls
- Converting between structures
Key Errors Covered: 15+ construction errors
Difficulty: ⭐ Beginner to ⭐⭐ Intermediate
9.1 Introduction
Data frames are R’s workhorse for tabular data. But creating them can be tricky:
# This looks innocent...
data.frame(x = 1:3, y = 1:5)
#> Error in data.frame(x = 1:3, y = 1:5): arguments imply differing number of rows: 3, 5🔴 ERROR
Error in data.frame(x = 1:3, y = 1:5) :
arguments imply differing number of rows: 3, 5
Let’s master data frame construction and avoid all the common pitfalls.
9.2 Data Frame Basics
💡 Key Insight: What is a Data Frame?
A data frame is a list of vectors with special properties:
# Create a data frame
df <- data.frame(
name = c("Alice", "Bob", "Charlie"),
age = c(25, 30, 35),
score = c(85, 90, 95)
)
# It's actually a list!
typeof(df)
#> [1] "list"
is.list(df)
#> [1] TRUE
# But special
class(df)
#> [1] "data.frame"
is.data.frame(df)
#> [1] TRUE
# Each column is a vector
df$name
#> [1] "Alice" "Bob" "Charlie"
df$age
#> [1] 25 30 35
# All columns must have same length
length(df$name)
#> [1] 3
length(df$age)
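#> [1] 3
Key requirements:
1. All columns must be the same length; data.frame() recycles a shorter column only when its length divides the number of rows evenly (length 1 always works)
2. Each column must be a vector (atomic or list)
3. Row names must be unique
4. Column names should be unique (R allows duplicates, but they are confusing)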
9.3 Error #1: arguments imply differing number of rows
⭐ BEGINNER 📏 DIMENSION
9.3.1 The Error
data.frame(
x = 1:5,
y = 1:3
)
#> Error in data.frame(x = 1:5, y = 1:3): arguments imply differing number of rows: 5, 3🔴 ERROR
Error in data.frame(x = 1:5, y = 1:3) :
arguments imply differing number of rows: 5, 3
9.3.2 What It Means
You’re trying to create a data frame with columns of different lengths that aren’t compatible.
9.3.3 The Recycling Rule
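Data frames allow recycling, but only in specific cases: a shorter column is recycled when the longest column’s length is an exact multiple of its own length. Lengths 6 and 2 work (the shorter column is repeated three times), and a length-1 column always works, but lengths 5 and 3 fail with the error above.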
9.3.4 Common Causes
9.3.4.1 Cause 1: Filtered Data
# Start with same length
ids <- 1:10
values <- rnorm(10)
# Filter one but not the other
filtered_values <- values[values > 0] # Might be 6 elements
# Try to combine
data.frame(id = ids, value = filtered_values) # Error!
#> Error in data.frame(id = ids, value = filtered_values): arguments imply differing number of rows: 10, 3
9.3.4.3 Cause 3: Reading Different Sources
# Simulating reading from different sources
col1 <- readLines(textConnection("a\nb\nc\nd\ne"))
col2 <- readLines(textConnection("1\n2\n3")) # Shorter!
# Try to combine
data.frame(col1 = col1, col2 = col2)
#> Error in data.frame(col1 = col1, col2 = col2): arguments imply differing number of rows: 5, 3
9.3.5 Solutions
✅ SOLUTION 1: Match Lengths Before Creating
ids <- 1:10
values <- rnorm(10)
filtered_values <- values[values > 0]
# Option A: Filter both the same way
keep <- values > 0
data.frame(
id = ids[keep],
value = filtered_values
)
#> id value
#> 1 1 0.4922286
#> 2 2 0.2678350
#> 3 3 0.6532577
#> 4 8 0.4302847
#> 5 9 0.5353988
# Option B: Extend shorter with NA
max_len <- max(length(ids), length(filtered_values))
ids_ext <- c(ids, rep(NA, max_len - length(ids)))
val_ext <- c(filtered_values, rep(NA, max_len - length(filtered_values)))
data.frame(id = ids_ext, value = val_ext)
#> id value
#> 1 1 0.4922286
#> 2 2 0.2678350
#> 3 3 0.6532577
#> 4 4 0.4302847
#> 5 5 0.5353988
#> 6 6 NA
#> 7 7 NA
#> 8 8 NA
#> 9 9 NA
#> 10 10 NA
# Option C: Trim longer to match
min_len <- min(length(ids), length(filtered_values))
data.frame(
id = ids[1:min_len],
value = filtered_values[1:min_len]
)
#> id value
#> 1 1 0.4922286
#> 2 2 0.2678350
#> 3 3 0.6532577
#> 4 4 0.4302847
#> 5 5 0.5353988
✅ SOLUTION 2: Check Lengths First
# Build a data frame after validating that all column lengths are compatible.
#
# An argument is accepted when it is as long as the longest argument or when
# its (non-zero) length divides the longest length evenly; this includes
# length 1. Everything is then forwarded to data.frame().
#
# Arguments:
#   ...  Column vectors (named or unnamed) passed on to data.frame().
# Returns:
#   A data frame built with stringsAsFactors = FALSE.
# Errors:
#   Stops with "Incompatible lengths: ..." when the lengths cannot be recycled.
safe_data_frame <- function(...) {
  args <- list(...)

  # vapply() always yields an integer vector, even when no arguments are given
  lens <- vapply(args, length, integer(1))
  max_len <- if (length(lens) > 0L) max(lens) else 0L

  # A zero-length column is only acceptable when every column is empty.
  # Testing `lens > 0L` first keeps the NA/NaN produced by `max_len %% 0`
  # out of the result, so the condition below is never NA.
  valid <- lens == max_len | (lens > 0L & max_len %% lens == 0L)
  if (!all(valid)) {
    stop("Incompatible lengths: ", paste(lens, collapse = ", "))
  }

  data.frame(..., stringsAsFactors = FALSE)
}
# Test
safe_data_frame(x = 1:5, y = 10) # Works
#> x y
#> 1 1 10
#> 2 2 10
#> 3 3 10
#> 4 4 10
#> 5 5 10
safe_data_frame(x = 1:6, y = c(1,2)) # Works
#> x y
#> 1 1 1
#> 2 2 2
#> 3 3 1
#> 4 4 2
#> 5 5 1
#> 6 6 2✅ SOLUTION 3: Use tidyverse for Better Errors
⚠️ Common Pitfall: Silent Recycling
# This works but may not be intended!
df <- data.frame(
group = 1:12,
label = c("A", "B", "C") # Recycled 4 times
)
df
#> group label
#> 1 1 A
#> 2 2 B
#> 3 3 C
#> 4 4 A
#> 5 5 B
#> 6 6 C
#> 7 7 A
#> 8 8 B
#> 9 9 C
#> 10 10 A
#> 11 11 B
#> 12 12 C
# Was this intended? Hard to tell!
# Explicit is better:
df <- data.frame(
group = 1:12,
label = rep(c("A", "B", "C"), times = 4)
)
9.4 Error #2: row names supplied are of the wrong length
⭐ BEGINNER 📏 DIMENSION
9.4.1 The Error
data.frame(
x = 1:5,
y = 6:10,
row.names = c("a", "b", "c") # Only 3 names for 5 rows!
)
#> Error in data.frame(x = 1:5, y = 6:10, row.names = c("a", "b", "c")): row names supplied are of the wrong length🔴 ERROR
Error in data.frame(x = 1:5, y = 6:10, row.names = c("a", "b", "c")) :
row names supplied are of the wrong length
9.4.3 Row Names Basics
# Row names are optional
df <- data.frame(x = 1:3, y = 4:6)
rownames(df) # Default: "1", "2", "3"
#> [1] "1" "2" "3"
# Can set custom row names
df <- data.frame(
x = 1:3,
y = 4:6,
row.names = c("first", "second", "third")
)
rownames(df)
#> [1] "first" "second" "third"
# Or set after creation
df <- data.frame(x = 1:3, y = 4:6)
rownames(df) <- c("a", "b", "c")
rownames(df)
#> [1] "a" "b" "c"9.4.4 Common Causes
9.4.4.2 Cause 2: After Subsetting
df <- data.frame(x = 1:5, y = 6:10)
original_names <- c("a", "b", "c", "d", "e")
rownames(df) <- original_names
# Subset data frame
df_subset <- df[1:3, ]
rownames(df_subset) # Kept original names (good)
#> [1] "a" "b" "c"
# But if you try to reuse original names on subset:
rownames(df_subset) <- original_names # Error! 5 names for 3 rows
#> Error in `.rowNamesDF<-`(x, value = value): invalid 'row.names' length
9.4.5 Solutions
✅ SOLUTION 1: Match Length
df <- data.frame(x = 1:5, y = 6:10)
names_available <- c("a", "b", "c")
# Option A: Extend with numbers
all_names <- c(names_available,
paste0("row", (length(names_available)+1):nrow(df)))
rownames(df) <- all_names
# Option B: Use only available, add column instead
df$label <- c(names_available, rep(NA, nrow(df) - length(names_available)))
df
#> x y label
#> a 1 6 a
#> b 2 7 b
#> c 3 8 c
#> row4 4 9 <NA>
#> row5 5 10 <NA>
✅ SOLUTION 2: Check Before Assigning
# Assign row names only when exactly one name is supplied per row.
#
# Arguments:
#   df     A data frame.
#   names  Candidate row names.
# Returns:
#   `df` with the new row names, or `df` unchanged (after a warning) when the
#   number of names differs from the number of rows.
safe_set_rownames <- function(df, names) {
  n_names <- length(names)
  n_rows <- nrow(df)

  if (n_names == n_rows) {
    rownames(df) <- names
  } else {
    # Mismatch: report both counts and leave the data frame untouched
    warning("Row names length (", n_names,
            ") doesn't match rows (", n_rows, ")")
  }

  df
}
# Test
df <- data.frame(x = 1:5, y = 6:10)
df <- safe_set_rownames(df, c("a", "b", "c")) # Warning, unchanged
#> Warning in safe_set_rownames(df, c("a", "b", "c")): Row names length (3)
#> doesn't match rows (5)
df <- safe_set_rownames(df, letters[1:5]) # Works✅ SOLUTION 3: Use Column Instead
🎯 Best Practice: Avoid Row Names
Row names are a legacy feature. Modern R style:
# Old style (avoid)
df <- data.frame(x = 1:3, y = 4:6, row.names = c("a", "b", "c"))
# New style (prefer)
df <- data.frame(
id = c("a", "b", "c"),
x = 1:3,
y = 4:6
)
# Tibbles don't even support row names!
library(tibble)
tibble(id = c("a", "b", "c"), x = 1:3, y = 4:6)
#> # A tibble: 3 × 3
#> id x y
#> <chr> <int> <int>
#> 1 a 1 4
#> 2 b 2 5
#> 3 c 3 6
Why?
- Row names are easily lost in operations
- They are harder to work with programmatically
- They are not supported by the modern tidyverse
- A column is more explicit and flexible
9.5 Error #3: duplicate row.names are not allowed
⭐ BEGINNER 🔤 SYNTAX
9.5.1 The Error
data.frame(
x = 1:3,
y = 4:6,
row.names = c("a", "b", "a") # Duplicate!
)
#> Error in data.frame(x = 1:3, y = 4:6, row.names = c("a", "b", "a")): duplicate row.names: a🔴 ERROR
Error in data.frame(x = 1:3, y = 4:6, row.names = c("a", "b", "a")) :
duplicate row.names: a
9.5.4 Solutions
✅ SOLUTION 1: Make Unique
df <- data.frame(x = 1:5, y = 6:10)
ids <- c("sample1", "sample2", "sample2", "sample3", "sample4")
# Make unique automatically
unique_ids <- make.unique(ids, sep = "_")
unique_ids
#> [1] "sample1" "sample2" "sample2_1" "sample3" "sample4"
rownames(df) <- unique_ids
df
#> x y
#> sample1 1 6
#> sample2 2 7
#> sample2_1 3 8
#> sample3 4 9
#> sample4 5 10✅ SOLUTION 2: Reset Row Names When Combining
df1 <- data.frame(x = 1:3, row.names = c("a", "b", "c"))
df2 <- data.frame(x = 4:6, row.names = c("c", "d", "e"))
# Remove row names before combining
rownames(df1) <- NULL
rownames(df2) <- NULL
rbind(df1, df2)
#> x
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
# NOTE(review): rbind() for data frames has no `row.names` argument; the
# related option is `make.row.names = FALSE`.
df_combined <- rbind(df1, df2) # Succeeds: row names were reset to NULL above
# Alternative: reset row names on copies and let R assign 1..n to the result
df1_clean <- df1
df2_clean <- df2
rownames(df1_clean) <- NULL
rownames(df2_clean) <- NULL
rbind(df1_clean, df2_clean)
#> x
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
✅ SOLUTION 3: Check for Duplicates First
# Assign row names, repairing duplicates instead of failing.
#
# Arguments:
#   df     A data frame.
#   names  Candidate row names (character, numeric or factor).
# Returns:
#   `df` with `names` as row names. When `names` contains duplicates, a
#   warning lists them and they are made unique with a "_<n>" suffix.
safe_set_rownames_unique <- function(df, names) {
  if (anyDuplicated(names) > 0L) {
    dupes <- names[duplicated(names)]
    warning("Duplicate row names found: ",
            paste(unique(dupes), collapse = ", "))
    # make.unique() only accepts character vectors, so coerce before repairing
    names <- make.unique(as.character(names), sep = "_")
  }
  rownames(df) <- names
  df
}
# Test
df <- data.frame(x = 1:5)
df <- safe_set_rownames_unique(df, c("a", "b", "a", "c", "d"))
#> Warning in safe_set_rownames_unique(df, c("a", "b", "a", "c", "d")): Duplicate
#> row names found: a
rownames(df)
#> [1] "a" "b" "a_1" "c" "d"
9.6 Error #4: invalid type (list) for variable
⭐⭐ INTERMEDIATE 🔢 TYPE
9.6.1 The Error
# Trying to create column from nested list
data.frame(
id = 1:3,
values = list(1:3, 4:6, 7:9) # List column - old data.frame rejects
)
#> id values.1.3 values.4.6 values.7.9
#> 1 1 1 4 7
#> 2 2 2 5 8
#> 3 3 3 6 9🔴 ERROR (in older R versions or strict mode)
Error in data.frame(...) : invalid type (list) for variable 'values'
9.6.2 What It Means
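data.frame() does not treat a bare list argument as a single list column. As the output above shows, it spreads the list into one column per element (values.1.3, values.4.6, values.7.9). To keep the list as one column, wrap it in I() or use a tibble.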
9.6.3 Modern Solution: List Columns
# Modern R allows this with I()
df <- data.frame(
id = 1:3,
values = I(list(1:3, 4:6, 7:9))
)
df
#> id values
#> 1 1 1, 2, 3
#> 2 2 4, 5, 6
#> 3 3 7, 8, 9
# Access list column
df$values[[1]]
#> [1] 1 2 3
# Tibbles make it easier
library(tibble)
tibble(
id = 1:3,
values = list(1:3, 4:6, 7:9)
)
#> # A tibble: 3 × 2
#> id values
#> <int> <list>
#> 1 1 <int [3]>
#> 2 2 <int [3]>
#> 3 3 <int [3]>9.6.4 Common Causes
9.6.4.2 Cause 2: Split/Group Result
# Split creates list
values <- 1:12
groups <- rep(1:3, each = 4)
split_data <- split(values, groups)
# Try to put in data frame directly
data.frame(
group = 1:3,
values = split_data # May error
)
#> Error in data.frame(group = 1:3, values = split_data): arguments imply differing number of rows: 3, 49.6.5 Solutions
✅ SOLUTION 1: Use I() to Protect List
✅ SOLUTION 2: Use Tibble (Easier)
library(tibble)
# Tibbles naturally support list columns
tib <- tibble(
id = 1:3,
values = list(1:3, 4:6, 7:9),
nested = list(
list(a = 1, b = 2),
list(a = 3, b = 4),
list(a = 5, b = 6)
)
)
tib
#> # A tibble: 3 × 3
#> id values nested
#> <int> <list> <list>
#> 1 1 <int [3]> <named list [2]>
#> 2 2 <int [3]> <named list [2]>
#> 3 3 <int [3]> <named list [2]>
# Clean syntax
tib$values
#> [[1]]
#> [1] 1 2 3
#>
#> [[2]]
#> [1] 4 5 6
#>
#> [[3]]
#> [1] 7 8 9
tib$nested[[1]]
#> $a
#> [1] 1
#>
#> $b
#> [1] 2✅ SOLUTION 3: Unnest if Possible
# If you don't need list column, unnest
library(tidyr)
# Start with list column
df <- tibble(
id = 1:3,
values = list(1:3, 4:6, 7:9)
)
# Unnest to regular columns
unnest(df, values)
#> # A tibble: 9 × 2
#> id values
#> <int> <int>
#> 1 1 1
#> 2 1 2
#> 3 1 3
#> 4 2 4
#> 5 2 5
#> 6 2 6
#> 7 3 7
#> 8 3 8
#> 9 3 99.7 Error #5: cannot coerce class X to a data.frame
⭐ BEGINNER 🔢 TYPE
9.7.1 The Error
# Try to convert function to data frame
my_func <- function(x) x + 1
as.data.frame(my_func)
#> Error in as.data.frame.default(my_func): cannot coerce class '"function"' to a data.frame🔴 ERROR
Error in as.data.frame.default(my_func) :
cannot coerce class '"function"' to a data.frame
9.7.4 Solutions
✅ SOLUTION 1: Check Type First
# Convert common structures to a data frame, failing with a clear message
# for anything else.
#
# Arguments:
#   x  A data frame, matrix, list or atomic vector.
# Returns:
#   - `x` itself when it is already a data frame;
#   - as.data.frame(x) for a matrix;
#   - as.data.frame(x) for a list whose elements all share one length or are
#     each either length 1 or the maximum length (an empty list is accepted);
#   - a one-column data frame named `value` for an atomic vector.
# Errors:
#   Stops for lists with incompatible lengths and for unsupported classes.
safe_as_dataframe <- function(x) {
  if (is.data.frame(x)) {
    return(x)
  }

  if (is.matrix(x)) {
    return(as.data.frame(x))
  }

  if (is.list(x)) {
    lens <- lengths(x)
    # `<= 1L` also covers the empty list, so max() is never called on a
    # zero-length vector (which would emit a warning and return -Inf).
    if (length(unique(lens)) <= 1L || all(lens == 1L | lens == max(lens))) {
      return(as.data.frame(x, stringsAsFactors = FALSE))
    }
    stop("List has incompatible lengths: ", paste(lens, collapse = ", "))
  }

  if (is.atomic(x)) {
    return(data.frame(value = x, stringsAsFactors = FALSE))
  }

  stop("Cannot convert ", class(x)[1], " to data frame")
}
# Test
safe_as_dataframe(1:5)
#> value
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
safe_as_dataframe(matrix(1:6, 2, 3))
#> V1 V2 V3
#> 1 1 3 5
#> 2 2 4 6
safe_as_dataframe(list(a = 1:3, b = 4:6))
#> a b
#> 1 1 4
#> 2 2 5
#> 3 3 6✅ SOLUTION 2: Fix Structure First
# Uneven list
irregular <- list(
a = 1:3,
b = 1:5,
c = 1:2
)
# Option A: Pad with NA
max_len <- max(lengths(irregular))
regular <- lapply(irregular, function(x) {
c(x, rep(NA, max_len - length(x)))
})
as.data.frame(regular)
#> a b c
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 NA
#> 4 NA 4 NA
#> 5 NA 5 NA
# Option B: Trim to shortest
min_len <- min(lengths(irregular))
regular <- lapply(irregular, function(x) x[1:min_len])
as.data.frame(regular)
#> a b c
#> 1 1 1 1
#> 2 2 2 2
9.8 stringsAsFactors Historical Issue
⚠️ Historical Pitfall: stringsAsFactors
In R < 4.0.0, stringsAsFactors = TRUE was default:
# R < 4.0.0 behavior
df_old <- data.frame(
name = c("Alice", "Bob"),
age = c(25, 30),
stringsAsFactors = TRUE # Was default
)
class(df_old$name) # "factor" (was default)
#> [1] "factor"
# R >= 4.0.0 behavior
df_new <- data.frame(
name = c("Alice", "Bob"),
age = c(25, 30)
# stringsAsFactors = FALSE is now default
)
class(df_new$name) # "character"
#> [1] "character"Best Practice: Always specify explicitly:
9.9 Creating Data Frames: All Methods
🎯 Best Practice: Data Frame Creation Methods
# Method 1: data.frame() - base R
df1 <- data.frame(
x = 1:3,
y = 4:6,
stringsAsFactors = FALSE
)
# Method 2: tibble() - modern tidyverse
library(tibble)
df2 <- tibble(
x = 1:3,
y = 4:6
)
# Method 3: From matrix
mat <- matrix(1:6, nrow = 2, ncol = 3)
df3 <- as.data.frame(mat)
# Method 4: From list
lst <- list(x = 1:3, y = 4:6)
df4 <- as.data.frame(lst)
# Method 5: From vectors
x <- 1:3
y <- 4:6
df5 <- data.frame(x, y)
# Method 6: Reading data
df6 <- read.csv("file.csv") # Base R
#> Warning in file(file, "rt"): cannot open file 'file.csv': No such file or
#> directory
#> Error in file(file, "rt"): cannot open the connection
# df7 <- read_csv("file.csv") # readr (tibble)
# Method 7: tribble() - row-wise
library(tibble)
df8 <- tribble(
~x, ~y,
1, 4,
2, 5,
3, 6
)When to use each:
- data.frame(): Base R compatibility needed
- tibble(): Modern code, better defaults
- as.data.frame(): Converting from other structures
- tribble(): Small data, readable layout
9.10 Summary
Key Takeaways:
- All columns must be the same length; data.frame() recycles shorter columns whose length divides the row count evenly, while tibble() recycles only length 1
- Row names must be unique (if used at all)
- Avoid row names in modern code - use column instead
- List columns need I() in data.frame(), or use tibble
- stringsAsFactors = FALSE for compatibility
- Check lengths before construction
- tibble is stricter and usually better
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| differing number of rows | Unequal lengths | Match lengths or use length 1 |
| row names wrong length | Row names ≠ nrows | Provide correct number |
| duplicate row.names | Non-unique names | Use make.unique() or remove |
| invalid type (list) | List column in old data.frame | Use I() or tibble |
| cannot coerce to data.frame | Wrong structure | Check type and fix structure |
Construction Checklist:
# Before creating data frame:
lengths(list_of_vectors) # Check all lengths
anyDuplicated(row_names) # Check for duplicates
class(each_column) # Verify types
# Safe construction:
tibble(...) # Stricter checking
data.frame(..., stringsAsFactors = FALSE) # Explicit
# After construction:
str(df) # Verify structure
anyDuplicated(rownames(df)) # Check row namesModern Best Practices:
9.11 Exercises
📝 Exercise 1: Diagnose the Error
What’s wrong and how do you fix it?
# Scenario 1
df <- data.frame(
id = 1:10,
group = c("A", "B", "C")
)
# Scenario 2
df <- data.frame(
x = 1:5,
y = 6:10,
row.names = c("a", "b", "c", "a", "e")
)
# Scenario 3
df1 <- data.frame(x = 1:3, row.names = c("a", "b", "c"))
df2 <- data.frame(x = 4:6, row.names = c("b", "c", "d"))
combined <- rbind(df1, df2)
# Scenario 4
data.frame(
id = 1:3,
data = list(
c(1, 2, 3),
c(4, 5),
c(6, 7, 8, 9)
)
)📝 Exercise 2: Safe Constructor
Write safe_df(...) that:
1. Checks all vectors are same length or length 1
2. Warns about recycling
3. Checks for duplicate names
4. Returns tibble or data.frame
5. Handles list columns properly
📝 Exercise 3: Fix Irregular Data
You have:
data <- list(
id = 1:5,
name = c("Alice", "Bob", "Charlie"),
score = c(85, 90, 95, 88)
)
Create a data frame handling the unequal lengths gracefully.
📝 Exercise 4: Combine with Row Names
You have multiple data frames with overlapping row names:
df1 <- data.frame(x = 1:3, row.names = c("a", "b", "c"))
df2 <- data.frame(y = 4:6, row.names = c("b", "c", "d"))
df3 <- data.frame(z = 7:9, row.names = c("c", "d", "e"))Combine them into one data frame keeping all data.
9.12 Exercise Answers
Click to see answers
Exercise 1:
# Scenario 1 - Length mismatch (10 vs 3)
# Fix: Recycle explicitly or trim
df <- data.frame(
id = 1:10,
group = rep(c("A", "B", "C"), length.out = 10)
)
# Scenario 2 - Duplicate row name "a"
df <- data.frame(
x = 1:5,
y = 6:10,
row.names = make.unique(c("a", "b", "c", "a", "e"))
)
# Scenario 3 - Overlapping row names
df1 <- data.frame(x = 1:3, row.names = c("a", "b", "c"))
df2 <- data.frame(x = 4:6, row.names = c("b", "c", "d"))
# Option A: Remove row names
combined <- rbind(
data.frame(x = df1$x),
data.frame(x = df2$x)
)
# Option B: Keep as column
combined <- rbind(
data.frame(id = rownames(df1), x = df1$x),
data.frame(id = rownames(df2), x = df2$x)
)
# Scenario 4 - List column (different lengths within)
# Use tibble or I()
library(tibble)
df <- tibble(
id = 1:3,
data = list(
c(1, 2, 3),
c(4, 5),
c(6, 7, 8, 9)
)
)
# Or with data.frame:
df <- data.frame(id = 1:3)
df$data <- list(c(1,2,3), c(4,5), c(6,7,8,9))
Exercise 2:
# Construct a tibble or data frame after validating column lengths and names.
#
# Arguments:
#   ...         Column vectors, named or unnamed.
#   use_tibble  If TRUE (default) return a tibble, otherwise a base data frame
#               built with stringsAsFactors = FALSE.
# Returns:
#   A tibble or data frame with one row per element of the longest column.
# Warnings:
#   - when a column longer than 1 has to be recycled to the longest length;
#   - when column names are duplicated.
# Errors:
#   Stops with "Incompatible lengths: ..." when a column's length neither
#   equals nor evenly divides the longest length.
safe_df <- function(..., use_tibble = TRUE) {
  args <- list(...)

  # vapply() always yields an integer vector, even when no arguments are given
  lens <- vapply(args, length, integer(1))
  max_len <- if (length(lens) > 0L) max(lens) else 0L

  # Lengths that need no explicit recycling
  recyclable <- lens == 1L | lens == max_len
  if (!all(recyclable)) {
    # `lens > 0L` keeps the NA/NaN from `max_len %% 0` out of the condition,
    # so a zero-length column is reported as incompatible
    multiples <- lens > 0L & max_len %% lens == 0L
    if (!all(recyclable | multiples)) {
      stop("Incompatible lengths: ", paste(lens, collapse = ", "),
           "\nMax length: ", max_len)
    }
    warning("Recycling vectors of length ",
            paste(unique(lens[!recyclable]), collapse = ", "),
            " to length ", max_len)
  }

  arg_names <- names(args)
  if (!is.null(arg_names) && anyDuplicated(arg_names) > 0L) {
    warning("Duplicate column names: ",
            paste(arg_names[duplicated(arg_names)], collapse = ", "))
  }

  if (!use_tibble) {
    # data.frame() recycles whole multiples on its own
    return(data.frame(..., stringsAsFactors = FALSE))
  }

  if (all(recyclable)) {
    return(tibble::tibble(...))
  }

  # tibble() only recycles length-1 columns, so expand the validated
  # whole-multiple columns here; rep() keeps factor levels and list elements
  args[!recyclable] <- lapply(args[!recyclable], rep, length.out = max_len)
  do.call(tibble::tibble, args)
}
# Test
safe_df(x = 1:5, y = 10)
#> # A tibble: 5 × 2
#> x y
#> <int> <dbl>
#> 1 1 10
#> 2 2 10
#> 3 3 10
#> 4 4 10
#> 5 5 10
safe_df(x = 1:6, y = c(1, 2)) # Warning about recycling
#> Warning in safe_df(x = 1:6, y = c(1, 2)): Recycling vectors of length 2 to
#> length 6
#> Error in `tibble()`:
#> ! Tibble columns must have compatible sizes.
#> • Size 6: Existing data.
#> • Size 2: Column `y`.
#> ℹ Only values of size one are recycled.Exercise 3:
data <- list(
id = 1:5,
name = c("Alice", "Bob", "Charlie"),
score = c(85, 90, 95, 88)
)
# Option A: Extend shorter with NA
max_len <- max(lengths(data))
data_fixed <- lapply(data, function(x) {
c(x, rep(NA, max_len - length(x)))
})
df <- as.data.frame(data_fixed, stringsAsFactors = FALSE)
df
#> id name score
#> 1 1 Alice 85
#> 2 2 Bob 90
#> 3 3 Charlie 95
#> 4 4 <NA> 88
#> 5 5 <NA> NA
# Option B: Trim all to shortest
min_len <- min(lengths(data))
data_fixed <- lapply(data, function(x) x[1:min_len])
df <- as.data.frame(data_fixed, stringsAsFactors = FALSE)
df
#> id name score
#> 1 1 Alice 85
#> 2 2 Bob 90
#> 3 3 Charlie 95
# Option C: Use only complete cases
# (more complex - requires pairing)Exercise 4:
library(tibble)
df1 <- data.frame(x = 1:3, row.names = c("a", "b", "c"))
df2 <- data.frame(y = 4:6, row.names = c("b", "c", "d"))
df3 <- data.frame(z = 7:9, row.names = c("c", "d", "e"))
# Convert row names to column
df1_with_id <- tibble(id = rownames(df1), x = df1$x)
df2_with_id <- tibble(id = rownames(df2), y = df2$y)
df3_with_id <- tibble(id = rownames(df3), z = df3$z)
# Full join to keep all
library(dplyr)
result <- df1_with_id %>%
full_join(df2_with_id, by = "id") %>%
full_join(df3_with_id, by = "id")
result
#> # A tibble: 5 × 4
#> id x y z
#> <chr> <int> <int> <int>
#> 1 a 1 NA NA
#> 2 b 2 4 NA
#> 3 c 3 5 7
#> 4 d NA 6 8
#> 5 e NA NA 9
# Alternative: using merge
result <- merge(df1_with_id, df2_with_id, by = "id", all = TRUE)
result <- merge(result, df3_with_id, by = "id", all = TRUE)
result
#> id x y z
#> 1 a 1 NA NA
#> 2 b 2 4 NA
#> 3 c 3 5 7
#> 4 d NA 6 8
#> 5 e NA NA 9