Chapter 10 Column Manipulation
What You’ll Learn:
- Adding and removing columns safely
- Column name handling
- Type preservation during operations
- Renaming strategies
- Common manipulation pitfalls
Key Errors Covered: 12+ column manipulation errors
Difficulty: ⭐ Beginner to ⭐⭐ Intermediate
10.1 Introduction
Working with data frame columns is a daily task, but it’s full of traps:
df <- data.frame(x = 1:5, y = 6:10)
df[, "z"] # Typo in column name
#> Error in `[.data.frame`(df, , "z"): undefined columns selected
# Or this:
df$new_column <- 1:3 # Wrong length!
#> Error in `$<-.data.frame`(`*tmp*`, new_column, value = 1:3): replacement has 3 rows, data has 5
Let’s master column manipulation to avoid these common errors.
10.2 Column Basics
💡 Key Insight: Data Frame is a List
Understanding this is key to column operations:
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
# Data frame is a special list
is.list(df)
#> [1] TRUE
length(df) # Number of columns!
#> [1] 3
# Each column is a list element
df[[1]] # First column (vector)
#> [1] 1 2 3
df[1] # First column (data frame)
#> x
#> 1 1
#> 2 2
#> 3 3
# Three ways to access columns:
df$x # Dollar sign
#> [1] 1 2 3
df[["x"]] # Double bracket
#> [1] 1 2 3
df["x"] # Single bracket (returns data frame)
#> x
#> 1 1
#> 2 2
#> 3 3
# Column names
names(df)
#> [1] "x" "y" "z"
colnames(df)
#> [1] "x" "y" "z"
10.3 Error #1: undefined columns selected
⭐ BEGINNER 📏 DIMENSION
10.3.1 The Error
df <- data.frame(age = c(25, 30, 35), name = c("Alice", "Bob", "Charlie"))
df[, "salary"] # Column doesn't exist
#> Error in `[.data.frame`(df, , "salary"): undefined columns selected
🔴 ERROR
Error in `[.data.frame`(df, , "salary") : undefined columns selected
10.3.3 Common Causes
10.3.4 Solutions
✅ SOLUTION 1: Check Column Exists
df <- data.frame(age = c(25, 30, 35), name = c("Alice", "Bob", "Charlie"))
# Check before accessing
if ("salary" %in% names(df)) {
df[, "salary"]
} else {
message("Column 'salary' not found")
NULL
}
#> Column 'salary' not found
#> NULL
# Or for multiple columns
cols_wanted <- c("age", "salary", "name")
cols_available <- cols_wanted[cols_wanted %in% names(df)]
df[, cols_available]
#> age name
#> 1 25 Alice
#> 2 30 Bob
#> 3 35 Charlie
✅ SOLUTION 2: Use dplyr’s select() with Helpers
library(dplyr)
df <- data.frame(age = c(25, 30, 35), name = c("Alice", "Bob", "Charlie"))
# Select only existing columns.
# select() is namespace-qualified on purpose: when another attached package
# (e.g. MASS) also exports a select(), the bare name can be masked and the
# call fails with "unused argument (any_of(...))".
df %>% dplyr::select(any_of(c("age", "salary", "name")))
#> age name
#> 1 25 Alice
#> 2 30 Bob
#> 3 35 Charlie
# Or with error on missing: use all_of() instead of any_of()
✅ SOLUTION 3: Safe Column Selection Function
# Select columns by name, tolerating requested names that do not exist.
#
# Args:
#   df:   data frame to select from.
#   cols: character vector of requested column names.
#   warn: if TRUE, raise a warning listing the requested names that are absent.
#
# Returns: a data frame holding the requested columns that exist (in the order
#   requested), or an empty data.frame() when none of them exist.
safe_select <- function(df, cols, warn = TRUE) {
  is_present <- cols %in% names(df)
  found <- cols[is_present]
  not_found <- cols[!is_present]

  if (length(not_found) > 0 && warn) {
    warning("Columns not found: ", paste(not_found, collapse = ", "))
  }

  if (length(found) > 0) {
    # drop = FALSE keeps a data frame even when a single column is selected
    df[, found, drop = FALSE]
  } else {
    data.frame()
  }
}
# Test
df <- data.frame(x = 1:5, y = 6:10)
safe_select(df, c("x", "z", "y"))
#> Warning in safe_select(df, c("x", "z", "y")): Columns not found: z
#> x y
#> 1 1 6
#> 2 2 7
#> 3 3 8
#> 4 4 9
#> 5 5 10
10.4 Error #2: replacement has X rows, data has Y
⭐ BEGINNER 📏 DIMENSION
10.4.1 The Error
df <- data.frame(x = 1:5, y = 6:10)
df$z <- 1:3 # Wrong length!
#> Error in `$<-.data.frame`(`*tmp*`, z, value = 1:3): replacement has 3 rows, data has 5
🔴 ERROR
Error in `$<-.data.frame`(`*tmp*`, z, value = 1:3) :
replacement has 3 rows, data has 5
10.4.2 What It Means
When adding/replacing a column, the new values must match the number of rows (or be length 1).
10.4.3 The Recycling Rule for Columns
10.4.4 Common Causes
10.4.4.1 Cause 1: Calculation Resulted in Wrong Length
df <- data.frame(id = 1:10, value = rnorm(10))
# Filter creates shorter vector
high_values <- df$value[df$value > 0] # Maybe 6 elements
# Try to add back
df$high <- high_values # Error! 6 vs 10
#> Error in `$<-.data.frame`(`*tmp*`, high, value = c(1.77950290977515, 0.286424419628825, : replacement has 7 rows, data has 10
10.4.5 Solutions
✅ SOLUTION 1: Match Lengths
df <- data.frame(id = 1:10, value = rnorm(10))
high_values <- df$value[df$value > 0]
# Option A: Use NA for missing
df$high <- NA
df$high[df$value > 0] <- high_values
df
#> id value high
#> 1 1 -0.19051680 NA
#> 2 2 0.37842390 0.37842390
#> 3 3 0.30003855 0.30003855
#> 4 4 -1.00563626 NA
#> 5 5 0.01925927 0.01925927
#> 6 6 -1.07742065 NA
#> 7 7 0.71270333 0.71270333
#> 8 8 1.08477509 1.08477509
#> 9 9 -2.22498770 NA
#> 10 10 1.23569346 1.23569346
# Option B: Use ifelse
df$high <- ifelse(df$value > 0, df$value, NA)
✅ SOLUTION 2: Use Merge for Aggregates
df <- data.frame(
group = rep(c("A", "B"), each = 5),
value = 1:10
)
# Calculate group means
group_summary <- aggregate(value ~ group, df, mean)
names(group_summary)[2] <- "group_mean"
# Merge back
df <- merge(df, group_summary, by = "group")
df
#> group value group_mean
#> 1 A 1 3
#> 2 A 2 3
#> 3 A 3 3
#> 4 A 4 3
#> 5 A 5 3
#> 6 B 6 8
#> 7 B 7 8
#> 8 B 8 8
#> 9 B 9 8
#> 10 B 10 8
✅ SOLUTION 3: Use dplyr (Cleaner)
library(dplyr)
df <- data.frame(
group = rep(c("A", "B"), each = 5),
value = 1:10
)
# Add group mean to each row
df <- df %>%
group_by(group) %>%
mutate(group_mean = mean(value)) %>%
ungroup()
df
#> # A tibble: 10 × 3
#> group value group_mean
#> <chr> <int> <dbl>
#> 1 A 1 3
#> 2 A 2 3
#> 3 A 3 3
#> 4 A 4 3
#> 5 A 5 3
#> 6 B 6 8
#> 7 B 7 8
#> 8 B 8 8
#> 9 B 9 8
#> 10 B 10 8
10.5 Error #3: duplicate column names
⭐ BEGINNER 🔤 SYNTAX
10.5.2 Why It’s Dangerous
# Create with duplicates
df <- data.frame(value = 1:3, value = 4:6, check.names = FALSE)
# Operations become unpredictable
df$value <- df$value * 2 # Which one gets modified?
df
#> value value
#> 1 2 4
#> 2 4 5
#> 3 6 6
# Selection is confusing
df[, c("value", "value")] # Gets same column twice
#> value value.1
#> 1 2 2
#> 2 4 4
#> 3 6 6
10.5.3 Solutions
✅ SOLUTION 1: Let R Fix Names
# Default: R makes names unique
df <- data.frame(x = 1:3, x = 4:6) # check.names = TRUE by default
names(df) # "x" and "x.1"
#> [1] "x" "x.1"
# Or manually
names_original <- c("value", "value", "score")
names_fixed <- make.names(names_original, unique = TRUE)
names_fixed
#> [1] "value" "value.1" "score"
✅ SOLUTION 2: Check and Fix Names
# Make the column names of a data frame unique.
#
# When duplicated names are present, a warning lists them and all names are
# passed through make.names(unique = TRUE); otherwise `df` is returned as is.
#
# Args:
#   df: data frame whose names are checked.
#
# Returns: `df`, with its names rewritten only if duplicates were found.
fix_duplicate_names <- function(df) {
  current <- names(df)

  # anyDuplicated() returns 0 when every name is unique
  if (anyDuplicated(current) == 0) {
    return(df)
  }

  repeated <- unique(current[duplicated(current)])
  warning("Duplicate column names found: ", paste(repeated, collapse = ", "))
  names(df) <- make.names(current, unique = TRUE)
  df
}
# Test
df <- data.frame(x = 1:3, x = 4:6, check.names = FALSE)
df <- fix_duplicate_names(df)
#> Warning in fix_duplicate_names(df): Duplicate column names found: x
names(df)
#> [1] "x" "x.1"
✅ SOLUTION 3: Prevent Duplicates
10.6 Error #4: names attribute must be same length as vector
⭐ BEGINNER 📏 DIMENSION
10.6.1 The Error
df <- data.frame(x = 1:5, y = 6:10, z = 11:15)
names(df) <- c("a", "b") # Only 2 names for 3 columns!
🔴 ERROR
Error in names(df) <- c("a", "b") :
'names' attribute must be the same length as the vector (3)
10.6.4 Solutions
✅ SOLUTION 1: Match Number of Names
✅ SOLUTION 2: Use Named Vector for Partial Rename
✅ SOLUTION 3: Safe Rename Function
# Rename columns from `old = "new"` pairs, skipping pairs that cannot apply.
#
# Args:
#   df:  data frame to rename.
#   ...: named arguments; each argument name is an existing column name and
#        its value is the new name for that column.
#
# Returns: `df` with the applicable renames performed. A pair is skipped with
#   a warning when the old name is absent, or when the new name is already
#   used by a different column.
safe_rename <- function(df, ...) {
  renames <- list(...)

  for (from in names(renames)) {
    to <- renames[[from]]

    if (from %in% names(df)) {
      if (to %in% names(df) && to != from) {
        warning("Column '", to, "' already exists, skipping")
      } else {
        names(df)[names(df) == from] <- to
      }
    } else {
      warning("Column '", from, "' not found, skipping")
    }
  }

  df
}
# Test
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- safe_rename(df, x = "id", z = "score", w = "missing")
#> Warning in safe_rename(df, x = "id", z = "score", w = "missing"): Column 'w'
#> not found, skipping
names(df)
#> [1] "id" "y" "score"
10.7 Removing Columns
🎯 Best Practice: Removing Columns
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
# Method 1: Set to NULL
df$y <- NULL
df
#> x z
#> 1 1 7
#> 2 2 8
#> 3 3 9
# Method 2: Subset (keep what you want)
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- df[, c("x", "z")]
df
#> x z
#> 1 1 7
#> 2 2 8
#> 3 3 9
# Method 3: Subset (exclude what you don't want)
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- df[, !names(df) %in% c("y")]
df
#> x z
#> 1 1 7
#> 2 2 8
#> 3 3 9
# Method 4: dplyr select with minus
library(dplyr)
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
# select() is namespace-qualified on purpose: when another attached package
# (e.g. MASS) also exports a select(), the bare name can be masked and the
# call fails with "unused argument (-y)", leaving df unchanged.
df <- df %>% dplyr::select(-y)
df
#> x z
#> 1 1 7
#> 2 2 8
#> 3 3 9
# Method 5: Remove multiple
df <- data.frame(x = 1:3, y = 4:6, z = 7:9, w = 10:12)
df <- df %>% dplyr::select(-c(y, w))
df
#> x z
#> 1 1 7
#> 2 2 8
#> 3 3 9
Never do this:
10.8 Column Reordering
💡 Key Insight: Reordering Columns
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
names(df)
#> [1] "z" "x" "y"
# Method 1: Specify order explicitly
df <- df[, c("x", "y", "z")]
names(df)
#> [1] "x" "y" "z"
# Method 2: Sort alphabetically
df <- df[, sort(names(df))]
names(df)
#> [1] "x" "y" "z"
# Method 3: Move specific columns first
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
df <- df[, c("x", setdiff(names(df), "x"))]
names(df)
#> [1] "x" "z" "y"
# Method 4: dplyr relocate
library(dplyr)
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
df <- df %>% relocate(x, y, z)
names(df)
#> [1] "x" "y" "z"
# Or move to front/end
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
df <- df %>% relocate(x, .before = everything())
df %>% relocate(z, .after = everything())
#> x y z
#> 1 1 4 7
#> 2 2 5 8
#> 3 3 6 9
10.9 Type Preservation
⚠️ Common Pitfall: Type Changes
# Start with factors
df <- data.frame(
id = 1:3,
category = c("A", "B", "C"),
stringsAsFactors = TRUE
)
class(df$category) # "factor"
#> [1] "factor"
# Select a single column with [, ] - the result is no longer a data frame!
df_subset <- df[, "category"]
class(df_subset) # "factor" - the column's type is kept, but it is now a bare vector
#> [1] "factor"
# Extracting with $ likewise gives a vector
vec <- df$category
class(vec) # "factor"
#> [1] "factor"
# With drop = TRUE (default)
df_subset <- df[, "category", drop = TRUE]
class(df_subset) # "factor" - becomes vector
#> [1] "factor"
# With drop = FALSE
df_subset <- df[, "category", drop = FALSE]
class(df_subset) # "data.frame" - stays data frame
#> [1] "data.frame"
Best practice:
10.10 Adding Multiple Columns
🎯 Best Practice: Adding Multiple Columns
df <- data.frame(x = 1:5)
# Method 1: One at a time
df$y <- 6:10
df$z <- 11:15
# Method 2: cbind
df <- data.frame(x = 1:5)
df <- cbind(df, data.frame(y = 6:10, z = 11:15))
# Method 3: dplyr mutate
library(dplyr)
df <- data.frame(x = 1:5)
df <- df %>%
mutate(
y = x + 5,
z = y + 5
)
# Method 4: Transform base R
df <- data.frame(x = 1:5)
# transform() evaluates every expression against the columns of the ORIGINAL
# data frame, so a column created in the same call (y) is not visible to the
# next expression. Writing z = y + 5 either fails with "object 'y' not found"
# or silently picks up an unrelated `y` from the calling environment.
df <- transform(df,
  y = x + 5,
  z = x + 10 # Cannot reference y here - derive from x instead
)
# Method 5: within
df <- data.frame(x = 1:5)
df <- within(df, {
y <- x + 5
z <- y + 5
})
10.11 Summary
Key Takeaways:
- Check column exists before accessing with %in% names()
- Match row count when adding columns (or use length 1)
- Avoid duplicate names - check with anyDuplicated()
- Provide all names when renaming - one per column
- Use drop = FALSE to preserve data frame structure
- Setting a column to NULL with $ removes it cleanly
- dplyr is clearer for complex column operations
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| undefined columns selected | Column doesn’t exist | Check with %in% names() |
| replacement has X rows | Wrong length column | Match nrows or use length 1 |
| duplicate column names | Non-unique names | Use make.names(unique=TRUE) |
| names attribute wrong length | Wrong # of names | Provide one per column |
Column Operations Checklist:
# Before accessing:
"colname" %in% names(df) # Check exists
anyDuplicated(names(df)) # Check no duplicates
# When adding column:
length(new_values) == nrow(df) || length(new_values) == 1
# When renaming:
length(new_names) == ncol(df)
!anyDuplicated(new_names)
# Safe patterns:
df$col <- NULL # Remove column
df[, cols, drop = FALSE] # Keep as data frame
Best Practices:
10.12 Exercises
📝 Exercise 1: Safe Column Access
Write a function that safely gets a column:
- Returns the column if it exists
- Returns default value if it doesn’t
- Warns user about missing columns
- Handles both $ and [[ ]] style access
📝 Exercise 2: Batch Rename
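You have a data frame whose column names share a common prefix:
df <- data.frame(old_name_1 = 1:5, old_name_2 = 6:10, old_name_3 = 11:15)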
Write a function to rename all columns matching a pattern.
📝 Exercise 3: Safe Column Addition
Write add_column(df, name, values) that:
1. Checks if name already exists
2. Validates values length
3. Handles recycling appropriately
4. Returns modified data frame
5. Gives informative errors
📝 Exercise 4: Column Audit
Write a function that audits a data frame and reports:
- Missing column names
- Duplicate column names
- Invalid column names (non-syntactic)
- Columns with NA names
10.13 Exercise Answers
Click to see answers
Exercise 1:
# Fetch one column by name, falling back to a default when it is absent.
#
# Args:
#   df:      data frame to read from.
#   col:     name of the column to fetch.
#   default: value returned when `col` is not among names(df).
#   warn:    if TRUE, raise a warning when the column is not found.
#
# Returns: the column as extracted by df[[col]], or `default`.
safe_get_column <- function(df, col, default = NULL, warn = TRUE) {
  if (col %in% names(df)) {
    return(df[[col]])
  }

  if (warn) {
    warning("Column '", col, "' not found in data frame")
  }
  default
}
# Test
df <- data.frame(x = 1:5, y = 6:10)
safe_get_column(df, "x") # Returns column
#> [1] 1 2 3 4 5
safe_get_column(df, "z") # Returns NULL with warning
#> Warning in safe_get_column(df, "z"): Column 'z' not found in data frame
#> NULL
safe_get_column(df, "z", default = NA, warn = FALSE)
#> [1] NA
Exercise 2:
# Rename every column whose name matches a regular expression.
#
# Args:
#   df:          data frame to rename.
#   pattern:     regular expression passed to gsub().
#   replacement: replacement text passed to gsub().
#
# Returns: `df` with the rewritten names. When no name changes, a message is
#   emitted and `df` is returned untouched. If the rewrite would produce
#   duplicate names, a warning is raised and make.unique() is applied. Each
#   performed rename is reported through message().
rename_pattern <- function(df, pattern, replacement) {
  before <- names(df)
  after <- gsub(pattern, replacement, before)

  if (identical(before, after)) {
    message("No columns matched pattern '", pattern, "'")
    return(df)
  }

  # Check for duplicates after rename
  if (anyDuplicated(after) > 0) {
    warning("Renaming would create duplicate names, using make.unique()")
    after <- make.unique(after)
  }
  names(df) <- after

  # Report changes
  was_changed <- before != after
  if (any(was_changed)) {
    message("Renamed ", sum(was_changed), " columns:")
    for (pos in which(was_changed)) {
      message(" ", before[pos], " -> ", after[pos])
    }
  }

  df
}
# Test
df <- data.frame(
old_name_1 = 1:5,
old_name_2 = 6:10,
old_name_3 = 11:15
)
df <- rename_pattern(df, "old_name_", "new_col_")
#> Renamed 3 columns:
#> old_name_1 -> new_col_1
#> old_name_2 -> new_col_2
#> old_name_3 -> new_col_3
names(df)
#> [1] "new_col_1" "new_col_2" "new_col_3"
Exercise 3:
# Add (or replace) a column after validating its name and length.
#
# Args:
#   df:        data frame to extend.
#   name:      name of the column to add.
#   values:    values for the column. Must have length nrow(df), length 1, or
#              a non-zero length that divides nrow(df) evenly (in which case
#              the values are recycled).
#   overwrite: if FALSE (default), an existing column called `name` is an
#              error; if TRUE it is replaced.
#
# Returns: `df` with the column set.
#
# Errors: stops when `name` already exists and overwrite is FALSE, or when the
#   length of `values` is incompatible with nrow(df). A message is emitted
#   whenever recycling takes place.
add_column <- function(df, name, values, overwrite = FALSE) {
  # Check if name exists
  if (name %in% names(df) && !overwrite) {
    stop("Column '", name, "' already exists. ",
         "Use overwrite = TRUE to replace.")
  }

  # Check length
  n_rows <- nrow(df)
  n_values <- length(values)

  if (n_values == n_rows) {
    # Perfect match
    df[[name]] <- values
  } else if (n_values == 1) {
    # Recycle single value
    message("Recycling single value to ", n_rows, " rows")
    df[[name]] <- values
  } else if (n_values > 0 && n_rows %% n_values == 0) {
    # Multiple recycling. The n_values > 0 guard matters: `n_rows %% 0` is NA,
    # and if (NA) would abort with "missing value where TRUE/FALSE needed"
    # instead of reaching the informative length-mismatch error below.
    message("Recycling ", n_values, " values to ", n_rows, " rows")
    df[[name]] <- rep(values, length.out = n_rows)
  } else {
    stop("Length mismatch: values has ", n_values,
         " elements but data frame has ", n_rows, " rows")
  }

  df
}
# Test
df <- data.frame(x = 1:5)
df <- add_column(df, "y", 10) # Recycles
#> Recycling single value to 5 rows
df <- add_column(df, "z", 11:15) # Matches
df <- add_column(df, "w", 1:3) # Errors
#> Error in add_column(df, "w", 1:3): Length mismatch: values has 3 elements but data frame has 5 rows
Exercise 4:
# Audit the column names of a data frame and report problems.
#
# Three checks are run on names(df): missing names (NA or ""), duplicated
# names, and non-syntactic names (names that make.names() would alter).
#
# Args:
#   df: object whose names() are audited.
#
# Returns (invisibly): TRUE when no issue is found; otherwise a list with any
#   of the elements `missing` (integer positions), `duplicates` (the distinct
#   duplicated names) and `invalid` (the non-syntactic names). Findings are
#   also reported through message().
audit_columns <- function(df) {
  col_names <- names(df)
  issues <- list()

  # Check for missing names
  is_missing <- is.na(col_names) | col_names == ""
  if (any(is_missing)) {
    issues$missing <- which(is_missing)
  }

  # Check for duplicates
  if (anyDuplicated(col_names) > 0) {
    dupes <- col_names[duplicated(col_names)]
    issues$duplicates <- unique(dupes)
  }

  # Check for invalid names (non-syntactic).
  # NA names are already reported under `missing`; counting them as valid here
  # keeps the comparison from yielding NA, which would otherwise make the
  # if() below fail with "missing value where TRUE/FALSE needed".
  valid <- is.na(col_names) | make.names(col_names) == col_names
  if (!all(valid)) {
    issues$invalid <- col_names[!valid]
  }

  # Report
  if (length(issues) == 0) {
    message("✓ All column names are valid")
    return(invisible(TRUE))
  }

  message("Column name issues found:")
  if (!is.null(issues$missing)) {
    message(" Missing names at positions: ",
            paste(issues$missing, collapse = ", "))
  }
  if (!is.null(issues$duplicates)) {
    message(" Duplicate names: ",
            paste(issues$duplicates, collapse = ", "))
  }
  if (!is.null(issues$invalid)) {
    message(" Invalid names: ",
            paste(issues$invalid, collapse = ", "))
    message(" Suggested: ",
            paste(make.names(issues$invalid), collapse = ", "))
  }

  invisible(issues)
}
# Test
df_good <- data.frame(x = 1:3, y = 4:6)
audit_columns(df_good)
#> ✓ All column names are valid
df_bad <- data.frame(x = 1:3, x = 4:6, `2bad` = 7:9,
check.names = FALSE)
audit_columns(df_bad)
#> Column name issues found:
#> Duplicate names: x
#> Invalid names: 2bad
#> Suggested: X2bad