Chapter 14 String Basics & Common Errors
What You’ll Learn:
- Character vectors vs strings
- String creation and encoding
- Common string errors
- Quotes and escaping
- String conversion issues
Key Errors Covered: 15+ string errors
Difficulty: ⭐⭐ Intermediate
14.1 Introduction
Strings in R seem simple but have surprising complexity:
text <- "He said "Hello"" # Try to include quotes
#> Error in parse(text = input): <text>:1:19: unexpected symbol
#> 1: text <- "He said "Hello
#> ^
🔴 ERROR
Error: unexpected symbol in "text <- "He said "Hello"
Let’s master string handling to avoid these pitfalls.
14.2 String Basics
💡 Key Insight: No String Type in R
R doesn’t have a separate “string” type:
# What you think of as a "string"
text <- "hello"
typeof(text) # "character"
#> [1] "character"
class(text) # "character"
#> [1] "character"
# It's a character vector of length 1
length(text) # 1 (one element)
#> [1] 1
nchar(text) # 5 (five characters)
#> [1] 5
# Multiple strings
texts <- c("hello", "world")
typeof(texts) # Still "character"
#> [1] "character"
length(texts) # 2 (two elements)
#> [1] 2
nchar(texts) # 5 5 (characters in each)
#> [1] 5 5
Key points:
- R has “character vectors”, not “strings”
- A “string” is a character vector of length 1
- length() = number of elements
- nchar() = number of characters in each element
14.3 Error #1: unexpected symbol (quote issues)
⭐ BEGINNER 🔤 SYNTAX
14.3.1 The Error
text <- "She said "yes""
#> Error in parse(text = input): <text>:1:20: unexpected symbol
#> 1: text <- "She said "yes
#> ^
🔴 ERROR
Error: unexpected symbol in "text <- "She said "yes"
14.4 Escape Sequences
💡 Key Insight: Common Escape Sequences
# Newline
cat("Line 1\nLine 2")
#> Line 1
#> Line 2
# Tab
cat("Col1\tCol2\tCol3")
#> Col1 Col2 Col3
# Backslash itself
cat("Path: C:\\Users\\Documents")
#> Path: C:\Users\Documents
# Quotes
cat("He said \"Hello\"")
#> He said "Hello"
cat('It\'s working')
#> It's working
# Carriage return
cat("Part 1\rPart 2") # Overwrites
#> Part 1Part 2
# Unicode
cat("\u03B1 \u03B2 \u03B3") # α β γ
#> α β γ
# Hex
cat("\x48\x65\x6C\x6C\x6F") # Hello
#> Hello
# All escape sequences
cat("Newline:\nTab:\tQuote:\"Backslash:\\")
#> Newline:
#> Tab: Quote:"Backslash:\
Common escapes:
- \n - newline
- \t - tab
- \\ - backslash
- \" - double quote
- \' - single quote
- \r - carriage return
- \uXXXX - Unicode character
- \xXX - Hex character
14.5 Error #2: argument is not of mode character
⭐ BEGINNER 🔢 TYPE
14.5.1 The Error
🔴 ERROR (in some contexts)
Error in nchar(numbers) : 'nchar()' requires a character vector
Actually, nchar() coerces, but other functions don’t:
🔴 ERROR
Error in substr(123, 1, 2) : argument is not of mode character
14.6 Error #3: invalid multibyte string
⭐⭐⭐ ADVANCED 🌐 ENCODING
14.6.1 The Error
# Try to read file with wrong encoding
text <- readLines("file_with_utf8.txt", encoding = "latin1")
#> Warning in file(con, "r"): cannot open file 'file_with_utf8.txt': No such file
#> or directory
#> Error in file(con, "r"): cannot open the connection
nchar(text) # May error
#> [1] 5
🔴 ERROR
Error in nchar(text) : invalid multibyte string
14.7 String Creation and Manipulation
🎯 Best Practice: Creating and Combining Strings
# Creating strings
single <- "hello"
multiple <- c("hello", "world", "!")
# Combining strings
paste("hello", "world") # Space by default
#> [1] "hello world"
paste("hello", "world", sep = "") # No space
#> [1] "helloworld"
paste0("hello", "world") # paste with sep=""
#> [1] "helloworld"
# Collapse vector into one string
paste(c("a", "b", "c"), collapse = ", ")
#> [1] "a, b, c"
# Vector operations (recycling)
paste("File", 1:5, ".txt", sep = "")
#> [1] "File1.txt" "File2.txt" "File3.txt" "File4.txt" "File5.txt"
# stringr alternatives
library(stringr)
str_c("hello", "world", sep = " ")
#> [1] "hello world"
str_c(c("a", "b", "c"), collapse = ", ")
#> [1] "a, b, c"
# glue for interpolation
library(glue)
name <- "Alice"
age <- 30
glue("My name is {name} and I am {age} years old")
#> My name is Alice and I am 30 years old
14.8 Length vs Number of Characters
⚠️ Common Confusion: length() vs nchar()
# Single string
text <- "hello"
length(text) # 1 (one element in vector)
#> [1] 1
nchar(text) # 5 (five characters)
#> [1] 5
# Multiple strings
texts <- c("hi", "hello", "hey")
length(texts) # 3 (three elements)
#> [1] 3
nchar(texts) # 2 5 3 (characters in each)
#> [1] 2 5 3
# Empty string vs NULL
empty <- ""
length(empty) # 1 (one element)
#> [1] 1
nchar(empty) # 0 (no characters)
#> [1] 0
nothing <- character(0)
length(nothing) # 0 (no elements)
#> [1] 0
nchar(nothing) # integer(0)
#> integer(0)
# Common mistake
text <- "hello world"
length(text) # 1 (NOT 11!)
#> [1] 1
nchar(text) # 11 (including space)
#> [1] 11
# To split into characters
strsplit(text, "")[[1]]
#> [1] "h" "e" "l" "l" "o" " " "w" "o" "r" "l" "d"
length(strsplit(text, "")[[1]]) # 11
#> [1] 11
14.9 Empty Strings and NA
💡 Key Insight: Empty vs NA vs NULL
# Empty string (exists but empty)
empty <- ""
length(empty) # 1
#> [1] 1
nchar(empty) # 0
#> [1] 0
is.na(empty) # FALSE
#> [1] FALSE
empty == "" # TRUE
#> [1] TRUE
# NA (missing value)
missing <- NA_character_
length(missing) # 1
#> [1] 1
nchar(missing) # NA (can't count characters of NA)
#> [1] NA
is.na(missing) # TRUE
#> [1] TRUE
# NULL (doesn't exist)
nothing <- NULL
length(nothing) # 0
#> [1] 0
is.null(nothing) # TRUE
#> [1] TRUE
# In a vector
vec <- c("hello", "", NA, "world")
length(vec) # 4
#> [1] 4
nchar(vec) # 5 0 NA 5
#> [1] 5 0 NA 5
is.na(vec) # FALSE FALSE TRUE FALSE
#> [1] FALSE FALSE TRUE FALSE
vec == "" # FALSE TRUE NA FALSE (NA propagates!)
#> [1] FALSE TRUE NA FALSE
# Testing for empty strings safely
# Element-wise check for empty strings ("").
# Missing values are reported as FALSE rather than propagating NA, so the
# result can be used directly as a logical index.
is_empty <- function(x) {
  present <- !is.na(x)
  blank <- x == ""
  present & blank
}
is_empty(vec) # FALSE TRUE FALSE FALSE
#> [1] FALSE TRUE FALSE FALSE
14.10 Case Conversion
🎯 Best Practice: Case Operations
text <- "Hello World"
# Base R
toupper(text)
#> [1] "HELLO WORLD"
tolower(text)
#> [1] "hello world"
# First letter (no built-in function)
# Upper-case the first character of each element of `x`.
#
# x: a character vector (other atomic input is coerced by substr()).
# Returns a character vector of the same length as `x`.
#
# Missing values stay NA. paste0() converts NA to the literal text "NA", so
# without the explicit restore below a missing input would come back as the
# string "NANA".
capitalize <- function(x) {
  result <- paste0(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)))
  result[is.na(x)] <- NA_character_
  result
}
capitalize("hello")
#> [1] "Hello"
# stringr alternatives
library(stringr)
str_to_upper(text)
#> [1] "HELLO WORLD"
str_to_lower(text)
#> [1] "hello world"
str_to_title(text) # Title Case
#> [1] "Hello World"
# Handle NAs better
text_with_na <- c("hello", NA, "world")
toupper(text_with_na) # Preserves NA
#> [1] "HELLO" NA "WORLD"
str_to_upper(text_with_na) # Also preserves NA
#> [1] "HELLO" NA "WORLD"
14.11 Whitespace Issues
⚠️ Common Pitfall: Invisible Whitespace
# Strings that look the same
text1 <- "hello"
text2 <- " hello"
text3 <- "hello "
text4 <- "hello\n"
# But aren't equal
text1 == text2 # FALSE (leading space)
#> [1] FALSE
text1 == text3 # FALSE (trailing space)
#> [1] FALSE
text1 == text4 # FALSE (newline)
#> [1] FALSE
# Hard to see!
print(text2)
#> [1] " hello"
print(text3)
#> [1] "hello "
# Better visualization
cat("[", text1, "]\n", sep = "")
#> [hello]
cat("[", text2, "]\n", sep = "")
#> [ hello]
cat("[", text3, "]\n", sep = "")
#> [hello ]
# Trim whitespace
trimws(text2) # Remove leading/trailing
#> [1] "hello"
trimws(text3)
#> [1] "hello"
# stringr
library(stringr)
str_trim(text2)
#> [1] "hello"
str_squish("hello    world") # Remove extra internal spaces too
#> [1] "hello world"
14.12 String Comparison
💡 Key Insight: String Comparison
# Equality
"hello" == "hello" # TRUE
#> [1] TRUE
"hello" == "Hello" # FALSE (case-sensitive)
#> [1] FALSE
# Lexicographic ordering
"a" < "b" # TRUE
#> [1] TRUE
"apple" < "banana" # TRUE
#> [1] TRUE
"10" < "2" # TRUE (lexicographic, not numeric!)
#> [1] TRUE
# Vector comparison
c("a", "b") == c("a", "c") # TRUE FALSE
#> [1] TRUE FALSE
# %in% for membership
"apple" %in% c("apple", "banana", "cherry") # TRUE
#> [1] TRUE
# Case-insensitive comparison
tolower("Hello") == tolower("hello") # TRUE
#> [1] TRUE
# Partial matching (base R)
grep("app", c("apple", "banana", "application")) # 1 3
#> [1] 1 3
grepl("app", c("apple", "banana", "application")) # TRUE FALSE TRUE
#> [1] TRUE FALSE TRUE
# stringr
library(stringr)
str_detect(c("apple", "banana"), "app") # TRUE FALSE
#> [1] TRUE FALSE
str_which(c("apple", "banana", "app"), "app") # 1 3
#> [1] 1 3
14.13 Type Coercion with Strings
⚠️ Pitfall: Implicit String Coercion
# Combining strings and numbers
c("a", 1, "b", 2) # All become character
#> [1] "a" "1" "b" "2"
# In data frames (old R)
df_old <- data.frame(
id = 1:3,
name = c("Alice", "Bob", "Charlie"),
stringsAsFactors = TRUE # Old default
)
class(df_old$name) # "factor" (was default in R < 4.0)
#> [1] "factor"
# Modern R
df_new <- data.frame(
id = 1:3,
name = c("Alice", "Bob", "Charlie")
)
class(df_new$name) # "character"
#> [1] "character"
# Operations can coerce
x <- c(1, 2, 3)
y <- paste(x, "items")
y # "1 items" "2 items" "3 items"
#> [1] "1 items" "2 items" "3 items"
class(y) # "character"
#> [1] "character"
14.14 Summary
Key Takeaways:
- No string type - R has character vectors
- length() vs nchar() - Elements vs characters
- Escape quotes with \ or use different quotes
- Encoding matters - Specify UTF-8 when reading files
- Empty vs NA vs NULL - Three different concepts
- Convert to character before string operations
- Whitespace is invisible - Use trimws() or str_trim()
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| unexpected symbol | Quotes not escaped | Use \" or different quotes |
| not of mode character | Non-character input | as.character() first |
| invalid multibyte string | Encoding mismatch | Specify correct encoding |
| Wrong comparison | Case or whitespace | tolower() and trimws() |
Essential Functions:
# Creation
c(), paste(), paste0(), sprintf()
# Inspection
length(), nchar(), Encoding()
# Manipulation
substr(), substring(), strsplit()
toupper(), tolower(), trimws()
# Comparison
==, %in%, grep(), grepl()
# Conversion
as.character(), toString()
# stringr equivalents (better)
str_c(), str_length(), str_sub()
str_to_upper(), str_to_lower()
str_trim(), str_squish()
str_detect(), str_which()
Best Practices:
# ✅ Good
text <- 'He said "Hello"' # Different quotes
readLines("file.txt", encoding = "UTF-8") # Explicit encoding
trimws(text) # Clean whitespace
as.character(x) before string ops # Convert first
# ❌ Avoid
text <- "He said "Hello"" # Unescaped quotes
readLines("file.txt") # Platform-dependent encoding
Assuming no whitespace # Invisible characters
String operations on numbers # Type mismatch
14.15 Exercises
📝 Exercise 1: Quote Handling
Create strings containing:
1. Double quotes inside single quotes
2. Single quotes inside double quotes
3. Both quote types in one string
4. A file path with backslashes
📝 Exercise 2: Length vs Characters
You have: texts <- c("hi", "hello", "hey")
- Find number of elements
- Find characters in each
- Find total characters
- Find longest string
📝 Exercise 3: Clean Text
Write clean_text(x) that:
1. Trims whitespace
2. Converts to consistent case
3. Removes or replaces NAs
4. Reports what was changed
📝 Exercise 4: Safe String Operations
Write safe_substr(x, start, stop) that:
1. Converts to character if needed
2. Handles NAs appropriately
3. Handles out-of-bounds indices
4. Returns character vector
14.16 Exercise Answers
Click to see answers
Exercise 1:
# 1. Double quotes inside single quotes
text1 <- 'She said "Hello"'
cat(text1)
#> She said "Hello"
# 2. Single quotes inside double quotes
text2 <- "It's a nice day"
cat(text2)
#> It's a nice day
# 3. Both quote types
text3 <- "She said \"It's nice\"" # Escape double quotes
cat(text3)
#> She said "It's nice"
# Alternative with single outside
text3_alt <- 'She said "It\'s nice"' # Escape single quote
cat(text3_alt)
#> She said "It's nice"
# 4. File path with backslashes
path <- "C:\\Users\\Documents\\file.txt"
cat(path)
#> C:\Users\Documents\file.txt
# Or use forward slashes (works on all platforms)
path_alt <- "C:/Users/Documents/file.txt"
Exercise 2:
texts <- c("hi", "hello", "hey")
# 1. Number of elements
num_elements <- length(texts)
num_elements
#> [1] 3
# 2. Characters in each
chars_each <- nchar(texts)
chars_each
#> [1] 2 5 3
# 3. Total characters
total_chars <- sum(nchar(texts))
total_chars
#> [1] 10
# 4. Longest string
longest <- texts[which.max(nchar(texts))]
longest
#> [1] "hello"
# Or get length of longest
max_length <- max(nchar(texts))
max_length
#> [1] 5
# Complete analysis
# Summarise a character vector: how many elements it has, the character
# count of each, the total and mean character counts, and the longest and
# shortest elements (first match wins on ties, per which.max()/which.min()).
analyze_strings <- function(x) {
  widths <- nchar(x)
  idx_longest <- which.max(widths)
  idx_shortest <- which.min(widths)

  list(
    n_elements = length(x),
    chars_each = widths,
    total_chars = sum(widths),
    avg_chars = mean(widths),
    longest = x[idx_longest],
    shortest = x[idx_shortest]
  )
}
analyze_strings(texts)
#> $n_elements
#> [1] 3
#>
#> $chars_each
#> [1] 2 5 3
#>
#> $total_chars
#> [1] 10
#>
#> $avg_chars
#> [1] 3.333333
#>
#> $longest
#> [1] "hello"
#>
#> $shortest
#> [1] "hi"
Exercise 3:
# Clean a character vector and optionally report what was changed.
#
# Arguments:
#   x              Character vector to clean.
#   trim           If TRUE, strip leading/trailing whitespace with trimws().
#   case           "lower", "upper" or "none"; the first choice is the default.
#   na_action      "keep" NAs, "remove" them, or "replace" them.
#   na_replacement Value substituted for NA when na_action = "replace".
#   report         If TRUE, emit a message() listing every change applied.
#
# Returns the cleaned vector (shorter than `x` when NAs were removed).
# Each step is only reported when it actually altered at least one element.
clean_text <- function(x,
                       trim = TRUE,
                       case = c("lower", "upper", "none"),
                       na_action = c("keep", "remove", "replace"),
                       na_replacement = "",
                       report = TRUE) {
  case <- match.arg(case)
  na_action <- match.arg(na_action)
  changes <- list()

  # Handle NAs
  n_na <- sum(is.na(x))
  if (n_na > 0) {
    if (na_action == "remove") {
      x <- x[!is.na(x)]
      changes$na <- paste("Removed", n_na, "NAs")
    } else if (na_action == "replace") {
      x[is.na(x)] <- na_replacement
      changes$na <- paste("Replaced", n_na, "NAs with",
                          shQuote(na_replacement))
    } else {
      changes$na <- paste("Kept", n_na, "NAs")
    }
  }

  # Trim whitespace (NA elements are never counted as changed)
  if (trim) {
    trimmed <- trimws(x)
    had_whitespace <- !is.na(x) & x != trimmed
    if (any(had_whitespace)) {
      x <- trimmed
      changes$whitespace <- paste("Trimmed whitespace from",
                                  sum(had_whitespace), "strings")
    }
  }

  # Case conversion: always applied, but only reported when it changed
  # something, so the report matches what really happened.
  if (case != "none") {
    converted <- if (case == "lower") tolower(x) else toupper(x)
    case_changed <- !is.na(x) & converted != x
    if (any(case_changed)) {
      changes$case <- if (case == "lower") {
        "Converted to lowercase"
      } else {
        "Converted to uppercase"
      }
    }
    x <- converted
  }

  # Report
  if (report && length(changes) > 0) {
    message("Text cleaning applied:")
    for (change in changes) {
      message(" - ", change)
    }
  }

  x
}
# Test
messy <- c(" Hello ", "WORLD", NA, " Test ")
clean_text(messy, case = "lower", na_action = "replace",
na_replacement = "[missing]")
#> Text cleaning applied:
#> - Replaced 1 NAs with '[missing]'
#> - Trimmed whitespace from 2 strings
#> - Converted to lowercase
#> [1] "hello" "world" "[missing]" "test"
Exercise 4:
# Extract a substring from each element of `x` without surprises.
#
# Arguments:
#   x      Vector to extract from; non-character input is converted with
#          as.character() and a message is emitted.
#   start  Single non-missing position of the first character (values below
#          1 are raised to 1 with a warning).
#   stop   Single non-missing position of the last character; may exceed the
#          string length.
#
# Returns an unnamed character vector the same length as `x`. NA elements
# stay NA on every path, including the `stop < start` shortcut. Elements
# shorter than `start` yield "".
safe_substr <- function(x, start, stop) {
  # Convert to character if needed
  if (!is.character(x)) {
    message("Converting input to character")
    x <- as.character(x)
  }

  # Fail early with a readable error instead of an opaque `if` failure.
  stopifnot(
    length(start) == 1L, !is.na(start),
    length(stop) == 1L, !is.na(stop)
  )

  # Validate indices
  if (start < 1) {
    warning("start < 1, setting to 1")
    start <- 1
  }
  if (stop < start) {
    warning("stop < start, returning empty strings")
    empty <- rep("", length(x))
    empty[is.na(x)] <- NA_character_
    return(empty)
  }

  # substr() coerces positions to integer; clamp so that very large values
  # (e.g. Inf) mean "to the end of the string" instead of becoming NA.
  start <- min(start, .Machine$integer.max)
  stop <- min(stop, .Machine$integer.max)

  # substr() is vectorised, keeps NA as NA, truncates `stop` at the end of
  # each string and returns "" when `start` is past the end.
  # as.vector() drops names so the result stays a plain vector.
  as.vector(substr(x, start, stop))
}
# Test
safe_substr(c("hello", "world", NA, "R"), 1, 3)
#> [1] "hel" "wor" NA "R"
safe_substr(12345, 1, 3) # Auto-converts
#> Converting input to character
#> [1] "123"
safe_substr("short", 1, 100) # Beyond length
#> [1] "short"
safe_substr("test", 10, 20) # Start beyond length
#> [1] ""