Chapter 15 String Pattern Matching
What You’ll Learn:
- Regular expressions basics
- grep family functions
- Pattern matching errors
- Replacement operations
- Common regex pitfalls
Key Errors Covered: 12+ pattern matching errors
Difficulty: ⭐⭐⭐ Advanced
15.1 Introduction
Pattern matching in strings is powerful but error-prone:
# Try to match a pattern
text <- c("file1.txt", "file2.csv", "file3.txt")
grep(".", text) # Expect to find the dots
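#> [1] 1 2 3
In a regular expression "." matches any character, so all three elements match — not just the dots. Let’s master pattern matching and avoid regex pitfalls.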
15.2 Pattern Matching Basics
💡 Key Insight: grep Family Functions
texts <- c("apple", "banana", "apricot", "cherry")
# grep: return indices of matches
grep("ap", texts)
#> [1] 1 3
texts[grep("ap", texts)]
#> [1] "apple" "apricot"
# grepl: return logical vector
grepl("ap", texts)
#> [1] TRUE FALSE TRUE FALSE
# sub: replace first match
sub("a", "X", texts)
#> [1] "Xpple" "bXnana" "Xpricot" "cherry"
# gsub: replace all matches
gsub("a", "X", texts)
#> [1] "Xpple" "bXnXnX" "Xpricot" "cherry"
# regexpr: position of first match
regexpr("a", texts)
#> [1] 1 2 1 -1
#> attr(,"match.length")
#> [1] 1 1 1 -1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
# gregexpr: positions of all matches
gregexpr("a", texts)
#> [[1]]
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
#>
#> [[2]]
#> [1] 2 4 6
#> attr(,"match.length")
#> [1] 1 1 1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
#>
#> [[3]]
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
#>
#> [[4]]
#> [1] -1
#> attr(,"match.length")
#> [1] -1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
Key differences:
- grep() → indices
- grepl() → TRUE/FALSE
- sub() → replace first
- gsub() → replace all
- *expr() → positions
15.3 Error #1: invalid regular expression
⭐⭐⭐ ADVANCED 🔤 SYNTAX
15.3.1 The Error
# Unmatched bracket
grep("[abc", c("a", "b", "c"))
#> Warning in grep("[abc", c("a", "b", "c")): TRE pattern compilation error
#> 'Missing ']''
#> Error in grep("[abc", c("a", "b", "c")): invalid regular expression '[abc', reason 'Missing ']''
🔴 ERROR
Error in grep("[abc", c("a", "b", "c")) :
invalid regular expression '[abc', reason 'Missing ']''
15.3.3 Common Invalid Patterns
# Unmatched brackets
grep("[abc", "test")
#> Warning in grep("[abc", "test"): TRE pattern compilation error 'Missing ']''
#> Error in grep("[abc", "test"): invalid regular expression '[abc', reason 'Missing ']''
grep("abc]", "test")
#> integer(0)
# Unmatched parentheses
grep("(abc", "test")
#> Warning in grep("(abc", "test"): TRE pattern compilation error 'Missing ')''
#> Error in grep("(abc", "test"): invalid regular expression '(abc', reason 'Missing ')''
grep("abc)", "test")
#> integer(0)
# Invalid repetition
grep("a{2,1}", "test") # max < min
#> Warning in grep("a{2,1}", "test"): TRE pattern compilation error 'Invalid
#> contents of {}'
#> Error in grep("a{2,1}", "test"): invalid regular expression 'a{2,1}', reason 'Invalid contents of {}'
# Trailing backslash
grep("test\\", "test")
#> Warning in grep("test\\", "test"): TRE pattern compilation error 'Trailing
#> backslash'
#> Error in grep("test\\", "test"): invalid regular expression 'test\', reason 'Trailing backslash'
# Invalid escape
grep("\\k", "test") # \k not valid
#> integer(0)
15.3.4 Solutions
✅ SOLUTION 1: Escape Special Characters
# To match literal special characters, escape them
special_chars <- c(".", "*", "+", "?", "[", "]", "(", ")",
"{", "}", "^", "$", "|", "\\")
# Match literal dot
grep("\\.", c("file.txt", "file_txt"))
#> [1] 1
# Match literal bracket
grep("\\[", c("[test]", "test"))
#> [1] 1
# Match literal backslash
grep("\\\\", c("C:\\path", "C:/path"))
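#> [1] 1
✅ SOLUTION 2: Use fixed = TRUE for Literals
With fixed = TRUE the pattern is treated as a literal string rather than a regular expression, so no escaping is needed; for example, grep(".", c("file.txt", "file_txt"), fixed = TRUE) returns 1.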
✅ SOLUTION 3: Validate Pattern First
# Report whether `pattern` compiles as a regular expression.
#
# The pattern is tried against a throwaway string with grep(). If grep()
# raises an error, the reason is relayed with message() and FALSE is
# returned; otherwise TRUE is returned. Warnings raised by grep() are not
# intercepted.
is_valid_regex <- function(pattern) {
  report_failure <- function(err) {
    message("Invalid regex: ", conditionMessage(err))
    FALSE
  }
  tryCatch(
    {
      grep(pattern, "test")
      TRUE
    },
    error = report_failure
  )
}
# Test
is_valid_regex("[abc") # FALSE
#> Warning in grep(pattern, "test"): TRE pattern compilation error 'Missing ']''
#> Invalid regex: invalid regular expression '[abc', reason 'Missing ']''
#> [1] FALSE
is_valid_regex("[abc]") # TRUE
#> [1] TRUE
15.4 Regular Expression Special Characters
💡 Key Insight: Regex Special Characters
texts <- c("abc", "a.c", "a*c", "aXc", "ac", "abbc")
# . = any single character
grep("a.c", texts, value = TRUE)
#> [1] "abc" "a.c" "a*c" "aXc"
# * = zero or more of previous
grep("ab*c", texts, value = TRUE)
#> [1] "abc" "ac" "abbc"
# + = one or more of previous
grep("ab+c", texts, value = TRUE)
#> [1] "abc" "abbc"
# ? = zero or one of previous
grep("ab?c", texts, value = TRUE)
#> [1] "abc" "ac"
# ^ = start of string
grep("^a", texts, value = TRUE)
#> [1] "abc" "a.c" "a*c" "aXc" "ac" "abbc"
# $ = end of string
grep("c$", texts, value = TRUE)
#> [1] "abc" "a.c" "a*c" "aXc" "ac" "abbc"
# [abc] = any of a, b, or c
grep("a[bX]c", texts, value = TRUE)
#> [1] "abc" "aXc"
# [^abc] = anything except a, b, or c
grep("a[^b]c", texts, value = TRUE)
#> [1] "a.c" "a*c" "aXc"
# | = or
grep("a|c", texts, value = TRUE)
#> [1] "abc" "a.c" "a*c" "aXc" "ac" "abbc"
# {n} = exactly n
grep("b{2}", texts, value = TRUE)
#> [1] "abbc"
# {n,} = n or more
grep("b{1,}", texts, value = TRUE)
#> [1] "abc" "abbc"
# {n,m} = between n and m
grep("b{1,2}", texts, value = TRUE)
#> [1] "abc" "abbc"
To match a literal special character, escape it with \\ (for example, "\\." matches a literal dot and "\\*" a literal asterisk).
15.5 Error #2: Pattern Matches Everything/Nothing
⭐⭐ INTERMEDIATE 🧠 LOGIC
15.5.1 The Problem
# Want to find files with dots
files <- c("file1.txt", "file2.csv", "README")
# But . matches any character!
grep(".", files) # Matches all 3!
#> [1] 1 2 3
# Want to find emails
emails <- c("test@email.com", "notanemail", "another@test.org")
# A bare "@" is a very loose test: it accepts any string that contains "@"
grep("@", emails, value = TRUE) # OK so far...
#> [1] "test@email.com" "another@test.org"
# Wrapping it in .* adds no constraint, so the same two strings match
grep(".*@.*", emails, value = TRUE) # Same result as "@" alone
#> [1] "test@email.com" "another@test.org"
15.5.2 Common Pattern Mistakes
texts <- c("abc", "def", "xyz")
# .* matches everything (zero or more any character)
grep(".*", texts) # All match!
#> [1] 1 2 3
# Wrong escaping
grep(".", texts) # All match (. is any character)
#> [1] 1 2 3
grep("\\.", texts) # None match (no literal dots)
#> integer(0)
# Too greedy
text <- "value=123&other=456"
sub("=.*", "", text) # Removes too much! "value"
#> [1] "value"
15.5.3 Solutions
✅ SOLUTION 1: Be Specific
files <- c("file1.txt", "file2.csv", "README")
# Match literal dot
grep("\\.", files, value = TRUE)
#> [1] "file1.txt" "file2.csv"
# Match specific extension
grep("\\.txt$", files, value = TRUE)
#> [1] "file1.txt"
# Match email pattern
emails <- c("test@email.com", "notanemail", "another@test.org")
grep("[A-Za-z0-9.]+@[A-Za-z0-9.]+\\.[A-Za-z]{2,}", emails, value = TRUE)
#> [1] "test@email.com" "another@test.org"
✅ SOLUTION 2: Use Anchors
texts <- c("apple", "pineapple", "application")
# Without anchor: matches all
grep("app", texts, value = TRUE)
#> [1] "apple" "pineapple" "application"
# With ^: only at start
grep("^app", texts, value = TRUE)
#> [1] "apple" "application"
# With $: only at end
grep("app$", texts, value = TRUE)
#> character(0)
# Exact match
grep("^apple$", texts, value = TRUE)
#> [1] "apple"
✅ SOLUTION 3: Use Non-greedy Matching
text <- "value=123&other=456"
# Greedy: takes everything
sub("=.*&", "=X&", text) # "value=X&other=456"
#> [1] "value=X&other=456"
# Non-greedy (in Perl regex): *? or +?
sub("=.*?&", "=X&", text, perl = TRUE) # "value=X&other=456"
#> [1] "value=X&other=456"
# Alternative: use negated character class
sub("=[^&]*&", "=X&", text) # "value=X&other=456"
#> [1] "value=X&other=456"
15.6 stringr: Modern String Operations
🎯 Best Practice: Use stringr
library(stringr)
texts <- c("apple", "banana", "apricot")
# Detect pattern (like grepl)
str_detect(texts, "ap")
#> [1] TRUE FALSE TRUE
# Which match (like grep)
str_which(texts, "ap")
#> [1] 1 3
# Extract matches
str_subset(texts, "ap")
#> [1] "apple" "apricot"
# Count matches
str_count(texts, "a")
#> [1] 1 3 1
# Extract pattern
str_extract(texts, "ap")
#> [1] "ap" NA "ap"
str_extract_all(texts, "a")
#> [[1]]
#> [1] "a"
#>
#> [[2]]
#> [1] "a" "a" "a"
#>
#> [[3]]
#> [1] "a"
# Replace
str_replace(texts, "a", "X") # First match
#> [1] "Xpple" "bXnana" "Xpricot"
str_replace_all(texts, "a", "X") # All matches
#> [1] "Xpple" "bXnXnX" "Xpricot"
# Remove pattern
str_remove(texts, "ap") # First match
#> [1] "ple" "banana" "ricot"
str_remove_all(texts, "a") # All matches
#> [1] "pple" "bnn" "pricot"
# Split
str_split("a-b-c", "-")
#> [[1]]
#> [1] "a" "b" "c"
str_split("a-b-c", "-", simplify = TRUE)
#> [,1] [,2] [,3]
#> [1,] "a" "b" "c"
# Better error messages
str_detect(texts, "[invalid") # Clearer error
#> Error in stri_detect_regex(string, pattern, negate = negate, opts_regex = opts(pattern)): Missing closing bracket on a bracket expression. (U_REGEX_MISSING_CLOSE_BRACKET, context=`[invalid`)
15.7 Common Regex Patterns
🎯 Best Practice: Useful Patterns
library(stringr)
# Digits
texts <- c("abc123", "def456", "xyz")
str_extract_all(texts, "\\d+") # One or more digits
#> [[1]]
#> [1] "123"
#>
#> [[2]]
#> [1] "456"
#>
#> [[3]]
#> character(0)
# Non-digits
str_extract_all(texts, "\\D+") # One or more non-digits
#> [[1]]
#> [1] "abc"
#>
#> [[2]]
#> [1] "def"
#>
#> [[3]]
#> [1] "xyz"
# Word characters (letters, digits, underscore)
str_extract_all("hello_world123", "\\w+")
#> [[1]]
#> [1] "hello_world123"
# Whitespace
str_detect("hello world", "\\s")
#> [1] TRUE
# Email (simple)
email_pattern <- "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}"
str_detect("test@email.com", email_pattern)
#> [1] TRUE
# Phone (US)
phone_pattern <- "\\d{3}-\\d{3}-\\d{4}"
str_detect("123-456-7890", phone_pattern)
#> [1] TRUE
# URL (simple)
url_pattern <- "https?://[A-Za-z0-9.-]+"
str_extract("Visit http://example.com", url_pattern)
#> [1] "http://example.com"
# Extract numbers
text <- "Price: $19.99"
str_extract(text, "\\d+\\.?\\d*")
#> [1] "19.99"
# Extract words
text <- "hello world, how are you?"
str_extract_all(text, "\\w+")
#> [[1]]
#> [1] "hello" "world" "how" "are" "you"
15.8 Replacement Operations
💡 Key Insight: Replacement Strategies
library(stringr)
texts <- c("apple", "banana", "apricot")
# Simple replacement
str_replace(texts, "a", "X") # First 'a'
#> [1] "Xpple" "bXnana" "Xpricot"
str_replace_all(texts, "a", "X") # All 'a's
#> [1] "Xpple" "bXnXnX" "Xpricot"
# Using captured groups
str_replace("John Smith", "(\\w+) (\\w+)", "\\2, \\1")
#> [1] "Smith, John"
# Multiple replacements
text <- "I have 3 cats and 2 dogs"
str_replace_all(text, c("cats" = "birds", "dogs" = "fish"))
#> [1] "I have 3 birds and 2 fish"
# Conditional replacement
str_replace_all("hello", "l+", "L") # Multiple l's to one L
#> [1] "heLo"
# Remove pattern
str_remove("Price: $19.99", "\\$")
#> [1] "Price: 19.99"
str_remove_all("a-b-c-d", "-")
#> [1] "abcd"
# Case-insensitive
str_replace("Hello", regex("hello", ignore_case = TRUE), "Hi")
#> [1] "Hi"
15.9 Splitting Strings
⚠️ Common Pitfall: strsplit() Returns List
text <- "a,b,c"
# Returns a LIST
result <- strsplit(text, ",")
class(result) # "list"
#> [1] "list"
result # List of 1 element
#> [[1]]
#> [1] "a" "b" "c"
# To get vector, extract first element
result[[1]]
#> [1] "a" "b" "c"
# With multiple strings
texts <- c("a,b,c", "d,e,f")
result <- strsplit(texts, ",")
result # List of 2 elements
#> [[1]]
#> [1] "a" "b" "c"
#>
#> [[2]]
#> [1] "d" "e" "f"
# To get all values as vector
unlist(result)
#> [1] "a" "b" "c" "d" "e" "f"
# stringr alternative (also returns list)
str_split(text, ",")
#> [[1]]
#> [1] "a" "b" "c"
# But can simplify
str_split(text, ",", simplify = TRUE) # Matrix
#> [,1] [,2] [,3]
#> [1,] "a" "b" "c"
# Or use specific extraction
str_split_fixed(text, ",", n = 3) # Fixed number of pieces
#> [,1] [,2] [,3]
#> [1,] "a" "b" "c"
15.10 Error #3: 'replacement' is not a character vector
⭐ BEGINNER 🔢 TYPE
15.10.3 Solutions
✅ SOLUTION: Convert Replacement to Character
texts <- c("price: 10", "price: 20")
# Convert to character
sub("price: ", as.character(100), texts)
#> [1] "10010" "10020"
# Or use paste
sub("price: ", paste0("$", 100), texts)
#> [1] "$10010" "$10020"
# With stringr: a numeric replacement is NOT auto-converted
library(stringr)
str_replace(texts, "price: ", 100) # Errors; pass as.character(100) instead
#> Error in `str_replace()`:
#> ! `replacement` must be a character vector, not the number 100.
15.11 Case-Insensitive Matching
🎯 Best Practice: Ignore Case
texts <- c("Apple", "banana", "CHERRY")
# Base R: use ignore.case
grep("apple", texts, ignore.case = TRUE, value = TRUE)
#> [1] "Apple"
# Or convert to same case first
grep("apple", tolower(texts), value = TRUE)
#> [1] "apple"
# stringr: use regex() with ignore_case
library(stringr)
str_subset(texts, regex("apple", ignore_case = TRUE))
#> [1] "Apple"
# In replacement
str_replace(texts, regex("apple", ignore_case = TRUE), "Orange")
#> [1] "Orange" "banana" "CHERRY"
15.12 Unicode and Locales
⚠️ Platform Issue: Locale-Dependent Matching
# Character classes depend on locale
texts <- c("café", "naïve", "résumé")
# May behave differently on different systems
grep("[[:alpha:]]+", texts, value = TRUE)
#> [1] "café" "naïve" "résumé"
# Safer: specify UTF-8
Sys.setlocale("LC_CTYPE", "en_US.UTF-8")
#> [1] "en_US.UTF-8"
# Or use Unicode escapes. Write the escape with a single backslash so that R
# itself turns \u00e9 into the accented character; "caf\\u00e9" would hand a
# literal \u00e9 to the regex engine and match nothing.
grep("caf\u00e9", texts, value = TRUE)
#> [1] "café"
# stringr handles Unicode better
library(stringr)
str_detect(texts, "é") # More consistent across platforms
#> [1] TRUE FALSE TRUE
15.13 Extracting Patterns
🎯 Best Practice: Pattern Extraction
library(stringr)
# Extract all numbers
text <- "I have 3 cats, 2 dogs, and 15 fish"
str_extract_all(text, "\\d+")
#> [[1]]
#> [1] "3" "2" "15"
# Extract email addresses
text <- "Contact: john@example.com or jane@test.org"
str_extract_all(text, "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}")
#> [[1]]
#> [1] "john@example.com" "jane@test.org"
# Extract with groups
text <- "John Smith, Age: 30"
str_match(text, "(\\w+) (\\w+), Age: (\\d+)")
#> [,1] [,2] [,3] [,4]
#> [1,] "John Smith, Age: 30" "John" "Smith" "30"
# Named groups (requires stringr)
str_match(text, "(?<first>\\w+) (?<last>\\w+), Age: (?<age>\\d+)")
#> first last age
#> [1,] "John Smith, Age: 30" "John" "Smith" "30"
# Extract between delimiters
text <- "The value is [123] and the code is [ABC]"
str_extract_all(text, "\\[([^\\]]+)\\]")
#> [[1]]
#> [1] "[123]" "[ABC]"
str_match_all(text, "\\[([^\\]]+)\\]")
#> [[1]]
#> [,1] [,2]
#> [1,] "[123]" "123"
#> [2,] "[ABC]" "ABC"
15.14 Debugging Regex
🎯 Best Practice: Test and Debug Patterns
# Test patterns incrementally
text <- "test@email.com"
# Build up pattern piece by piece
str_detect(text, "\\w+") # Any word chars
#> [1] TRUE
str_detect(text, "\\w+@") # Word chars + @
#> [1] TRUE
str_detect(text, "\\w+@\\w+") # Add domain start
#> [1] TRUE
str_detect(text, "\\w+@\\w+\\.") # Add dot
#> [1] TRUE
str_detect(text, "\\w+@\\w+\\.\\w+") # Add extension
#> [1] TRUE
# Use str_view() to visualize (if available)
# str_view(text, "\\w+@\\w+\\.\\w+")
# Test on multiple examples
test_cases <- c(
"valid@email.com",
"invalid",
"no@domain",
"missing.at.sign.com"
)
pattern <- "\\w+@\\w+\\.\\w+"
data.frame(
text = test_cases,
matches = str_detect(test_cases, pattern)
)
#> text matches
#> 1 valid@email.com TRUE
#> 2 invalid FALSE
#> 3 no@domain FALSE
#> 4 missing.at.sign.com FALSE
15.15 Summary
Key Takeaways:
- Escape special characters - Use `\\` or `fixed = TRUE`
- `.` matches any character - Use `\\.` for a literal dot
- Use anchors - `^` for start, `$` for end
- stringr is easier - Better errors and consistency
- Test patterns incrementally - Build complex patterns step by step
- strsplit() returns list - Extract with `[[1]]` or use `str_split(..., simplify = TRUE)`
- Replacement must be character - Convert numbers with `as.character()`
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| invalid regular expression | Syntax error in pattern | Check brackets, escape specials |
| Matches everything/nothing | Wrong pattern | Test incrementally, use anchors |
| replacement not character | Numeric replacement | as.character() first |
| Different results by platform | Locale/encoding | Use stringr, specify UTF-8 |
Essential Patterns:
# Special characters
. # Any character
* # Zero or more
+ # One or more
? # Zero or one
^ # Start of string
$ # End of string
| # Or
[] # Character class
() # Group
# Character classes
\\d # Digit
\\D # Non-digit
\\w # Word character
\\W # Non-word
\\s # Whitespace
\\S # Non-whitespace
# Quantifiers
{n} # Exactly n
{n,} # n or more
{n,m} # Between n and m
grep Family:
grep(pattern, x) # Indices
grepl(pattern, x) # Logical
sub(pattern, repl, x) # Replace first
gsub(pattern, repl, x) # Replace all
# stringr alternatives (recommended)
str_detect(x, pattern)
str_which(x, pattern)
str_subset(x, pattern)
str_replace(x, pattern, replacement)
str_replace_all(x, pattern, replacement)
str_extract(x, pattern)
str_extract_all(x, pattern)
Best Practices:
# ✅ Good
grep("\\.", files) # Escape special chars
str_detect(text, "^pattern$") # Use anchors
str_replace_all(text, "a", "X") # Use stringr
fixed = TRUE # For literal matching
# ❌ Avoid
grep(".", files) # Matches everything
grep(unvalidated_pattern, text) # No error checking
sub() with unescaped specials # Unexpected matches
Platform-dependent locale assumptions # Inconsistent results
15.16 Exercises
📝 Exercise 1: Pattern Building
Build patterns to match:
1. Valid email addresses
2. Phone numbers (format: XXX-XXX-XXXX)
3. Dates (format: YYYY-MM-DD)
4. URLs starting with http:// or https://
📝 Exercise 2: Text Extraction
From: "Price: $19.99, Quantity: 5 units"
Extract:
1. The price (numeric only)
2. The quantity (number only)
3. Both in a named vector
📝 Exercise 3: Safe Pattern Matching
Write safe_grep(pattern, x) that:
1. Validates pattern first
2. Provides helpful errors
3. Returns indices with option for values
4. Handles empty inputs
📝 Exercise 4: Text Cleaning
Write clean_identifiers(x) that:
1. Removes special characters
2. Converts spaces to underscores
3. Converts to lowercase
4. Ensures valid R variable names
15.17 Exercise Answers
Click to see answers
Exercise 1:
library(stringr)
# 1. Email pattern
email_pattern <- "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}"
test_emails <- c(
"valid@email.com",
"also.valid@test.co.uk",
"invalid",
"missing@domain"
)
str_subset(test_emails, email_pattern)
#> [1] "valid@email.com" "also.valid@test.co.uk"
# 2. Phone pattern (XXX-XXX-XXXX)
phone_pattern <- "\\d{3}-\\d{3}-\\d{4}"
test_phones <- c(
"123-456-7890",
"555-1234",
"not a phone",
"123-456-789"
)
str_subset(test_phones, phone_pattern)
#> [1] "123-456-7890"
# 3. Date pattern (YYYY-MM-DD)
date_pattern <- "\\d{4}-\\d{2}-\\d{2}"
test_dates <- c(
"2024-01-15",
"2024-1-5",
"not a date",
"2024/01/15"
)
str_subset(test_dates, date_pattern)
#> [1] "2024-01-15"
# 4. URL pattern
url_pattern <- "https?://[A-Za-z0-9.-]+(/[A-Za-z0-9._~:/?#\\[\\]@!$&'()*+,;=-]*)?"
test_urls <- c(
"http://example.com",
"https://test.org/path",
"not a url",
"ftp://wrong.com"
)
str_subset(test_urls, url_pattern)
#> [1] "http://example.com" "https://test.org/path"
Exercise 2:
text <- "Price: $19.99, Quantity: 5 units"
# 1. Extract price
price <- str_extract(text, "\\d+\\.\\d+")
as.numeric(price)
#> [1] 19.99
# 2. Extract quantity
quantity <- str_extract(text, "Quantity: (\\d+)")
quantity <- str_extract(quantity, "\\d+")
as.numeric(quantity)
#> [1] 5
# 3. Both in named vector
# Pull the price and the quantity out of `text` and return them together as
# a named numeric vector (names "price" and "quantity").
#
# The price is the digits.digits run that directly follows a "$"; the
# quantity is the digit run that directly follows "Quantity: ". Both are
# located with look-behind patterns via str_extract() and then converted
# with as.numeric().
extract_both <- function(text) {
  price_txt <- str_extract(text, "(?<=\\$)\\d+\\.\\d+")
  quantity_txt <- str_extract(text, "(?<=Quantity: )\\d+")
  c(
    price = as.numeric(price_txt),
    quantity = as.numeric(quantity_txt)
  )
}
extract_both(text)
#> price quantity
#> 19.99 5.00
# Alternative: using str_match with groups
pattern <- "Price: \\$(\\d+\\.\\d+), Quantity: (\\d+)"
matches <- str_match(text, pattern)
c(
price = as.numeric(matches[, 2]),
quantity = as.numeric(matches[, 3])
)
#> price quantity
#> 19.99 5.00
Exercise 3:
# Search `x` for a regular expression, failing early with guidance when the
# pattern does not compile.
#
# Arguments:
#   pattern      Regular expression handed to grep().
#   x            Vector to search. Non-character input is converted with
#                as.character(), and a message announces the conversion.
#   value        FALSE (default) returns the indices of matching elements;
#                TRUE returns the matching elements themselves.
#   ignore.case  Passed through to grep().
#
# Returns the grep() result. For zero-length `x` it returns integer(0), or
# character(0) when `value` is TRUE, after a message saying the input is
# empty. Otherwise a message reports how many elements matched.
#
# Signals an error when grep() rejects `pattern`. The pattern is checked
# before anything else, so an invalid regex is reported even for empty `x`.
safe_grep <- function(pattern, x, value = FALSE, ignore.case = FALSE) {
  # Validate the pattern first: a throwaway grep() forces compilation, and
  # any error it raises marks the pattern as invalid.
  pattern_valid <- tryCatch(
    {
      grep(pattern, "test")
      TRUE
    },
    error = function(e) FALSE
  )
  if (!pattern_valid) {
    stop("Invalid regular expression pattern: '", pattern, "'\n",
         "Check for:\n",
         " - Unmatched brackets: [ ] ( )\n",
         " - Invalid escapes\n",
         " - Unescaped special characters: . * + ? ^ $")
  }

  # Nothing to search: return an empty result of the type the caller asked for.
  if (length(x) == 0) {
    message("Input vector is empty")
    return(if (value) character(0) else integer(0))
  }

  if (!is.character(x)) {
    message("Converting input to character")
    x <- as.character(x)
  }

  result <- grep(pattern, x, value = value, ignore.case = ignore.case)

  # length(result) is the match count for both index and value output.
  message(
    "Found ", length(result), " match(es) out of ", length(x), " elements"
  )
  result
}
# Test
safe_grep("ap", c("apple", "banana", "apricot"))
#> Found 2 match(es) out of 3 elements
#> [1] 1 3
safe_grep("ap", c("apple", "banana", "apricot"), value = TRUE)
#> Found 2 match(es) out of 3 elements
#> [1] "apple" "apricot"
safe_grep("[invalid", c("test")) # Clear error
#> Warning in grep(pattern, "test"): TRE pattern compilation error 'Missing ']''
#> Error in safe_grep("[invalid", c("test")): Invalid regular expression pattern: '[invalid'
#> Check for:
#> - Unmatched brackets: [ ] ( )
#> - Invalid escapes
#> - Unescaped special characters: . * + ? ^ $
Exercise 4:
# Turn arbitrary labels into syntactically valid, unique R identifiers.
#
# Argument:
#   x  Vector of labels; non-character input is converted with
#      as.character().
#
# Returns a character vector produced by these steps, in order: trim
# surrounding whitespace, lower-case, collapse each whitespace run to "_",
# drop every character outside [a-z0-9_], prefix a leading digit with "x",
# replace empty strings with "var", then pass the result through
# make.names(unique = TRUE).
#
# stringr functions are called with an explicit `stringr::` prefix instead
# of attaching the package with library() inside the function, so calling
# this helper does not change the caller's search path.
clean_identifiers <- function(x) {
  if (!is.character(x)) {
    x <- as.character(x)
  }

  x <- stringr::str_trim(x)
  x <- stringr::str_to_lower(x)

  # Whitespace becomes "_" before the strip step so word boundaries survive.
  x <- stringr::str_replace_all(x, "\\s+", "_")
  x <- stringr::str_replace_all(x, "[^a-z0-9_]", "")

  # An identifier may not start with a digit; keep the digit, prefix "x".
  x <- stringr::str_replace(x, "^(\\d)", "x\\1")

  # Labels made only of stripped characters end up empty.
  x[x == ""] <- "var"

  # Final guard: make.names() fixes anything still invalid and deduplicates.
  make.names(x, unique = TRUE)
}
# Test
messy <- c("My Variable!", "123 Start", "test@#$", " spaces ", "")
clean_identifiers(messy)
#> [1] "my_variable" "x123_start" "test" "spaces" "var"