5 Strings
5.1 Basic string operations
- How to use regular expressions?
# A plain character pattern is treated as a regular expression, so wrapping
# it in stringr::regex() is optional unless extra options are needed.
stringr::str_detect("1", "\\d")
#> [1] TRUE
# is the same as
stringr::str_detect("1", stringr::regex("\\d"))
#> [1] TRUE
# ____________________________________________________________________________
Paste/concatenate strings.
# stringr: the NA argument makes the whole result NA
stringr::str_c("a", "b", NA, NaN, NULL, factor("lalala"), character(0))
#> [1] NA
# ____________________________________________________________________________
# base R: paste0() is paste(..., sep = ""); NA and NaN are pasted as text and
# zero-length arguments (NULL, character(0)) contribute nothing
paste0("a", "b", NA, NaN, NULL, factor("lalala"), character(0))
#> [1] "abNANaNlalala"
Treat NA’s as strings ("NA").
stringr::str_replace_na(c(NA, NaN, factor("lalala"), character(0)))
#> [1] "NA"  "NaN" "1"
# note that the `c` function coerces the factor to its integer representation
# ____________________________________________________________________________
# Base R counterpart of str_replace_na(): always returns a character vector
# in which missing values are replaced by the string "NA".
# Converting to character first keeps a numeric NaN as "NaN"; calling
# is.na() on the numeric input would also flag NaN and turn it into "NA".
base_replace_na <- function(x) {
  x <- as.character(x)
  x[is.na(x)] <- "NA"
  x
}
base_replace_na(c(NA, NaN, factor("lalala"), character(0)))
#> [1] "NA"  "NaN" "1"
Count the number of letters in a string.
# Both functions receive the same character vector: c() drops NULL and turns
# the factor into its integer code, giving "four", NA, "NaN" and "1".
stringr::str_length(
  c("four", NA, NaN, NULL, factor("lalala"))
)
#> [1]  4 NA  3  1
# ____________________________________________________________________________
nchar(
  c("four", NA, NaN, NULL, factor("lalala"))
)
#> [1]  4 NA  3  1
Return the last letter of a string, subset a string.
# a negative position counts from the end of the string
stringr::str_sub(c("five", "six"), -1)
#> [1] "e" "x"
# ____________________________________________________________________________
# base R: the last letter sits at position nchar() of each string
numbers <- c("five", "six")
last_pos <- nchar(numbers)
substr(numbers, start = last_pos, stop = last_pos)
#> [1] "e" "x"
Repeat/replicate/duplicate a string several times.
stringr::str_dup("_a_", 5)
#> [1] "_a__a__a__a__a_"
# ____________________________________________________________________________
# base R (>= 3.3.0): strrep() repeats a string directly, no rep() + paste()
# round trip needed
strrep("_a_", 5)
#> [1] "_a__a__a__a__a_"
Remove/trim spaces at the beginning (leading) and/or end (trailing) of a string.
stringr::str_trim(c(" 2left_1right ", " "), side = c("left"))
#> [1] "2left_1right " ""
# ____________________________________________________________________________
# base R: trimws() with `which` playing the role of `side`.
# As a regex, "^\\s+" matches leading and "\\s+$" trailing whitespace;
# "^\\s+|\\s+$" covers both ends at once.
trimws(c(" 2left_1right ", " "), which = "left")
#> [1] "2left_1right " ""
Fill/pad spaces (or other characters) before and/or after a string.
stringr::str_pad(c("lalala", ""), width = 7)
#> [1] " lalala" "       "
# ____________________________________________________________________________
# Left-pad every element of `x` with spaces up to `width` characters; strings
# that are already long enough are returned unchanged.
# strrep() builds each padding as ONE string. Pasting a vector of single
# spaces (rep(" ", k)) onto `x` would recycle `x` and repeat it k times.
base_pad_left <- function(x, width) {
  paste0(strrep(" ", pmax(0, width - nchar(x))), x)
}
base_pad_left(c("lalala", ""), width = 7)
#> [1] " lalala" "       "
Sort strings, for example according to the alphabetical order of a different locale.
# The vowels come before the consonants in Hawaiian
stringr::str_sort(
  letters[1:10],
  locale = "haw"
)
#>  [1] "a" "e" "i" "b" "c" "d" "f" "g" "h" "j"
# ____________________________________________________________________________
# one can use `sort()`, but one has to know the specific locale
Return the order of string elements within a specific alphabet.
# The vowels come before the consonants in Hawaiian
stringr::str_order(letters[1:10], locale = "haw")
#>  [1]  1  5  9  2  3  4  6  7  8 10
# ____________________________________________________________________________
# one can use `order()` (the index-returning counterpart of `sort()`), but one
# has to know the specific locale
Change the encoding of a string
# Example from encoding?stringi::stringi
# `x` holds the single byte 177; how it reads depends on the assumed encoding.
x <- rawToChar(as.raw(177))
x
#> [1] "±"
stringr::str_conv(x, "ISO-8859-2") # Polish "a with ogonek", (run this line in RStudio)
#> [1] "a"
stringr::str_conv(x, "ISO-8859-1") # Plus-minus
#> [1] "±"
# ____________________________________________________________________________
Return the first two words of a string
# extract word number 1 through word number 2
stringr::word(
  "Hello, how are you?",
  1,
  2
)
#> [1] "Hello, how"
# ____________________________________________________________________________
Format a string (change width, indentation, …).
# wrap at 10 characters; lines after the first are indented by `exdent` spaces
cat(
  stringr::str_wrap(
    "Hello, how are you?",
    width = 10,
    indent = 0,
    exdent = 2
  )
)
#> Hello, how
#>   are you?
# ____________________________________________________________________________
5.2 Pattern matching
How to disable case sensitivity in regex pattern?
# wrap the pattern in regex() to switch options such as case sensitivity
stringr::str_detect(
  c("a", "b", "c"),
  stringr::regex("b|A|D", ignore_case = TRUE)
)
#> [1]  TRUE  TRUE FALSE
Does a string contain a specific sequence of characters?
# stringr takes the strings first, the pattern second
stringr::str_detect(
  c("my number is 110", "call 911"),
  "\\d{3}"
)
#> [1] TRUE TRUE
# ____________________________________________________________________________
# base R takes the pattern first, the strings second
grepl(
  "\\d{3}",
  c("my number is 110", "call 911")
)
#> [1] TRUE TRUE
Return every string that contains a specific sequence of characters?
# keep only the strings that contain three consecutive digits
stringr::str_subset(
  c("my number is 110", "call 911"),
  "\\d{3}"
)
#> [1] "my number is 110" "call 911"
# ____________________________________________________________________________
# base R: `value = TRUE` returns the matching strings rather than their indices
grep(
  "\\d{3}",
  c("my number is 110", "call 911"),
  value = TRUE
)
#> [1] "my number is 110" "call 911"
Return the (first) start and end index of substrings within a string?
# use stringr::str_locate_all to get a list of matrices with all start and
# end indexes
stringr::str_locate(c("my 112 number is 110", "call 911"), c("\\d{3}"))
#>      start end
#> [1,]     4   6
#> [2,]     6   8
# ____________________________________________________________________________
# regexpr() reports the first match per string, like str_locate();
# gregexpr() would be the starting point for a str_locate_all() equivalent
matches <- regexpr("\\d{3}", c("my 112 number is 110", "call 911"))
# formatting
match_starts <- as.integer(matches) # as.integer() drops the match attributes
# regexpr() marks "no match" with -1; report NA for start and end instead
match_starts[match_starts == -1L] <- NA_integer_
match_ends <- match_starts + attr(matches, "match.length") - 1L
match_indices <- cbind(start = match_starts, end = match_ends)
# output
match_indices
#>      start end
#> [1,]     4   6
#> [2,]     6   8
Return the (first) matched substring within a string?
# use stringr::str_extract_all to get all matched substrings (a list by
# default, a character matrix with `simplify = TRUE`)
stringr::str_extract(c("my 112 number is 110", "call 911"), c("\\d{3}"))
#> [1] "112" "911"
# ____________________________________________________________________________
# combine `substr()` with match_indices from above, or let `regmatches()`
# pull out the first match directly (strings without a match are dropped
# from the result)
phone_texts <- c("my 112 number is 110", "call 911")
regmatches(phone_texts, regexpr("\\d{3}", phone_texts))
#> [1] "112" "911"
Return the matching strings and matched capture groups.
# the example from the stringr vignette
strings <- c(
  "apple",
  "219 733 8965",
  "329-293-8753",
  "Work: 579-499-7527; Home: 543.355.3679"
)
# three capture groups: area code, exchange, line number
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
# column 1 is the complete match, columns 2-4 are the capture groups
stringr::str_match(strings, phone)
#>      [,1]           [,2]  [,3]  [,4]
#> [1,] NA             NA    NA    NA
#> [2,] "219 733 8965" "219" "733" "8965"
#> [3,] "329-293-8753" "329" "293" "8753"
#> [4,] "579-499-7527" "579" "499" "7527"
# Use stringr::str_match_all for list return
# ____________________________________________________________________________
# `regmatches()`
Replace/substitute the first matching substring within a string.
# "\\1" refers to the first capture group (the last two of the three digits)
stringr::str_replace(
  c("my 112 number is 110", "call 911"),
  "\\d{1}(\\d{2})",
  "\\1\\1"
)
#> [1] "my 1212 number is 110" "call 1111"
# ____________________________________________________________________________
# base R: pattern and replacement come first, the strings last
sub(
  "\\d{1}(\\d{2})",
  "\\1\\1",
  c("my 112 number is 110", "call 911")
)
#> [1] "my 1212 number is 110" "call 1111"
Replace/substitute all matching substrings within a string.
# every match is replaced, not only the first one per string
stringr::str_replace_all(
  c("my 112 number is 110", "call 911"),
  "\\d{1}(\\d{2})",
  "\\1\\1"
)
#> [1] "my 1212 number is 1010" "call 1111"
# ____________________________________________________________________________
# base R: gsub() is the "replace all" sibling of sub()
gsub(
  "\\d{1}(\\d{2})",
  "\\1\\1",
  c("my 112 number is 110", "call 911")
)
#> [1] "my 1212 number is 1010" "call 1111"
Split strings by any pattern into different (optionally n) pieces.
# returns a matrix and pads with empty strings when n is greater than the
# number of pieces
stringr::str_split_fixed(
  c("lalala lala la", "blabla lala la"),
  c(" ", "b"),
  5
)
#>      [,1]     [,2]   [,3]         [,4] [,5]
#> [1,] "lalala" "lala" "la"         ""   ""
#> [2,] ""       "la"   "la lala la" ""   ""
# returns a list
stringr::str_split(
  c("lalala lala la", "blabla lala la"),
  c(" ", "b"),
  5
)
#> [[1]]
#> [1] "lalala" "lala"   "la"
#>
#> [[2]]
#> [1] ""           "la"         "la lala la"
# ____________________________________________________________________________
# returns a list, but does not have an n argument (however, it shouldn't be
# too hard to create one).
strsplit(
  c("lalala lala la", "blabla lala la"),
  c(" ", "b")
)
#> [[1]]
#> [1] "lalala" "lala"   "la"
#>
#> [[2]]
#> [1] ""           "la"         "la lala la"
Set all characters to lower/upper case.
# stringr
stringr::str_to_lower("aBcDe")
#> [1] "abcde"
stringr::str_to_upper("aBcDe")
#> [1] "ABCDE"
# ____________________________________________________________________________
# base R
tolower("aBcDe")
#> [1] "abcde"
toupper("aBcDe")
#> [1] "ABCDE"
Truncate a string.
# cut on the right after 7 characters; the empty ellipsis adds no marker
stringr::str_trunc(
  "123456789",
  width = 7,
  side = "right",
  ellipsis = ""
)
#> [1] "1234567"
# ____________________________________________________________________________
5.2.1 Regular expressions
- Where can I learn about regular expressions?
?stringr::modifiers
for special regular expression functionality of the stringr package.
- reference sheet
- interactively test
- Build a regular expression
- Try the `str_view()` function to see which substring a regex matches
```r
# the dot is escaped ("\\."), so the pattern stands for the literal text "a.c"
stringr::str_view(c("abc", "a.c_blabla", "bef"), "a\\.c")
```
- How to test regular expressions?
When writing regular expressions, I strongly recommend generating a list of positive (pattern should match) and negative (pattern shouldn’t match) test cases to ensure that you are matching the correct components.
5.2.2 Modifiers
Count the number of words within a string.
# The example text spans two lines: a line break followed by 11 spaces of
# indentation separates the two sentences. The break is written as an
# explicit "\n" so it cannot get lost when the code is reformatted.
words <- "These are some words. \n           Some more words."
stringr::str_count(words, stringr::boundary("word"))
#> [1] 7
# ____________________________________________________________________________
Count the number of lines within a string.
# number of "line_break" boundaries minus number of "word" boundaries
n_line_breaks <- stringr::str_count(words, stringr::boundary("line_break"))
n_words <- stringr::str_count(words, stringr::boundary("word"))
n_line_breaks - n_words
#> [1] 1
# ____________________________________________________________________________
Count the number of sentences within a string.
# count the pieces delimited by sentence boundaries
stringr::str_count(
  words,
  stringr::boundary("sentence")
)
#> [1] 2
# ____________________________________________________________________________
Split a string by characters.
# split at every character boundary
stringr::str_split(
  words,
  stringr::boundary("character")
)
#> [[1]]
#>  [1] "T"  "h"  "e"  "s"  "e"  " "  "a"  "r"  "e"  " "  "s"  "o"  "m"  "e"
#> [15] " "  "w"  "o"  "r"  "d"  "s"  "."  " "  "\n" " "  " "  " "  " "  " "
#> [29] " "  " "  " "  " "  " "  " "  "S"  "o"  "m"  "e"  " "  "m"  "o"  "r"
#> [43] "e"  " "  "w"  "o"  "r"  "d"  "s"  "."
# ____________________________________________________________________________
# base R: splitting on the empty string yields single characters
strsplit(
  words,
  ""
)
#> [[1]]
#>  [1] "T"  "h"  "e"  "s"  "e"  " "  "a"  "r"  "e"  " "  "s"  "o"  "m"  "e"
#> [15] " "  "w"  "o"  "r"  "d"  "s"  "."  " "  "\n" " "  " "  " "  " "  " "
#> [29] " "  " "  " "  " "  " "  " "  "S"  "o"  "m"  "e"  " "  "m"  "o"  "r"
#> [43] "e"  " "  "w"  "o"  "r"  "d"  "s"  "."
5.3 List output
- How to work with string functions, that return lists?
5.4 Resources
- stringr vignette
- [Regular Expressions in R - Cheatsheet](https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf)