5 Strings

5.1 Basic string operations

How to use regular expressions?

    stringr::str_detect("1", "\\d")
#> [1] TRUE
    # is the same as
    stringr::str_detect("1", stringr::regex("\\d"))
#> [1] TRUE
    #   ____________________________________________________________________________

Paste/concatenate strings.

stringr::str_c("a", "b", NA, NaN, NULL, factor("lalala"), character(0))
#> [1] NA
#   ____________________________________________________________________________
# base R turns NA/NaN into the strings "NA"/"NaN" and drops zero-length
# arguments (NULL, character(0)); `paste0()` is `paste()` without a separator
paste0("a", "b", NA, NaN, NULL, factor("lalala"), character(0))
#> [1] "abNANaNlalala"

Treat NA’s as strings ("NA").

stringr::str_replace_na(c(NA, NaN, factor("lalala"), character(0)))
#> [1] "NA"  "NaN" "1"
# note that the `c` function coerces the factor to its integer representation
#   ____________________________________________________________________________
# Base R counterpart of `stringr::str_replace_na()`: coerce `x` to character
# and replace missing values with the string "NA".
# Coercing first keeps a numeric NaN as "NaN" (`is.na(NaN)` is TRUE, so
# testing the raw input would turn it into "NA") and guarantees a character
# result, even for empty input.
base_replace_na <- function(x) {
  x <- as.character(x)
  x[is.na(x)] <- "NA"
  x
}
base_replace_na(c(NA, NaN, factor("lalala"), character(0)))
#> [1] "NA"  "NaN" "1"

Count the number of letters in a string.

# `c()` coerces all elements to character here: NaN becomes "NaN" (3 letters),
# the factor becomes its integer code "1" (1 letter) and NULL is dropped
stringr::str_length(c("four", NA, NaN, NULL, factor("lalala")))
#> [1]  4 NA  3  1
#   ____________________________________________________________________________
nchar(c("four", NA, NaN, NULL, factor("lalala")))
#> [1]  4 NA  3  1

Return the last letter of a string, subset a string.

stringr::str_sub(c("five", "six"), -1)
#> [1] "e" "x"
#   ____________________________________________________________________________
# start at the last character of each string and read to its end
substring(c("five", "six"), first = nchar(c("five", "six")))
#> [1] "e" "x"

Repeat/replicate/duplicate a string several times.

stringr::str_dup("_a_", 5)
#> [1] "_a__a__a__a__a_"
#   ____________________________________________________________________________
paste(rep("_a_", 5), collapse = "")
#> [1] "_a__a__a__a__a_"

Remove/trim spaces at the beginning (leading) and/or end (trailing) of a string.

stringr::str_trim(c("  2left_1right ", "   "), side = c("left"))
#> [1] "2left_1right " ""
#   ____________________________________________________________________________
# `trimws()` is the base R counterpart; `which` is "left", "right" or "both".
# The same with a regex: `sub("^\\s+", "", x)` removes leading whitespace,
# `sub("\\s+$", "", x)` removes trailing whitespace and
# `gsub("^\\s+|\\s+$", "", x)` removes both.
trimws(c("  2left_1right ", "   "), which = "left")
#> [1] "2left_1right " ""

Fill/pad spaces (or other characters) before and/or after a string.

stringr::str_pad(c("lalala", ""), width = 7)
#> [1] " lalala" "       "
#   ____________________________________________________________________________
# Left-pad every string in `x` with spaces up to `width` characters; strings
# that already have `width` or more characters are returned unchanged and
# missing strings stay NA.
# `strrep()` builds the padding as ONE string. Pasting `rep(" ", n)` to `x`
# would recycle `x` and repeat it n times (e.g. "ab" -> " ab ab ab ab ab").
base_pad <- function(x, width) {
  n_missing <- pmax(width - nchar(x), 0)
  padded <- paste0(strrep(" ", n_missing), x)
  padded[is.na(x)] <- NA_character_
  padded
}
base_pad(c("lalala", ""), width = 7)
#> [1] " lalala" "       "

Sort strings, for example by a locale-specific alphabetical order.

# The vowels come before the consonants in Hawaiian
stringr::str_sort(letters[1:10], locale = "haw")
#>  [1] "a" "e" "i" "b" "c" "d" "f" "g" "h" "j"
#   ____________________________________________________________________________
# one can use `sort()`, but one has to know the specific locale

Return the order of string elements within a specific alphabet.

# The vowels come before the consonants in Hawaiian
stringr::str_order(letters[1:10], locale = "haw")
#>  [1]  1  5  9  2  3  4  6  7  8 10
#   ____________________________________________________________________________
# one can use `order()`, but one has to know the specific locale

Change the encoding of a string

# Example from encoding?stringi::stringi
x <- rawToChar(as.raw(177))
x
#> [1] "±"
stringr::str_conv(x, "ISO-8859-2") # Polish "a with ogonek", (run this line in RStudio)
#> [1] "ą"
stringr::str_conv(x, "ISO-8859-1") # Plus-minus
#> [1] "±"
#   ____________________________________________________________________________

Return the first two words of a string

stringr::word("Hello, how are you?", 1, 2)
#> [1] "Hello, how"
#   ____________________________________________________________________________

Format a string (change width, indentation, …).

cat(stringr::str_wrap("Hello, how are you?", width = 10, indent = 0, exdent = 2))
#> Hello, how
#>   are you?
#   ____________________________________________________________________________

5.2 Pattern matching

How to disable case sensitivity in regex pattern?

stringr::str_detect(c("a", "b", "c"), stringr::regex("b|A|D", ignore_case = TRUE))
#> [1]  TRUE  TRUE FALSE

Does a string contain a specific sequence of characters?

stringr::str_detect(c("my number is 110", "call 911"), "\\d{3}")
#> [1] TRUE TRUE
#   ____________________________________________________________________________
grepl("\\d{3}", c("my number is 110", "call 911"))
#> [1] TRUE TRUE

Return every string that contains a specific sequence of characters?

stringr::str_subset(c("my number is 110", "call 911"), "\\d{3}")
#> [1] "my number is 110" "call 911"
#   ____________________________________________________________________________
grep("\\d{3}", c("my number is 110", "call 911"), value = TRUE)
#> [1] "my number is 110" "call 911"

Return the (first) start and end index of substrings within a string?

# use stringr::str_locate_all to get a list of matrices with all start and end indexes
stringr::str_locate(c("my 112 number is 110", "call 911"), c("\\d{3}"))
#>      start end
#> [1,]     4   6
#> [2,]     6   8
#   ____________________________________________________________________________
# `regexpr()` finds the first match per string (like `str_locate()`);
# use `gregexpr()` to construct something similar to `str_locate_all()`.
# Returns an integer matrix with the columns "start" and "end". Strings
# without a match get NA in both columns, as in `stringr::str_locate()`
# (`regexpr()` itself reports -1 for them).
base_locate <- function(string, pattern) {
  matches <- regexpr(pattern, string)
  starts <- as.integer(matches)
  ends <- starts + attr(matches, "match.length") - 1L
  no_match <- is.na(starts) | starts == -1L
  starts[no_match] <- NA_integer_
  ends[no_match] <- NA_integer_
  cbind(start = starts, end = ends)
}
match_indices <- base_locate(c("my 112 number is 110", "call 911"), "\\d{3}")

# output
match_indices
#>      start end
#> [1,]     4   6
#> [2,]     6   8

Return the (first) matched substring within a string?

# use stringr::str_extract_all to get a list of all matched substrings (or a matrix with simplify = TRUE)
stringr::str_extract(c("my 112 number is 110", "call 911"), c("\\d{3}"))
#> [1] "112" "911"
#   ____________________________________________________________________________
# combine `substr()` with match_indices from above

Return the matching strings and matched capture groups.

# the example from the stringr vignette
strings <- c(
  "apple",
  "219 733 8965", 
  "329-293-8753", 
  "Work: 579-499-7527; Home: 543.355.3679"
  )
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

stringr::str_match(strings, phone)
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] NA             NA    NA    NA    
#> [2,] "219 733 8965" "219" "733" "8965"
#> [3,] "329-293-8753" "329" "293" "8753"
#> [4,] "579-499-7527" "579" "499" "7527"
# Use stringr::str_match_all for list return
#   ____________________________________________________________________________
# `regmatches()`

Replace/substitute the first matching substring within a string.

stringr::str_replace(c("my 112 number is 110", "call 911"),
                     c("\\d{1}(\\d{2})"), "\\1\\1")
#> [1] "my 1212 number is 110" "call 1111"
#   ____________________________________________________________________________
sub(c("\\d{1}(\\d{2})"), "\\1\\1", c("my 112 number is 110", "call 911"))
#> [1] "my 1212 number is 110" "call 1111"

Replace/substitute all matching substrings within a string.

stringr::str_replace_all(c("my 112 number is 110", "call 911"),
                         c("\\d{1}(\\d{2})"), "\\1\\1")
#> [1] "my 1212 number is 1010" "call 1111"
#   ____________________________________________________________________________
gsub(c("\\d{1}(\\d{2})"), "\\1\\1", c("my 112 number is 110", "call 911"))
#> [1] "my 1212 number is 1010" "call 1111"

Split strings by any pattern into different (optionally n) pieces.

# returns a matrix and pads with empty strings when n is greater than the
# number of pieces
stringr::str_split_fixed(c("lalala lala la",
                           "blabla lala la"),
                         c(" ", "b"), 5)
#>      [,1]     [,2]   [,3]         [,4] [,5]
#> [1,] "lalala" "lala" "la"         ""   ""  
#> [2,] ""       "la"   "la lala la" ""   ""
# returns a list
stringr::str_split(c("lalala lala la",
                     "blabla lala la"),
                   c(" ", "b"), 5)
#> [[1]]
#> [1] "lalala" "lala"   "la"    
#> 
#> [[2]]
#> [1] ""           "la"         "la lala la"
#   ____________________________________________________________________________
# returns a list, but does not have an n argument (however, it shouldn't be
# too hard to create one).
strsplit(c("lalala lala la",
           "blabla lala la"),
         c(" ", "b"))
#> [[1]]
#> [1] "lalala" "lala"   "la"    
#> 
#> [[2]]
#> [1] ""           "la"         "la lala la"

Set all characters to lower/upper case.

stringr::str_to_lower("aBcDe")
#> [1] "abcde"
stringr::str_to_upper("aBcDe")
#> [1] "ABCDE"
#   ____________________________________________________________________________
tolower("aBcDe")
#> [1] "abcde"
toupper("aBcDe")
#> [1] "ABCDE"

Truncate a string.

stringr::str_trunc("123456789", width = 7, side = "right", ellipsis = "")
#> [1] "1234567"
#   ____________________________________________________________________________

5.2.1 Regular expressions

Where can I learn about regular expressions?

?stringr::modifiers for special regular expression functionality of the stringr package.
reference sheet
interactively test
Build a regular expression
Try the str_view() function to see which substring a regex matches

```r
stringr::str_view(c("abc", "a.c_blabla", "bef"), "a\\.c")
```


  abc
  a.c_blabla
  bef

How to test regular expressions?

When writing regular expressions, I strongly recommend generating a list of positive (pattern should match) and negative (pattern shouldn’t match) test cases to ensure that you are matching the correct components.

5.2.2 Modifiers

Count the number of words within a string.

words <- c("These are some words. 
           Some more words.")
stringr::str_count(words, stringr::boundary("word"))
#> [1] 7
#   ____________________________________________________________________________

Count the number of lines within a string.

# every newline starts a new line, so lines = newlines + 1
# (subtracting the word count from the `boundary("line_break")` count does not
# work in general: those boundaries mark possible wrap positions, not line
# ends)
stringr::str_count(words, stringr::fixed("\n")) + 1
#> [1] 2
#   ____________________________________________________________________________

Count the number of sentences within a string.

stringr::str_count(words, stringr::boundary("sentence"))
#> [1] 2
#   ____________________________________________________________________________

Split a string by characters.

stringr::str_split(words, stringr::boundary("character"))
#> [[1]]
#>  [1] "T"  "h"  "e"  "s"  "e"  " "  "a"  "r"  "e"  " "  "s"  "o"  "m"  "e" 
#> [15] " "  "w"  "o"  "r"  "d"  "s"  "."  " "  "\n" " "  " "  " "  " "  " " 
#> [29] " "  " "  " "  " "  " "  " "  "S"  "o"  "m"  "e"  " "  "m"  "o"  "r" 
#> [43] "e"  " "  "w"  "o"  "r"  "d"  "s"  "."
#   ____________________________________________________________________________
strsplit(words, "")
#> [[1]]
#>  [1] "T"  "h"  "e"  "s"  "e"  " "  "a"  "r"  "e"  " "  "s"  "o"  "m"  "e" 
#> [15] " "  "w"  "o"  "r"  "d"  "s"  "."  " "  "\n" " "  " "  " "  " "  " " 
#> [29] " "  " "  " "  " "  " "  " "  "S"  "o"  "m"  "e"  " "  "m"  "o"  "r" 
#> [43] "e"  " "  "w"  "o"  "r"  "d"  "s"  "."

5.3 List output

How to work with string functions that return lists?

5.4 Resources

stringr vignette
[Regular Expressions in R - Cheatsheet](https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf)