18.2 Regular Expressions

The rebus package is a good resource for building regular expressions.

library(rebus)
## Warning: package 'rebus' was built under R version 4.0.2
## 
## Attaching package: 'rebus'
## The following objects are masked from 'package:qdapRegex':
## 
##     %|%, group, is.regex
## The following objects are masked from 'package:igraph':
## 
##     %c%, graph
## The following object is masked from 'package:scales':
## 
##     alpha
## The following object is masked from 'package:stringr':
## 
##     regex
## The following object is masked from 'package:ggplot2':
## 
##     alpha

The %R% operator concatenates regular expressions. START represents the regex “^”, meaning “starts with”. END represents the regex “$”, meaning “ends with”.

# First 100 lines of text from "Sense & Sensibility", as a character vector.
x <- austen_books() %>%
  filter(book == "Sense & Sensibility") %>%
  pull(text) %>%
  head(100)

Here are lines from Sense & Sensibility that start with “Mr”.

# Keep only the lines whose first two characters are "Mr".
x %>%
  str_subset(pattern = START %R% "Mr")
## [1] "Mrs. Henry Dashwood to his wishes, which proceeded not merely from"     
## [2] "Mr. Dashwood's disappointment was, at first, severe; but his temper was"
## [3] "Mr. John Dashwood had not the strong feelings of the rest of the"

ANY_CHAR represents the regex “.”. Here are lines from Sense & Sensibility that contain “handsome” with at least one character before and after it.

# Keep the lines where "handsome" has a character on each side of it.
x %>%
  str_subset(pattern = ANY_CHAR %R% "handsome" %R% ANY_CHAR)
## [1] "them three thousand pounds: it would be liberal and handsome! It would"

char_class() is similar to ANY_CHAR except that it matches only the characters given in its string argument. It is the same as the regex “[]”. The opposite is negated_char_class(), which is the same as “[^]”. char_class() can accept ranges, such as “0-9” and “a-z”. DGT matches a single digit, much like “[0-9]”.

# Keep the words that contain an upper- or lower-case "a".
c("apple", "Aardvark", "Orukidn") %>%
  str_subset(pattern = char_class("Aa"))
## [1] "apple"    "Aardvark"

DOT, CARET, and DOLLAR represent the literal special characters “.”, “^”, and “$”. The function or() provides alternation.

# Find the first occurrence of either "dog" or "cat" in each string.
c("kittycat", "doggone") %>%
  str_match(pattern = or("dog", "cat"))
##      [,1] 
## [1,] "cat"
## [2,] "dog"

Look for repeating patterns with optional(), zero_or_more(), one_or_more(), and repeated().

Wrap a rebus expression in capture() to create a column in the output for the match to each captured part of the regex.

phone_string <- c(
  "555-123-4567",
  "(555)123-4567",
  "555.123.4567",
  "555-123-4567 (M), 555-123-7654 (H)"
)

# Punctuation allowed between the digit groups. The hyphen is listed last so
# that it is a literal "-"; written as "()-." it forms the range ")" to ".",
# which would also accept "*", "+" and ",".
phone_sep <- zero_or_more(char_class("().-"))

# Three captured groups: 3 digits, 3 digits, 4 digits.
phone_pattern <-
  capture(DGT %R% DGT %R% DGT) %R% phone_sep %R%
  capture(DGT %R% DGT %R% DGT) %R% phone_sep %R%
  capture(DGT %R% DGT %R% DGT %R% DGT)

# first match with str_match()
# (column 1 is the whole match; columns 2-4 are the captured groups)
phone_match <- str_match(phone_string, pattern = phone_pattern)
str_c("(", phone_match[, 2], ")", phone_match[, 3], "-", phone_match[, 4])
## [1] "(555)123-4567" "(555)123-4567" "(555)123-4567" "(555)123-4567"
# all matches with str_match_all() and lapply()
phone_match_all <- str_match_all(phone_string, pattern = phone_pattern)
lapply(phone_match_all, function(m) {
  str_c("(", m[, 2], ")", m[, 3], "-", m[, 4])
}) %>%
  unlist()
## [1] "(555)123-4567" "(555)123-4567" "(555)123-4567" "(555)123-4567"
## [5] "(555)123-7654"

You can refer back to captured groups with the backreferences REF1 through REF9.

# Match a captured LOWER character immediately followed by the same
# character again (REF1 refers back to the first captured group).
c("hello", "sweet", "kitten") %>%
  str_match(pattern = capture(LOWER) %R% REF1)
##      [,1] [,2]
## [1,] "ll" "l" 
## [2,] "ee" "e" 
## [3,] "tt" "t"

Here is an exercise working with the Oscar Wilde play “The Importance of Being Earnest”.

# Read the text of the play from the DataCamp-hosted file.
earnest_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_2922/datasets/importance-of-being-earnest.txt"
earnest <- read_lines(earnest_url)

The text is between the lines with “START OF THE PROJECT” and “END OF THE PROJECT”. str_which() returns the indices where the string contains the pattern. The text consists of an introduction and the play itself. The play starts at “FIRST ACT”.

# Find the marker lines that bracket the text and keep only what lies
# strictly between them.
# NOTE(review): assumes each marker occurs exactly once in `earnest` --
# confirm against the file.
start <- str_which(earnest, fixed("START OF THE PROJECT"))
end <- str_which(earnest, fixed("END OF THE PROJECT"))
earnest_sub <- earnest[(start + 1):(end - 1)]

# The play begins at the first line containing "FIRST ACT"; everything
# before that line is the introduction.
play_start <- str_which(earnest_sub, "FIRST ACT")[1]

# seq_len() gives an empty index when the play starts on line 1, whereas
# 1:(play_start - 1) would give c(1, 0) and wrongly select line 1.
intro_line_index <- seq_len(play_start - 1)
intro_text <- earnest_sub[intro_line_index]

# Select the play by position rather than by dropping the intro: negative
# indexing with an empty index would return nothing instead of everything.
play_text <- earnest_sub[play_start:length(earnest_sub)]

# Drop the empty lines and coerce the result to a character vector.
play_lines <- as.character(play_text[str_length(play_text) > 0])

# Show the first 20 of the remaining lines.
play_lines[seq_len(20)] %>%
  writeLines()
## FIRST ACT
## SCENE
## Morning-room in Algernon's flat in Half-Moon Street.  The room is
## luxuriously and artistically furnished.  The sound of a piano is heard in
## the adjoining room.
## [Lane is arranging afternoon tea on the table, and after the music has
## ceased, Algernon enters.]
## Algernon.  Did you hear what I was playing, Lane?
## Lane.  I didn't think it polite to listen, sir.
## Algernon.  I'm sorry for that, for your sake.  I don't play
## accurately--any one can play accurately--but I play with wonderful
## expression.  As far as the piano is concerned, sentiment is my forte.  I
## keep science for Life.
## Lane.  Yes, sir.
## Algernon.  And, speaking of the science of Life, have you got the
## cucumber sandwiches cut for Lady Bracknell?
## Lane.  Yes, sir.  [Hands them on a salver.]
## Algernon.  [Inspects them, takes two, and sits down on the sofa.]  Oh! . . .
## by the way, Lane, I see from your book that on Thursday night, when
## Lord Shoreman and Mr. Worthing were dining with me, eight bottles of

How would you identify lines where the character is starting to speak? You might look for a capitalized word followed by a “.”.

# Candidate speaker tag: at the start of a line, an upper-case ASCII letter,
# then one or more word characters, then a literal period.
pattern <- START %R%
  ascii_upper() %R%
  one_or_more(WRD) %R%
  DOT

# Lines that begin with such a tag
lines <- play_lines %>%
  str_subset(pattern)

# The tag itself, i.e. the presumed speaker
who <- str_extract(string = lines, pattern = pattern)

# Distinct tags that were found
unique(who)
##  [1] "Algernon."   "Lane."       "Jack."       "Cecily."     "Ernest."    
##  [6] "University." "Gwendolen."  "July."       "Chasuble."   "Merriman."  
## [11] "Sunday."     "Mr."         "London."     "Cardew."     "Opera."     
## [16] "Markby."     "Oxonian."

Close, but not perfect. If you know the characters, just search for them directly. or1() is like or() but lets you supply a vector of strings.

# Names to search for, exactly as they should appear at the start of a line.
characters <- c(
  "Algernon", "Jack", "Lane", "Cecily", "Gwendolen", "Chasuble",
  "Merriman", "Lady Bracknell", "Miss Prism"
)

# A line opens a speech when it begins with one of those names and a period.
pattern <- START %R%
  or1(characters) %R%
  DOT

lines <- play_lines %>%
  str_subset(pattern)

# The matched prefix, i.e. the character speaking
who <- str_extract(string = lines, pattern = pattern)

# Distinct speakers that were found
unique(who)
## [1] "Algernon."       "Lane."           "Jack."           "Cecily."        
## [5] "Gwendolen."      "Lady Bracknell." "Miss Prism."     "Chasuble."      
## [9] "Merriman."
# Number of matching lines per character
table(who)
## who
##       Algernon.         Cecily.       Chasuble.      Gwendolen.           Jack. 
##             201             154              42             102             219 
## Lady Bracknell.           Lane.       Merriman.     Miss Prism. 
##              84              21              17              41