15.1 fuzzyjoin

http://varianceexplained.org/fuzzyjoin/reference/index.html

15.1.1 inexact matching

library(fuzzyjoin)
library(janeaustenr)

# Bucket the lines of the novel into numbered passages and glue every bucket
# into a single space-separated string.
# NOTE(review): row_number() starts at 1, so the integer division sends only
# lines 1-49 to passage 1; each later passage holds up to 50 lines.
passages <- prideprejudice %>%
  tibble(text = .) %>%
  group_by(passage = row_number() %/% 50 + 1) %>%
  summarise(text = str_c(text, collapse = " "))

passages
#> # A tibble: 261 x 2
#>   passage text                                                                  
#>     <dbl> <chr>                                                                 
#> 1       1 "PRIDE AND PREJUDICE  By Jane Austen    Chapter 1   It is a truth uni~
#> 2       2 "\"How so? How can it affect them?\"  \"My dear Mr. Bennet,\" replied~
#> 3       3 "are my old friends. I have heard you mention them with consideration~
#> 4       4 "herself, began scolding one of her daughters.  \"Don't keep coughing~
#> 5       5 " The astonishment of the ladies was just what he wished; that of Mrs~
#> 6       6 "married, I shall have nothing to wish for.\"  In a few days Mr. Bing~
#> # ... with 255 more rows

# Lookup table parsed from an inline CSV: `character` is the display name and
# `character_regex` is the regular expression used to find that character in
# the text.
# NOTE(review): the "." in patterns such as "Mr. Bennet" is an unescaped regex
# wildcard, so it matches any single character, not only a literal period.
characters <- readr::read_csv(
  file = "character,character_regex
Elizabeth,Elizabeth
Darcy,Darcy
Mr. Bennet,Mr. Bennet
Mrs. Bennet,Mrs. Bennet
Jane,Jane
Mary,Mary
Lydia,Lydia
Kitty,Kitty
Wickham,Wickham
Mr. Collins,Collins
Lady Catherine de Bourgh,de Bourgh
Mr. Gardiner,Mr. Gardiner
Mrs. Gardiner,Mrs. Gardiner
Charlotte Lucas,(Charlotte|Lucas)
"
)

characters
#> # A tibble: 14 x 2
#>   character   character_regex
#>   <chr>       <chr>          
#> 1 Elizabeth   Elizabeth      
#> 2 Darcy       Darcy          
#> 3 Mr. Bennet  Mr. Bennet     
#> 4 Mrs. Bennet Mrs. Bennet    
#> 5 Jane        Jane           
#> 6 Mary        Mary           
#> # ... with 8 more rows

Which character appears in the most passages? (The dataset containing the text column must always come first in the join.)

# Regex join: pair each passage with every character whose pattern matches the
# passage text. `passages` (the table holding the text column) is the left-hand
# table and `characters` supplies the patterns.
character_passages <- regex_inner_join(
  passages,
  characters,
  by = c(text = "character_regex")
)

# Rows per character in the joined table, most frequent first.
count(character_passages, character, sort = TRUE)
#> # A tibble: 14 x 2
#>   character       n
#>   <chr>       <int>
#> 1 Elizabeth     227
#> 2 Darcy         159
#> 3 Jane          134
#> 4 Mrs. Bennet    89
#> 5 Wickham        89
#> 6 Lydia          79
#> # ... with 8 more rows

# character_passages %>% 
#   select(-character_regex) %>% 
#   pivot_wider(names_from = "character", values_from = "text") %>% 
#   mutate_all(str_length) %>% 
#   mutate_all(~ replace_na(.x, 0))

15.1.2 stringdist

library(fuzzyjoin)

# Print the misspellings table (columns `misspelling` and `correct`).
# NOTE(review): not defined in this file; presumably the dataset exported by
# fuzzyjoin, which is loaded just above -- confirm.
misspellings
#> # A tibble: 4,505 x 2
#>   misspelling correct   
#>   <chr>       <chr>     
#> 1 abandonned  abandoned 
#> 2 aberation   aberration
#> 3 abilties    abilities 
#> 4 abilty      ability   
#> 5 abondon     abandon   
#> 6 abbout      about     
#> # ... with 4,499 more rows

library(qdapDictionaries)
# Convert DICTIONARY to a tibble so it can serve as the right-hand table of the
# fuzzy join below.
# NOTE(review): DICTIONARY is not defined in this file; presumably it comes
# from qdapDictionaries -- confirm.
words <- DICTIONARY %>%
  as_tibble()

words
#> # A tibble: 20,137 x 2
#>   word  syllables
#>   <chr>     <dbl>
#> 1 hm            1
#> 2 hmm           1
#> 3 hmmm          1
#> 4 hmph          1
#> 5 mmhmm         2
#> 6 mmhm          2
#> # ... with 20,131 more rows

# Fuzzy-match a random sample of 1,000 misspellings against the dictionary
# words, keeping every pair whose string distance is at most 1 (`max_dist`).
# slice_sample() replaces the superseded sample_n().
# No seed is set, so the sampled rows -- and therefore the number of matches
# printed below -- differ from run to run.
misspellings %>%
  slice_sample(n = 1000) %>%
  stringdist_inner_join(words, by = c("misspelling" = "word"), max_dist = 1)
#> # A tibble: 764 x 4
#>   misspelling correct    word       syllables
#>   <chr>       <chr>      <chr>          <dbl>
#> 1 exibition   exhibition exhibition         4
#> 2 eminate     emanate    emanate            3
#> 3 eminate     emanate    geminate           3
#> 4 seperatist  separatist separatist         4
#> 5 thyat       that       that               1
#> 6 visable     visible    disable            3
#> # ... with 758 more rows