15.1 fuzzyjoin
http://varianceexplained.org/fuzzyjoin/reference/index.html
15.1.1 inexact matching
# Collapse the lines of the novel into passages of about 50 lines each.
# row_number() starts at 1, so `1 + row_number() %/% 50` assigns rows 1-49
# to passage 1 (49 rows); each later passage covers up to 50 rows.
passages <- tibble(text = prideprejudice) %>%
  mutate(passage = 1 + row_number() %/% 50) %>%
  group_by(passage) %>%
  # Join the lines of a passage into a single space-separated string.
  summarise(text = str_c(text, collapse = " "))
passages
#> # A tibble: 261 x 2
#> passage text
#> <dbl> <chr>
#> 1 1 "PRIDE AND PREJUDICE By Jane Austen Chapter 1 It is a truth uni~
#> 2 2 "\"How so? How can it affect them?\" \"My dear Mr. Bennet,\" replied~
#> 3 3 "are my old friends. I have heard you mention them with consideration~
#> 4 4 "herself, began scolding one of her daughters. \"Don't keep coughing~
#> 5 5 " The astonishment of the ladies was just what he wished; that of Mrs~
#> 6 6 "married, I shall have nothing to wish for.\" In a few days Mr. Bing~
#> # ... with 255 more rows
# Lookup table of characters, parsed from inline CSV text.
#   character       - display name of the character
#   character_regex - regular expression used to find that character in text
# The patterns are regexes, so "(Charlotte|Lucas)" matches either name and an
# unescaped "." (as in "Mr. Bennet") matches any single character.
characters <- readr::read_csv(file = "character,character_regex
Elizabeth,Elizabeth
Darcy,Darcy
Mr. Bennet,Mr. Bennet
Mrs. Bennet,Mrs. Bennet
Jane,Jane
Mary,Mary
Lydia,Lydia
Kitty,Kitty
Wickham,Wickham
Mr. Collins,Collins
Lady Catherine de Bourgh,de Bourgh
Mr. Gardiner,Mr. Gardiner
Mrs. Gardiner,Mrs. Gardiner
Charlotte Lucas,(Charlotte|Lucas)
")
characters
#> # A tibble: 14 x 2
#> character character_regex
#> <chr> <chr>
#> 1 Elizabeth Elizabeth
#> 2 Darcy Darcy
#> 3 Mr. Bennet Mr. Bennet
#> 4 Mrs. Bennet Mrs. Bennet
#> 5 Jane Jane
#> 6 Mary Mary
#> # ... with 8 more rows
Which character appears in the most passages? (The dataset that contains the
text column must always come first in the join.)
# Pair every passage with every character whose regex matches the passage
# text. `passages` (the table holding the text) is the first argument and
# `characters` (the table holding the patterns) is the second.
character_passages <- regex_inner_join(
  passages,
  characters,
  by = c("text" = "character_regex")
)
# Number of matched passages per character, most frequent first.
count(character_passages, character, sort = TRUE)
#> # A tibble: 14 x 2
#> character n
#> <chr> <int>
#> 1 Elizabeth 227
#> 2 Darcy 159
#> 3 Jane 134
#> 4 Mrs. Bennet 89
#> 5 Wickham 89
#> 6 Lydia 79
#> # ... with 8 more rows
15.1.2 stringdist
# Preview the misspellings table: one row per (misspelling, correct) pair.
# NOTE(review): presumably the dataset bundled with fuzzyjoin - confirm it is
# loaded before this point.
misspellings
#> # A tibble: 4,505 x 2
#> misspelling correct
#> <chr> <chr>
#> 1 abandonned abandoned
#> 2 aberation aberration
#> 3 abilties abilities
#> 4 abilty ability
#> 5 abondon abandon
#> 6 abbout about
#> # ... with 4,499 more rows
# qdapDictionaries supplies DICTIONARY; convert it to a tibble so it can be
# used as the reference word list in the join below.
library(qdapDictionaries)
words <- DICTIONARY %>%
  as_tibble()
words
#> # A tibble: 20,137 x 2
#> word syllables
#> <chr> <dbl>
#> 1 hm 1
#> 2 hmm 1
#> 3 hmmm 1
#> 4 hmph 1
#> 5 mmhmm 2
#> 6 mmhm 2
#> # ... with 20,131 more rows
# Match a random sample of 1000 misspellings against the dictionary, keeping
# pairs whose string distance is within max_dist = 1. A misspelling can match
# several dictionary words, and some match none.
# No seed is set in this snippet, so the sample (and the row count) will vary
# between runs.
stringdist_inner_join(
  sample_n(misspellings, 1000),
  words,
  by = c("misspelling" = "word"),
  max_dist = 1
)
#> # A tibble: 764 x 4
#> misspelling correct word syllables
#> <chr> <chr> <chr> <dbl>
#> 1 exibition exhibition exhibition 4
#> 2 eminate emanate emanate 3
#> 3 eminate emanate geminate 3
#> 4 seperatist separatist separatist 4
#> 5 thyat that that 1
#> 6 visable visible disable 3
#> # ... with 758 more rows