1.1 The unnest_tokens() function
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidytext)

austen_books()
#> # A tibble: 73,422 x 2
#>   text                    book
#> * <chr>                   <fct>
#> 1 "SENSE AND SENSIBILITY" Sense & Sensibility
#> 2 ""                      Sense & Sensibility
#> 3 "by Jane Austen"        Sense & Sensibility
#> 4 ""                      Sense & Sensibility
#> 5 "(1811)"                Sense & Sensibility
#> 6 ""                      Sense & Sensibility
#> # ... with 7.342e+04 more rows
Annotate each line with its position within its book and a running chapter count, using a regular expression to detect headings such as "Chapter 1" or "CHAPTER I":

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup()
original_books
#> # A tibble: 73,422 x 4
#>   text                    book                linenumber chapter
#>   <chr>                   <fct>                    <int>   <int>
#> 1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
#> 2 ""                      Sense & Sensibility          2       0
#> 3 "by Jane Austen"        Sense & Sensibility          3       0
#> 4 ""                      Sense & Sensibility          4       0
#> 5 "(1811)"                Sense & Sensibility          5       0
#> 6 ""                      Sense & Sensibility          6       0
#> # ... with 7.342e+04 more rows
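Because linenumber is computed inside group_by(book), the numbering restarts at 1 for each novel; a minimal sanity-check sketch, using only dplyr, counts the lines per book:

original_books %>%
  count(book)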
Restructure the annotated text into the one-token-per-row format with unnest_tokens():

original_books %>%
  unnest_tokens(word, text)
#> # A tibble: 725,055 x 4
#>   book                linenumber chapter word
#>   <fct>                    <int>   <int> <chr>
#> 1 Sense & Sensibility          1       0 sense
#> 2 Sense & Sensibility          1       0 and
#> 3 Sense & Sensibility          1       0 sensibility
#> 4 Sense & Sensibility          3       0 by
#> 5 Sense & Sensibility          3       0 jane
#> 6 Sense & Sensibility          3       0 austen
#> # ... with 7.25e+05 more rows
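By default unnest_tokens() lowercases the tokens and strips punctuation while keeping the other columns; a minimal sketch, if the original case were wanted instead, would pass to_lower = FALSE:

original_books %>%
  unnest_tokens(word, text, to_lower = FALSE)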
The stop_words dataset in tidytext collects English stop words from the onix, SMART, and snowball lexicons:

stop_words
#> # A tibble: 1,149 x 2
#>   word      lexicon
#>   <chr>     <chr>
#> 1 a         SMART
#> 2 a's       SMART
#> 3 able      SMART
#> 4 about     SMART
#> 5 above     SMART
#> 6 according SMART
#> # ... with 1,143 more rows
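The lexicon column makes it possible to restrict removal to a single stop word list; a minimal sketch, assuming only the snowball lexicon is wanted:

stop_words %>%
  filter(lexicon == "snowball")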
Remove stop words with an anti_join():

tidy_books <- original_books %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
tidy_books
#> # A tibble: 217,609 x 4
#>   book                linenumber chapter word
#>   <fct>                    <int>   <int> <chr>
#> 1 Sense & Sensibility          1       0 sense
#> 2 Sense & Sensibility          1       0 sensibility
#> 3 Sense & Sensibility          3       0 jane
#> 4 Sense & Sensibility          3       0 austen
#> 5 Sense & Sensibility          5       0 1811
#> 6 Sense & Sensibility         10       1 chapter
#> # ... with 2.176e+05 more rows
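With one token per row, ordinary dplyr verbs now apply directly to the text; for example, the most common words across the novels can be tallied with count():

tidy_books %>%
  count(word, sort = TRUE)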