1.4 Other tokenization methods
`unnest_tokens()` supports other ways to split a column into tokens, selected through its `token` argument.
# Example corpus: a character vector mixing chapter headings and body text.
# Some elements carry embedded "\n" line breaks, others do not.
text <- c(
  "This is, my bookdown book.",
  "Chapter 1: Preface\n",
  "Thanks for \n reading this book\n",
  "Chapter 2: Introduction\n",
  "Chapter 3: Methods\n",
  "I demonstrate all of the methods here,",
  "well, not all actually.\n\n",
  "Chapter 4: Discussion\n",
  "blablabla,",
  "blablabla,",
  "blablabla."
)

# Wrap the vector in a one-column tibble so it can be passed to unnest_tokens().
df <- tibble(text = text)

# Show the raw text: cat() separates elements with a space and renders the
# embedded "\n" as actual line breaks.
cat(df$text)
#> This is, my bookdown book. Chapter 1: Preface
#> Thanks for
#> reading this book
#> Chapter 2: Introduction
#> Chapter 3: Methods
#> I demonstrate all of the methods here, well, not all actually.
#>
#> Chapter 4: Discussion
#> blablabla, blablabla, blablabla.
# lines: one token per line of text (split at newline characters)
df %>%
  unnest_tokens(output = line, input = text, token = "lines")
#> # A tibble: 12 x 1
#> line
#> <chr>
#> 1 "this is, my bookdown book."
#> 2 "chapter 1: preface"
#> 3 "thanks for "
#> 4 " reading this book"
#> 5 "chapter 2: introduction"
#> 6 "chapter 3: methods"
#> # ... with 6 more rows
# sentences: one token per sentence (split by period)
df %>%
  unnest_tokens(output = sentences, input = text, token = "sentences")
#> # A tibble: 3 x 1
#> sentences
#> <chr>
#> 1 this is, my bookdown book.
#> 2 chapter 1: preface thanks for reading this book chapter 2: introduction ~
#> 3 chapter 4: discussion blablabla, blablabla, blablabla.
# paragraphs: one token per paragraph
df %>%
  unnest_tokens(output = paragraphs, input = text, token = "paragraphs")
#> # A tibble: 7 x 1
#> paragraphs
#> <chr>
#> 1 "this is, my bookdown book. chapter 1: preface"
#> 2 "thanks for reading this book"
#> 3 "chapter 2: introduction"
#> 4 "chapter 3: methods"
#> 5 "i demonstrate all of the methods here, well, not all actually."
#> 6 " chapter 4: discussion"
#> # ... with 1 more row
# characters: one token per single character
df %>%
  unnest_tokens(output = character, input = text, token = "characters")
#> # A tibble: 188 x 1
#> character
#> <chr>
#> 1 t
#> 2 h
#> 3 i
#> 4 s
#> 5 i
#> 6 s
#> # ... with 182 more rows
# character shingles: overlapping runs of n = 4 consecutive characters
df %>%
  unnest_tokens(
    output = characters,
    input = text,
    token = "character_shingles",
    n = 4
  )
#> # A tibble: 185 x 1
#> characters
#> <chr>
#> 1 this
#> 2 hisi
#> 3 isis
#> 4 sism
#> 5 ismy
#> 6 smyb
#> # ... with 179 more rows
# regex: every match of `pattern` acts as a delimiter between tokens
df %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter \\d:"
  )
#> # A tibble: 5 x 1
#> chapter
#> <chr>
#> 1 "this is, my bookdown book.\n"
#> 2 " preface\n\nthanks for \n reading this book\n\n"
#> 3 " introduction\n\n"
#> 4 " methods\n\ni demonstrate all of the methods here,\nwell, not all actually.\~
#> 5 " discussion\n\nblablabla,\nblablabla,\nblablabla."