1.4 Other tokenization methods

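Besides its default word tokenizer, unnest_tokens() supports other ways to split a column into tokens, selected with the token argument: lines, sentences, paragraphs, characters, character shingles, or a custom regex.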

# Toy "book" used by every example below: a character vector of text
# fragments. Several fragments end in "\n" and one ends in "\n\n", which
# gives the line and paragraph tokenizers something to split on.
text <- c(
  "This is, my bookdown book.",
  "Chapter 1: Preface\n",
  "Thanks for \n reading this book\n",
  "Chapter 2: Introduction\n",
  "Chapter 3: Methods\n",
  "I demonstrate all of the methods here,",
  "well, not all actually.\n\n",
  "Chapter 4: Discussion\n",
  "blablabla,",
  "blablabla,",
  "blablabla."
)
# Wrap the fragments in a one-column tibble (column `text`), one row per
# fragment, so they can be fed to unnest_tokens().
df <- tibble(text = text)
# Show the raw text; cat() joins the elements with a single space.
cat(df[["text"]], sep = " ")
#> This is, my bookdown book. Chapter 1: Preface
#>  Thanks for 
#>  reading this book
#>  Chapter 2: Introduction
#>  Chapter 3: Methods
#>  I demonstrate all of the methods here, well, not all actually.
#> 
#>  Chapter 4: Discussion
#>  blablabla, blablabla, blablabla.

# One token per line: the text is split at newline characters.
unnest_tokens(df, output = line, input = text, token = "lines")
#> # A tibble: 12 x 1
#>   line                        
#>   <chr>                       
#> 1 "this is, my bookdown book."
#> 2 "chapter 1: preface"        
#> 3 "thanks for "               
#> 4 " reading this book"        
#> 5 "chapter 2: introduction"   
#> 6 "chapter 3: methods"        
#> # ... with 6 more rows
# One token per sentence: breaks fall at sentence-ending punctuation (here,
# the periods), not at newlines.
unnest_tokens(df, output = sentences, input = text, token = "sentences")
#> # A tibble: 3 x 1
#>   sentences                                                                     
#>   <chr>                                                                         
#> 1 this is, my bookdown book.                                                    
#> 2 chapter 1: preface  thanks for   reading this book  chapter 2: introduction  ~
#> 3 chapter 4: discussion  blablabla, blablabla, blablabla.
# One token per paragraph.
unnest_tokens(df, output = paragraphs, input = text, token = "paragraphs")
#> # A tibble: 7 x 1
#>   paragraphs                                                      
#>   <chr>                                                           
#> 1 "this is, my bookdown book. chapter 1: preface"                 
#> 2 "thanks for   reading this book"                                
#> 3 "chapter 2: introduction"                                       
#> 4 "chapter 3: methods"                                            
#> 5 "i demonstrate all of the methods here, well, not all actually."
#> 6 " chapter 4: discussion"                                        
#> # ... with 1 more row

# Split into single characters, or (next example) into runs of several
# characters. Here: one token per character.
unnest_tokens(df, output = character, input = text, token = "characters")
#> # A tibble: 188 x 1
#>   character
#>   <chr>    
#> 1 t        
#> 2 h        
#> 3 i        
#> 4 s        
#> 5 i        
#> 6 s        
#> # ... with 182 more rows
# Character shingles: overlapping runs of `n` consecutive characters
# (n = 4 gives "this", "hisi", "isis", ...).
unnest_tokens(
  df,
  output = characters,
  input = text,
  token = "character_shingles",
  n = 4
)
#> # A tibble: 185 x 1
#>   characters
#>   <chr>     
#> 1 this      
#> 2 hisi      
#> 3 isis      
#> 4 sism      
#> 5 ismy      
#> 6 smyb      
#> # ... with 179 more rows

# Split by regex: every match of `pattern` acts as a delimiter, so each token
# is the text between two chapter headings.
# `\\d+` (rather than a single `\\d`) also matches chapter numbers with more
# than one digit, e.g. "Chapter 10:"; for this data the result is unchanged.
# The capitalised pattern still matches even though the printed tokens are
# lower-case, so the lower-casing evidently happens after the split.
df %>%
  unnest_tokens(chapter, text, token = "regex", pattern = "Chapter \\d+:")
#> # A tibble: 5 x 1
#>   chapter                                                                       
#>   <chr>                                                                         
#> 1 "this is, my bookdown book.\n"                                                
#> 2 " preface\n\nthanks for \n reading this book\n\n"                             
#> 3 " introduction\n\n"                                                           
#> 4 " methods\n\ni demonstrate all of the methods here,\nwell, not all actually.\~
#> 5 " discussion\n\nblablabla,\nblablabla,\nblablabla."