Chapter 11 Text Analysis

Suppose we have a data set called textdata with a column called articleID, which contains an ID labeling each article, and a column called text, which contains the text of the article. The examples below assume the dplyr and tidytext packages are loaded.

11.1 Tokenizing

# Split the `text` column into tokens, one row per token, storing each
# token in a new `word` column.
tidy_textdata <- unnest_tokens(tbl = textdata, output = word, input = text)

11.2 Remove Stop Words

# Tokenize the text, then drop every token that appears in `stop_words`.
# The join key is given explicitly so the join does not depend on which
# columns the two tables happen to share (and does not print a
# "Joining with `by = ...`" message).
tidy_textdata <- textdata |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")

11.3 Remove Numbers

# Tokenize the text, then keep only the tokens that contain no digit
# anywhere in them (a token such as "a1" is dropped as well as "2024").
tidy_textdata <- textdata |>
  unnest_tokens(output = word, input = text) |>
  filter(!grepl(pattern = "[0-9]", x = word))

11.4 Remove Punctuation

# Strip punctuation from the article text before tokenizing it.
# NOTE(review): removePunctuation() is not defined in this chapter; it is
# presumably tm::removePunctuation(), so the tm package must be loaded.
# The cleaning is applied to `text` (the column that is tokenized next),
# and the pipeline ends on the last step rather than on a dangling pipe.
tidy_textdata <- textdata |>
  mutate(text = removePunctuation(text)) |>
  unnest_tokens(word, text)

11.5 Count the Number of Words in Each Article

# Tokenize the text, then count the tokens belonging to each article.
# The result has one row per `articleID` with the token count in `n`.
# Passing the column to count() avoids a separate group_by() and returns
# an ungrouped table, so later steps are not silently computed per group.
tidy_textdata <- textdata |>
  unnest_tokens(word, text) |>
  count(articleID)

11.6 Choose Only Certain Rows

# Keep only the rows whose article ID is "x".
# Reads from `textdata`, the data set used throughout this chapter.
xOnly <- textdata |>
  filter(articleID == "x")

11.7 Make a Word Cloud

# Load the library 
library(wordcloud)

# Make the plot
# Tokenize the text, count how often each word occurs, and draw a word
# cloud limited to 10 words. count() must be given `word`: with no
# columns it collapses the data to a single total and drops the `word`
# column that wordcloud() needs.
textdata |>
  unnest_tokens(word, text) |>
  count(word) |>
  with(wordcloud(word, n, max.words = 10))