Text Analysis
Suppose we have a data set called textdata
with a column called articleID,
which contains an ID labeling each article, and a column called text,
which contains the text of the article.
Tokenizing
# Split the text column into individual word tokens, one row per word.
tidy_textdata <- unnest_tokens(textdata, output = word, input = text)
Remove Stop Words
# Tokenize the text, then drop every token that appears in stop_words.
# The join key is named explicitly so the join always matches on `word`
# only, instead of on whatever column names the two tables happen to
# share (which also avoids the "Joining with ..." message).
tidy_textdata <- textdata |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")
Remove Numbers
# Tokenize the text, then keep only tokens that contain no digit 0-9.
tidy_textdata <- textdata |>
  unnest_tokens(output = word, input = text) |>
  filter(!grepl(pattern = "[0-9]", x = word))
Remove Punctuation
# Strip punctuation from the article text, then tokenize it.
# The cleaned column must be `text` (the column that is tokenized below);
# the pipeline also ends at unnest_tokens() with no trailing pipe.
# NOTE(review): removePunctuation() is presumably tm::removePunctuation --
# confirm the tm package is loaded.
tidy_textdata <- textdata |>
  mutate(text = removePunctuation(text)) |>
  unnest_tokens(word, text)
Count the Number of Words in Each Article
# Tokenize the text, then count the word tokens per article.
# count(articleID) yields one row per articleID with the total in `n`
# and returns an ungrouped result, so later verbs are not silently
# applied per group.
tidy_textdata <- textdata |>
  unnest_tokens(word, text) |>
  count(articleID)
Choose Only Certain Rows
# Keep only the rows whose articleID equals "x".
# Reads from `textdata`, the data set used by every other snippet.
xOnly <- textdata |>
  filter(articleID == "x")
Make a Word Cloud
# Load the library
library(wordcloud)
# Make the plot
# Tokenize the text, tally each distinct word, and plot a word cloud
# limited to 10 words (max.words = 10).
# count(word) is required: it produces the `word` and `n` columns that
# wordcloud() reads via with(); a bare count() would collapse the data
# to a single total row with no `word` column.
textdata |>
  unnest_tokens(word, text) |>
  count(word) |>
  with(wordcloud(word, n, max.words = 10))