Text Analysis
Suppose we have a data set called textdata with a column called articleID,
which contains an ID labeling each article, and a column called text,
which contains the text of the article.
Tokenizing
# Split the `text` column into tokens, stored in a new `word` column
# (one output row per token).
tidy_textdata <- unnest_tokens(textdata, output = word, input = text)
Remove Stop Words
# Tokenize the text, then drop every token that appears in the `stop_words`
# lexicon. The join key is given explicitly so the join does not depend on
# whichever column names the two tables happen to share, and so that no
# "Joining with ..." message is printed.
tidy_textdata <- textdata |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")
Remove Numbers
# Tokenize the text and keep only the tokens that contain no digit.
tidy_textdata <- textdata |>
  unnest_tokens(output = word, input = text) |>
  filter(!grepl(pattern = "[0-9]", x = word))
Remove punctuation
# Strip punctuation from the article text before tokenizing it.
# NOTE(review): removePunctuation() is presumably tm::removePunctuation();
# confirm that the tm package is loaded before running this snippet.
tidy_textdata <- textdata |>
  mutate(text = removePunctuation(text)) |>
  unnest_tokens(word, text)
Count the Number of Words in each article
# Count the tokens in each article: the result has one row per articleID
# with the token count in column `n`. count(articleID) groups, tallies and
# ungroups in one step, so the result does not carry a leftover grouping.
tidy_textdata <- textdata |>
  unnest_tokens(word, text) |>
  count(articleID)
Choosing only certain rows
# Keep only the rows that belong to the article whose ID is "x".
xOnly <- textdata |>
  filter(articleID == "x")
Make a word cloud
# Load the library
library (wordcloud)
# Make the plot
# Tokenize, count how often each word occurs (column `n`), and draw a cloud
# of at most 10 words. count(word) keeps the `word` column that wordcloud()
# needs; a bare count() would collapse the data to a single total row.
textdata |>
  unnest_tokens(word, text) |>
  count(word) |>
  with(wordcloud(word, n, max.words = 10))