第 11 章 主题词分析
这里要用到文本挖掘技术(Silge and Robinson 2017)
11.1 化学学科
%>% count(Category_ESI) %>% arrange(-n) complete_set
我们就看看四川省的化学学科吧
<- complete_set %>% filter(Category_ESI == "CHEMISTRY") chem_set
<- chem_set %>%
word_freq with(str_split(DE, ";")) %>%
unlist() %>%
tibble(keywords = .) %>%
filter(!is.na(keywords) ) %>%
filter(!keywords %in% c("") ) %>%
count(keywords = keywords %>% str_to_lower()) %>%
arrange(desc(n))
head(word_freq)
11.2 词云
library(wordcloud2)
wordcloud2(word_freq, size=1)
#wordcloud2(word_freq,
# size=1,
# figPath = "images/twitter.jpg")
然并卵。不知道大家为什么喜欢词云这个东西
11.3 共现矩阵
<- chem_set %>%
occur mutate(id = row_number()) %>%
select(id, DE) %>%
separate_rows(DE, sep = ";") %>%
filter(!is.na(DE) )
library(widyr) #devtools::install_github("dgrtwo/widyr")
<- occur %>%
paper_words_pair pairwise_count(DE, id, sort = TRUE)
head(paper_words_pair )
<- paper_words_pair %>% select(item1, item2, n) %>%
links filter(n >= 10) %>%
rename(from = item1, to = item2, weight = n)
links
library(igraph)
library(ggraph)
#the development version of ggraph requires the development version of ggplot2
%>%
links graph_from_data_frame(directed = T) %>%
ggraph(layout = 'kk') +
geom_edge_fan(aes(edge_alpha = weight, edge_width = weight)) +
geom_node_point(size = 1) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines"))
这个图需要加点颜色
unnest_tokens + stemming + stopword ?
11.4 全部学科
不过瘾吗? 那我们就看看全局
<- complete_set %>%
occurrence select(Category_ESI_cn, UT, DE) %>%
separate_rows(DE, sep = ";") %>%
mutate(DE = str_trim(DE)) %>%
filter(!is.na(DE) )
<- occurrence %>%
paper_words_pairs ::filter(!is.na(Category_ESI_cn)) %>%
dplyr::group_by(Category_ESI_cn) %>%
dplyr::pairwise_count(item = DE, feature = UT, sort = TRUE, upper = FALSE)
widyr
head(paper_words_pairs )
<- paper_words_pairs %>%
linkk select(item1, item2, n, Category_ESI_cn) %>%
filter(n >= 5) %>%
filter(Category_ESI_cn %in% c("物理学", "化学", "数学", "工程学", "计算机科学", "材料科学")) %>%
rename(from = item1, to = item2, weight = n)
linkk
library(igraph)
library(ggraph)
%>%
linkk graph_from_data_frame(directed = F) %>%
ggraph(layout = 'kk') +
geom_edge_fan(aes(edge_alpha = weight,
edge_width = weight,
edge_colour = Category_ESI_cn)) +
geom_node_point(size = 1) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
#theme_void() +
facet_wrap(~ Category_ESI_cn)
糟糕,又看不清楚了
library(igraph)
library(ggraph)
<- linkk %>%
p2p graph_from_data_frame(directed = F) %>%
ggraph(layout = 'kk') +
geom_edge_fan(aes(edge_alpha = weight,
edge_width = weight,
edge_colour = Category_ESI_cn)) +
geom_node_point(size = 1) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
#theme_void() +
facet_wrap(~ Category_ESI_cn)
ggsave("plt3.png", plot=p2p, dpi=300, width = 40, height = 90, units = "cm")
这下清楚了
参考文献
Silge, Julia, and David Robinson. 2017. Text Mining with r: A Tidy Approach. 1 edition. O’Reilly Media. https://www.tidytextmining.com/.