第 11 章 主题词分析

这里要用到文本挖掘技术(Silge and Robinson 2017)

11.1 化学学科

complete_set %>% count(Category_ESI) %>% arrange(-n)

我们就看看四川省的化学学科吧

chem_set <- complete_set %>% filter(Category_ESI == "CHEMISTRY")  
word_freq <- chem_set %>% 
       with(str_split(DE, ";")) %>% 
       unlist() %>% 
       tibble(keywords = .) %>%
       filter(!is.na(keywords) ) %>% 
       filter(!keywords %in% c("")  ) %>% 
       count(keywords = keywords %>% str_to_lower()) %>% 
       arrange(desc(n))

head(word_freq)

11.2 词云

library(wordcloud2)
wordcloud2(word_freq, size=1)
#wordcloud2(word_freq,
#           size=1,
#           figPath = "images/twitter.jpg")

然并卵。不知道大家为什么喜欢词云这个东西

11.3 共现矩阵

occur <- chem_set %>% 
        mutate(id = row_number()) %>% 
        select(id, DE) %>% 
        separate_rows(DE, sep = ";") %>% 
        filter(!is.na(DE) ) 
library(widyr) #devtools::install_github("dgrtwo/widyr")
paper_words_pair <- occur %>% 
       pairwise_count(DE, id, sort = TRUE)

head(paper_words_pair )
links <- paper_words_pair %>% select(item1, item2, n) %>%
                              filter(n >= 10) %>%
                              rename(from = item1, to = item2, weight = n)
links
library(igraph)
library(ggraph) 
#the development version of ggraph requires the development version of ggplot2

 links %>%
   graph_from_data_frame(directed = T) %>%
   ggraph(layout = 'kk') + 
   geom_edge_fan(aes(edge_alpha = weight, edge_width = weight)) +
   geom_node_point(size = 1) +
   geom_node_text(aes(label = name), repel = TRUE, 
                  point.padding = unit(0.2, "lines")) 

  • 这个图需要加点颜色

  • unnest_tokens + stemming + stopword ?

11.4 全部学科

不过瘾吗? 那我们就看看全局

occurrence <- complete_set %>% 
         select(Category_ESI_cn, UT, DE) %>% 
         separate_rows(DE, sep = ";") %>% 
         mutate(DE = str_trim(DE)) %>% 
         filter(!is.na(DE) ) 
paper_words_pairs <- occurrence %>% 
       dplyr::filter(!is.na(Category_ESI_cn)) %>% 
       dplyr::group_by(Category_ESI_cn) %>%
       widyr::pairwise_count(item = DE, feature = UT, sort = TRUE, upper = FALSE)

head(paper_words_pairs )
linkk <- paper_words_pairs %>% 
         select(item1, item2, n, Category_ESI_cn) %>%
         filter(n >= 5) %>% 
         filter(Category_ESI_cn %in% c("物理学", "化学", "数学", "工程学", "计算机科学", "材料科学")) %>%
         rename(from = item1, to = item2, weight = n)
linkk
library(igraph)
library(ggraph) 


linkk %>%
  graph_from_data_frame(directed = F) %>%
  ggraph(layout = 'kk') + 
  geom_edge_fan(aes(edge_alpha = weight, 
                    edge_width = weight,
                    edge_colour = Category_ESI_cn)) +
  geom_node_point(size = 1) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  #theme_void() +
  facet_wrap(~ Category_ESI_cn)  

糟糕,又看不清楚了

library(igraph)
library(ggraph) 


p2p <- linkk %>%
  graph_from_data_frame(directed = F) %>%
  ggraph(layout = 'kk') + 
  geom_edge_fan(aes(edge_alpha = weight, 
                    edge_width = weight,
                    edge_colour = Category_ESI_cn)) +
  geom_node_point(size = 1) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  #theme_void() +
  facet_wrap(~ Category_ESI_cn)  

ggsave("plt3.png", plot=p2p, dpi=300, width = 40, height = 90, units = "cm")

这下清楚了

参考文献

Silge, Julia, and David Robinson. 2017. Text Mining with r: A Tidy Approach. 1 edition. O’Reilly Media. https://www.tidytextmining.com/.