第 11 章 主题词分析
这里要用到文本挖掘技术(Silge and Robinson 2017)
11.1 化学学科
# Count the papers in each ESI category, largest category first.
complete_set %>%
  count(Category_ESI) %>%
  arrange(desc(n))
| Category_ESI <chr> | n <int> |
|---|---|
| CHEMISTRY | 3447 |
| PHYSICS | 2104 |
| ENGINEERING | 1694 |
| NA | 1492 |
| MATERIALS SCIENCE | 1400 |
| PLANT & ANIMAL SCIENCE | 1322 |
| GEOSCIENCES | 1131 |
| MATHEMATICS | 923 |
| CLINICAL MEDICINE | 840 |
| BIOLOGY & BIOCHEMISTRY | 824 |
我们就看看四川省的化学学科吧
# Keep only the papers whose ESI category is chemistry.
chem_set <- complete_set %>%
  filter(Category_ESI == "CHEMISTRY")

# Keyword frequency table built from the DE column:
# split every record on ";", flatten into one vector, drop missing and
# empty entries, lower-case so case variants are counted together, and
# sort by descending count.
# NOTE(review): the split pieces are not trimmed here (the later
# all-disciplines section applies str_trim()); confirm whether DE uses
# "; " as separator, in which case " x" and "x" are counted separately.
word_freq <- chem_set %>%
  with(str_split(DE, ";")) %>%
  unlist() %>%
  tibble(keywords = .) %>%
  filter(!is.na(keywords)) %>%
  filter(!keywords %in% c("")) %>%
  count(keywords = keywords %>% str_to_lower()) %>%
  arrange(desc(n))

head(word_freq)
| keywords <chr> | n <int> |
|---|---|
| density functional theory | 89 |
| crystal structure | 56 |
| synthesis | 53 |
| adsorption | 43 |
| solubility | 42 |
| fluorescence | 27 |
11.2 词云
library(wordcloud2)

# Draw the keyword frequency table as a word cloud.
wordcloud2(data = word_freq, size = 1)

# Disabled variant that passes an image file via figPath:
# wordcloud2(word_freq,
#   size = 1,
#   figPath = "images/twitter.jpg")
然并卵。不知道大家为什么喜欢词云这个东西
11.3 共现矩阵
# One row per (paper, keyword): number the papers, split the
# ";"-separated DE field into separate rows, and drop missing keywords.
occur <- chem_set %>%
  mutate(id = row_number()) %>%
  select(id, DE) %>%
  separate_rows(DE, sep = ";") %>%
  filter(!is.na(DE))

library(widyr) # devtools::install_github("dgrtwo/widyr")

# Count how often each pair of keywords occurs within the same paper id,
# most frequent pairs first.
paper_words_pair <- occur %>%
  pairwise_count(DE, id, sort = TRUE)

head(paper_words_pair)
| item1 <chr> | item2 <chr> | n <dbl> |
|---|---|---|
| Metal carbonyls | Density functional theory | 19 |
| Density functional theory | Metal carbonyls | 19 |
| Metal-metal bonding | Density functional theory | 17 |
| Density functional theory | Metal-metal bonding | 17 |
| Metal-metal bonding | Metal carbonyls | 15 |
| Metal carbonyls | Metal-metal bonding | 15 |
# Edge list for the co-occurrence graph: keep pairs seen together at
# least 10 times and rename the columns to from / to / weight.
links <- paper_words_pair %>%
  select(item1, item2, n) %>%
  filter(n >= 10) %>%
  rename(from = item1, to = item2, weight = n)

links
| from <chr> | to <chr> | weight <dbl> |
|---|---|---|
| Metal carbonyls | Density functional theory | 19 |
| Density functional theory | Metal carbonyls | 19 |
| Metal-metal bonding | Density functional theory | 17 |
| Density functional theory | Metal-metal bonding | 17 |
| Metal-metal bonding | Metal carbonyls | 15 |
| Metal carbonyls | Metal-metal bonding | 15 |
| EIS | SEM | 11 |
| SEM | EIS | 11 |
| catalytic kinetics | mimic hydrolase | 10 |
| mimic hydrolase | catalytic kinetics | 10 |
library(igraph)
library(ggraph)
# the development version of ggraph requires the development version of ggplot2

# Draw the chemistry keyword co-occurrence network: build a directed
# graph from the edge list and map the pair count (weight) to both
# edge transparency and edge width.
links %>%
  graph_from_data_frame(directed = TRUE) %>%
  ggraph(layout = "kk") +
  geom_edge_fan(aes(edge_alpha = weight, edge_width = weight)) +
  geom_node_point(size = 1) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  )
这个图需要加点颜色
unnest_tokens + stemming + stopword ?
11.4 全部学科
不过瘾吗? 那我们就看看全局
# One row per (category, paper UT, keyword) across all disciplines:
# split DE on ";", trim surrounding whitespace, drop missing keywords.
occurrence <- complete_set %>%
  select(Category_ESI_cn, UT, DE) %>%
  separate_rows(DE, sep = ";") %>%
  mutate(DE = str_trim(DE)) %>%
  filter(!is.na(DE))

# Within each category, count keyword pairs that share a paper (UT).
# upper = FALSE is passed so each pair is not reported in both orders.
paper_words_pairs <- occurrence %>%
  dplyr::filter(!is.na(Category_ESI_cn)) %>%
  dplyr::group_by(Category_ESI_cn) %>%
  widyr::pairwise_count(item = DE, feature = UT, sort = TRUE, upper = FALSE)

head(paper_words_pairs)
| Category_ESI_cn <chr> | item1 <chr> | item2 <chr> | n <dbl> |
|---|---|---|---|
| 工程学 | sensitivity analysis | coupled system | 5 |
| 工程学 | China | Energy consumption | 4 |
| 工程学 | Adsorption | Methylene blue | 4 |
| 工程学 | Adsorption | Mechanism | 4 |
| 工程学 | Adsorption | Fluoride | 4 |
| 工程学 | Ruthenium | Ammonia borane | 4 |
# Edge list for the multi-discipline graph: keep pairs seen together at
# least 5 times, restricted to six selected categories, and rename the
# columns to from / to / weight (the category column is kept).
linkk <- paper_words_pairs %>%
  select(item1, item2, n, Category_ESI_cn) %>%
  filter(n >= 5) %>%
  filter(Category_ESI_cn %in% c("物理学", "化学", "数学", "工程学", "计算机科学", "材料科学")) %>%
  rename(from = item1, to = item2, weight = n)

linkk
| from <chr> | to <chr> | weight <dbl> | Category_ESI_cn <chr> |
|---|---|---|---|
| sensitivity analysis | coupled system | 5 | 工程学 |
| Rough set theory | Granular computing | 6 | 计算机科学 |
| Attribute reduction | Rough set theory | 5 | 计算机科学 |
| global existence | blow-up | 14 | 数学 |
| global existence | chemotaxis | 5 | 数学 |
| global existence | blowup | 5 | 数学 |
| greatest-type divisor | divisibility | 5 | 数学 |
| Density functional theory | Metal carbonyls | 19 | 化学 |
| Density functional theory | Metal-metal bonding | 17 | 化学 |
| triphenylene | discotic liquid crystal | 16 | 化学 |
library(igraph)
library(ggraph)

# Draw the keyword co-occurrence network for the selected categories:
# undirected graph, edges coloured by category, one facet per category.
linkk %>%
  graph_from_data_frame(directed = FALSE) %>%
  ggraph(layout = "kk") +
  geom_edge_fan(aes(
    edge_alpha = weight,
    edge_width = weight,
    edge_colour = Category_ESI_cn
  )) +
  geom_node_point(size = 1) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  # theme_void() +
  facet_wrap(~Category_ESI_cn)
糟糕,又看不清楚了
library(igraph)
library(ggraph)

# Same faceted co-occurrence network as above, but stored in p2p so it
# can be written to disk on a large canvas.
p2p <- linkk %>%
  graph_from_data_frame(directed = FALSE) %>%
  ggraph(layout = "kk") +
  geom_edge_fan(aes(
    edge_alpha = weight,
    edge_width = weight,
    edge_colour = Category_ESI_cn
  )) +
  geom_node_point(size = 1) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  # theme_void() +
  facet_wrap(~Category_ESI_cn)

# Save as a 40 cm x 90 cm PNG at 300 dpi.
ggsave("plt3.png", plot = p2p, dpi = 300, width = 40, height = 90, units = "cm")
这下清楚了
参考文献
Silge, Julia, and David Robinson. 2017. Text Mining with R: A Tidy Approach. 1st edition. O’Reilly Media. https://www.tidytextmining.com/.