Chapter 17 Analyzing Literature Abstracts with Text Mining
library(tidyverse)
library(here)
library(fs)
library(purrr)
library(tidytext)
library(widyr)
library(tidygraph)
library(ggraph)
This chapter applies R text-mining techniques to the analysis of literature abstracts. Specifically, we use the tidyverse, tidytext, widyr, tidygraph, and ggraph packages to analyze the abstracts of our university's publications indexed in Web of Science.
17.1 Importing the Data
To keep the analysis reproducible, here are the steps used to obtain the data:
- Open https://www.webofknowledge.com/ and enter the Core Collection
- Enter the full name of the university, e.g. Sichuan Normal University
- Search with "Organization-Enhanced"
- Set the time span to 2009-2018
- Select the SCI/SSCI/A&HCI indexes
- Click Search
- Refine by document type: Article + Review
- Display at most 50 records per page; download at most 500 records at a time
- Choose "Other File Formats" + "Full Record and Cited References" + the tab-delimited "Win UTF" format
- Download and save the batches one by one
In total, we obtained 1988 bibliographic records.
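Each exported file is a tab-delimited table whose column names are two-letter Web of Science field tags. For orientation, here is a small lookup table for the three tags kept in this chapter (my own annotation, not part of the download; tribble() comes from tibble, loaded above via tidyverse):
wos_fields <- tribble(
  ~tag, ~meaning,
  "AB", "abstract",
  "SN", "ISSN of the journal",
  "UT", "WoS accession number (a unique record ID)"
)
wos_fields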
read_plus <- function(flnm) {
  read_tsv(flnm, quote = "", col_names = TRUE) %>%
    # or equivalently:
    # read_delim(flnm, delim = "\t", quote = "", col_names = TRUE) %>%
    # select(AU, AF, SO, DE, C1, RP, FU, CR, TC, SN, PY, UT) %>%
    select(AB, SN, UT) #%>%
    # mutate(University = flnm %>%                    # add the university name,
    #          str_split("/", 7, simplify = TRUE) %>% # parsed from the file path
    #          .[, 6] %>%
    #          str_sub(start = 4)
    # )
}
tbl <- here("data", "newdata") %>%
  dir_ls(regexp = "\\.txt$", recurse = TRUE) %>%  # list all exported .txt files
  map_dfr(~ read_plus(.))
tbl
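As a quick sanity check against the 1988 records reported above, count the distinct accession numbers (a sketch; it assumes every record carries a unique UT):
tbl %>%
  distinct(UT) %>%   # one row per unique WoS accession number
  nrow()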
esi_plus_cas_IF_set <- read_rds(
  here("data", "esiJournalsList", "esi_plus_cas_IF_set.rds")
)
complete_set <- tbl %>%
  left_join(esi_plus_cas_IF_set, by = c("SN" = "ISSN")) %>%
  select(category = Category_ESI_cn, abstract = AB, pubs = UT) #%>%
  # rename(ISSN = SN)
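Because left_join() keeps a record even when its ISSN has no match in the ESI list, it is worth checking how many abstracts end up without a category. A minimal sketch:
complete_set %>%
  count(is.na(category))   # TRUE rows are records whose ISSN matched nothing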
17.2 Data Tidying
complete_set
Tidy the data and split each abstract into bigrams (two-word tokens):
text_df <- complete_set %>%
  filter(!is.na(abstract)) %>%
  unnest_tokens(output = grams, input = abstract, token = "ngrams", n = 2)
text_df
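To see what token = "ngrams", n = 2 produces, here is a toy example on a made-up sentence (purely illustrative):
tibble(abstract = "text mining of article abstracts") %>%
  unnest_tokens(grams, abstract, token = "ngrams", n = 2)
# a sliding two-word window: "text mining", "mining of", "of article", "article abstracts"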
Filter out bigrams that contain stop words:
text_filted <- text_df %>%
  separate(grams, into = c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
text_filted
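For context, tidytext's stop_words table pools three lexicons (onix, SMART, snowball), so filtering against stop_words$word is deliberately aggressive:
stop_words %>%
  count(lexicon)   # rows contributed by each lexicon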
text_unite <- text_filted %>%
  unite(grams, word1, word2, sep = " ")
text_unite
17.3 Computing tf-idf
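As a reminder of what bind_tf_idf() computes: the term frequency tf is a term's share within one document, the inverse document frequency idf down-weights terms that occur in many documents (tidytext uses the natural logarithm), and tf-idf is their product:
\[ \mathrm{tf}(t, d) = \frac{n_{t,d}}{\sum_{t'} n_{t',d}}, \qquad \mathrm{idf}(t) = \ln \frac{N}{n_t}, \qquad \text{tf-idf}(t, d) = \mathrm{tf}(t, d) \times \mathrm{idf}(t) \]
where N is the total number of documents (here, the abstracts identified by pubs) and n_t is the number of documents containing term t.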
text_tf_idf <- text_unite %>%
  count(pubs, grams) %>%
  bind_tf_idf(grams, pubs, n) %>%  # term = grams, document = pubs
  arrange(desc(tf_idf))
text_tf_idf
text_tf_idf %>%
  group_by(pubs) %>%
  filter(max(tf_idf) > 9.1) %>%  # keep only abstracts whose top bigram tf-idf exceeds 9.1
  # dplyr::distinct(pubs)
  ggplot(aes(x = fct_reorder(grams, tf_idf), y = tf_idf, fill = pubs)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(vars(pubs), ncol = 2, scales = "free")
17.4 Text Similarity
We measure how similar two abstracts are by the cosine of the angle between their tf-idf vectors:
\[ \text{similarity} = \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \, \|\mathbf{B}\|} = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^2} \sqrt{\sum_{i=1}^{n} B_i^2}} \]
text_tf_idf %>%
  pairwise_similarity(pubs, grams, tf_idf, upper = FALSE, sort = TRUE)
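To make this output easier to digest, one can keep each abstract's single most similar neighbour. A minimal sketch, using the item1/item2/similarity columns that pairwise_similarity() returns:
text_tf_idf %>%
  pairwise_similarity(pubs, grams, tf_idf, upper = FALSE, sort = TRUE) %>%
  group_by(item1) %>%
  slice_max(similarity, n = 1) %>%   # the closest abstract for each item1
  ungroup()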
17.5 Word Associations
Earlier we built the filtered bigram table text_filted; we now examine how these words are associated with one another.
text_count <- text_filted %>%
  count(category, word1, word2, sort = TRUE) %>%
  select(word1, word2, n, category) %>%
  arrange(category)
text_count
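Counting adjacent bigrams is one notion of association; another is co-occurrence anywhere within the same abstract. A rough alternative sketch using widyr::pairwise_count() (for brevity it reuses only word1 from text_filted, so it undercounts):
text_filted %>%
  pairwise_count(word1, pubs, sort = TRUE)   # word pairs sharing an abstract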
text_count %>%
  filter(n > 20) %>%
  as_tbl_graph()
## # A tbl_graph: 124 nodes and 94 edges
## #
## # A directed multigraph with 46 components
## #
## # Node Data: 124 x 1 (active)
## name
## <chr>
## 1 piezoelectric
## 2 rights
## 3 lead
## 4 elsevier
## 5 sintering
## 6 phase
## # ... with 118 more rows
## #
## # Edge Data: 94 x 4
## from to n category
## <int> <int> <int> <chr>
## 1 1 74 65 材料科学
## 2 2 75 58 材料科学
## 3 3 52 45 材料科学
## # ... with 91 more rows
text_count %>%
  filter(n > 20) %>%
  as_tbl_graph() %>%
  ggraph(layout = "fr") +
  geom_node_point() +
  geom_edge_link(aes(color = category, edge_width = n)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  # facet_wrap(vars(category), ncol = 3, scales = "free")
  facet_edges(vars(category), ncol = 3, scales = "free")
17.6 Next Steps
- The dataset is large and needs to be distilled so that the key information can be extracted
- Further directions are still to be decided