第 2 章 学科结构与发展
2.1 PROS
关于文献计量的数据库很多,比如PROS,Arxiv等等,对PROS和Arxiv数据集的分析,另外单独介绍。
2.2 Web of Science
我们选择 Web of Science,是因为当前 ESI 学科是各高校关注的热点,而 Web of Science 数据库是 ESI 的上游数据库,挖掘的空间更大。
Web of Science数据库的题录数据结构如下表:
2.3 数据集的构建
由于wos网站限制每次只能下载500条记录,因此每个学校的题录文件(一个文件夹)存放不止一个txt文件。具体文件结构如下:
除了题录数据,还包括辅助的数据集
| 编号 | 名称 | 简介 | 变量数量 | 共有变量 |
|---|---|---|---|---|
| 1 | web of science | wos题录 | 13 | University,ISSN |
| 2 | sichuan_shool_encn2.csv | 四川高校的名称 | 2 | University |
| 3 | esi_plus_cas_IF_set.rds | esi期刊学科分类、中科院学科分类、是否中国SCI期刊、期刊影响因子等 | 11 | ISSN |
接下来,我们来读取规整文件
# Load Packages
library(tidyverse)
library(here)
library(fs)
library(knitr)
library(kableExtra)
构建读取相应文件的子函数
#' Read one Web of Science export file and tag it with its university.
#'
#' @param flnm Path to a single tab-separated export file, written with "/"
#'   as the separator. The file's parent folder is expected to be named like
#'   "01_Chengdu_Univ_Technol".
#' @return A tibble with the twelve selected WoS fields plus a `University`
#'   column holding the parent folder name without its first three characters.
read_plus <- function(flnm) {
  # Split the path into its components; for a single path this is a one-row
  # character matrix, so length() equals the number of components.
  path_parts <- str_split(flnm, "/", simplify = TRUE)
  # The second-to-last component is the university folder.
  univ_folder <- path_parts[, length(path_parts) - 1]

  read_tsv(flnm, quote = "", col_names = TRUE) %>%
    # or:
    # read_delim(flnm, delim = "\t", quote = "", col_names = TRUE) %>%
    select(AU, AF, SO, DE, C1, RP, FU, CR, TC, SN, PY, UT) %>%
    # Add the university name, dropping the leading three characters
    # (e.g. "01_") of the folder name.
    mutate(University = str_sub(univ_folder, start = 4))
}
"G:/Guide_DS_R/63R4IS/data/Record/sichuan/01_Chengdu_Univ_Technol/1-500.txt" %>%
str_split("/", simplify = F) %>%
map_chr(.,
~.x[length(.x) - 1]
)
## [1] "01_Chengdu_Univ_Technol"
依次迭代读取
# Read every .txt export found under data/Record/sichuan (searching
# sub-folders too) with read_plus() and stack the results row-wise.
tbl <- here("data", "Record", "sichuan") %>%
  # "*.txt" is a glob pattern rather than a regular expression, so it is
  # passed as `glob`; `recurse` is the current name of the deprecated
  # `recursive` argument.
  dir_ls(glob = "*.txt", recurse = TRUE) %>%
  map_dfr(read_plus)
tbl
测试下呢
# Number of records read per university.
tbl %>%
  count(University)
读取四川高校的名称
# Lookup table of Sichuan university names, joined to the records on the
# `University` column further down.
sichuan_univ_name <- read_csv(here("data", "map_list", "sichuan_shool_encn2.csv"))
sichuan_univ_name
将 sichuan_univ_name 合并到 tbl,形成 sichuan_set
# Attach the university name lookup to every record, keeping all rows of tbl.
sichuan_set <- tbl %>%
  left_join(sichuan_univ_name, by = "University")
这里是esi期刊列表数据集, esi_plus_cas_set = esiJCR + CAS + chineseJCR
(即,每个期刊的esi学科分类,中科院学科分类,是否是中国JCR期刊),目的是方便今后分析
# Journal-level lookup table, joined to the records on ISSN further down.
esi_plus_cas_IF_set <- read_rds(
  here("data", "esiJournalsList", "esi_plus_cas_IF_set.rds")
)
将 esi_plus_cas_IF_set 合并到 sichuan_set,形成 complete_set
# Build the final dataset: add the journal information to each record by
# matching the record's SN field against the lookup's ISSN.
complete_set <- sichuan_set %>%
  left_join(esi_plus_cas_IF_set, by = c("SN" = "ISSN")) %>%
  # Put the two university columns first.
  select(University, University_cn, everything()) %>%
  rename(ISSN = SN) %>%
  # Title-case the SO field.
  mutate(SO = str_to_title(SO))
2.4 数据集的导入
最终,我们围绕四川省各高校,选择Web of Science数据库中的10个维度(上图中红色圈出的部分),然后映射并融合了ESI学科分类、中科院期刊分类、期刊等级以及影响因子等,构建了包含24个维度的完整数据集。
# Alternative: load the dataset from an .rds file instead of rebuilding it.
# complete_set <- read_rds("data/dataset/complete_set.rds")

# Show every column of the dataset with its type and first values.
complete_set %>%
  glimpse()
## Rows: 19,341
## Columns: 24
## $ University <chr> "Chengdu_Univ_Technol", "Chen~
## $ University_cn <chr> "成都理工大学", "成都理工大学~
## $ AU <chr> "Duan, YP; Shang, RS; Zhong, ~
## $ AF <chr> "Duan, Yiping; Shang, Ruishu;~
## $ SO <chr> "International Journal Of Hyd~
## $ DE <chr> "Dry reforming of methane; Hy~
## $ C1 <chr> "[Duan, Yiping; Shang, Ruishu~
## $ RP <chr> "Huang, LH (reprint author), ~
## $ FU <chr> "National Natural Science Fou~
## $ CR <lgl> NA, NA, NA, NA, NA, NA, NA, N~
## $ TC <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,~
## $ ISSN <chr> "0360-3199", "1367-9120", "09~
## $ PY <dbl> 2016, 2016, 2016, 2016, 2016,~
## $ UT <chr> "WOS:000389786500012", "WOS:0~
## $ Full_title <chr> "INTERNATIONAL JOURNAL OF HYD~
## $ Category_ESI <chr> "ENGINEERING", "GEOSCIENCES",~
## $ Category_ESI_cn <chr> "工程学", "地球科学", "计算机~
## $ Category_CAS <chr> "工程技术", "地学", "工程技术~
## $ Degree_CAS <int> 2, 3, 2, 4, 4, 4, 3, 3, 3, 4,~
## $ Title <chr> NA, NA, NA, NA, NA, NA, "石油~
## $ is_chinese <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,~
## $ Cites <dbl> 75123, 12551, 22735, 696, 890~
## $ ImpactFactor <dbl> 4.229, 2.866, 3.241, 0.314, 2~
## $ Eigenfactor <dbl> 0.08137, 0.02432, 0.04892, 0.~
2.5 各高校对比
建立数据集有很大的好处,比如我们可以很方便的对比四川各高校的科研情况
# Number of records per university, largest first.
complete_set %>%
  count(University_cn) %>%
  arrange(desc(n))
被引情况呢
# Sum of TC per university (count() weighted by TC), largest first.
complete_set %>%
  count(University_cn, wt = TC) %>%
  arrange(desc(n))
从未被引用的数量和占比(太讨厌了吧)
# Per university: number of records, number of records with TC equal to 0,
# and the share of such records, sorted by that share in descending order.
complete_set %>%
  group_by(University_cn) %>%
  summarise(
    num = n(),
    not_cited = sum(TC == 0),
    prop = not_cited / num
  ) %>%
  arrange(desc(prop))
2.6 分析单个学校
看看我们川师的情况吧
# Records filed under Sichuan Normal University.
sicnu_set <- complete_set %>%
  filter(University == "Sichuan_Normal_Univ")

# Total number of those records.
sicnu_set %>%
  summarise(count = n())

# Number of records whose C1 field contains "Sichuan Normal Univ," followed
# by one more comma-terminated segment.
sicnu_set %>%
  filter(str_detect(C1, "Sichuan Normal Univ,\\s+([^,]*),")) %>%
  summarise(count = n())
为什么少了2个呢?大家想想吧
2.7 第一作者筛查
# Count records whose C1 field *starts* with a bracketed author list that is
# immediately followed by "Sichuan Normal Univ," and one more comma-terminated
# segment.
sicnu_set %>%
  filter(str_detect(C1, "^\\[([^]]+)\\]\\s+Sichuan Normal Univ,\\s+([^,]*),")) %>%
  summarise(count = n())
2.8 通讯作者筛查
# Count records whose RP field mentions "Sichuan Normal Univ".
sicnu_set %>%
  filter(str_detect(RP, "Sichuan Normal Univ")) %>%
  summarise(count = n())
2.9 可视化之小试牛刀
川师逐年论文发表情况
# Bar chart of the number of records per value of PY.
sicnu_set %>%
  ggplot() +
  geom_bar(aes(x = PY))
2.10 可视化之初显身手
# Universities (matched against University_cn) used by the plots below.
some_univ <- c(
  "四川师范大学", "成都理工大学", "西南科技大学",
  "成信大", "西南石油大学", "四川农业大学",
  "四川理工学院", "西华师范大学", "西华大学"
)
# Subject categories (matched against Category_ESI_cn) used by the plots below.
some_cate <- c("物理学", "化学", "数学", "工程学", "计算机科学", "材料科学")
library(showtext)
# Turn on showtext rendering for the plots that follow
# (presumably needed for the Chinese labels -- confirm).
showtext_auto()

# Records per university, subject category and year, restricted to the
# selected universities and categories from 2010 onwards; one panel per
# category, one line per university.
complete_set %>%
  group_by(University_cn, Category_ESI_cn, PY) %>%
  summarise(sumpaper = n()) %>%
  filter(
    University_cn %in% some_univ,
    Category_ESI_cn %in% some_cate,
    PY >= 2010
  ) %>%
  ggplot(aes(x = PY, y = sumpaper, group = University_cn, colour = factor(University_cn))) +
  geom_point() +
  geom_line() +
  facet_wrap(vars(Category_ESI_cn), scales = "free") +
  labs(x = NULL, y = NULL, colour = "") +
  ggtitle("我省部分省属高校近几年各学科论文情况", subtitle = '(数据来源 Web of Science)') +
  # The plot maps `colour`, not `fill`, so the legend layout has to be set on
  # the colour guide to have any effect.
  guides(colour = guide_legend(nrow = 1, byrow = TRUE)) +
  theme(
    #plot.title = element_text(size = 16),
    #plot.subtitle=element_text(size=14, hjust=0.01, face="italic"),
    legend.box = "horizontal",
    legend.position = "bottom",
    legend.direction = "horizontal"#,
    #text=element_text(size=14)
  )
# Bubble chart: one point per (university, subject category) pair for the
# selected universities, sized by the number of records.
complete_set %>%
  filter(!is.na(Category_ESI_cn)) %>%
  group_by(University_cn, Category_ESI_cn) %>%
  summarise(sumpaper = n()) %>%
  filter(University_cn %in% some_univ) %>%
  ggplot(aes(
    x = University_cn, y = Category_ESI_cn,
    colour = University_cn, fill = University_cn, size = sumpaper
  )) +
  geom_point(shape = 21) +
  scale_size(range = c(0, 20)) +
  xlab("") + ylab("") +
  ggtitle("我省部分省属高校近几年各学科论文情况", subtitle = '(数据来源 Web of Science)') +
  theme(legend.position = "none")
2.11 可视化之渐入佳境
library(scales)
library(ggiraph)
library(ggiraphExtra)
# One row per university and one column per subject category holding the
# record count (0 where a pair does not occur). Each category column is then
# passed through rescale() across all universities before the selected
# universities are kept.
df_radar <- complete_set %>%
  filter(!is.na(Category_ESI_cn)) %>%
  count(University_cn, Category_ESI_cn) %>%
  spread(Category_ESI_cn, n, fill = 0) %>%
  # funs() is deprecated; the function is passed to mutate_at() directly.
  mutate_at(vars(-University_cn), rescale) %>%
  filter(University_cn %in% some_univ)

# Radar chart, one panel per university.
ggRadar(
  data = df_radar,
  mapping = aes(colour = University_cn),
  size = 2,
  legend.position = "none",
  rescale = TRUE
) +
  facet_wrap(vars(University_cn))
来点动画吧,(gganimate升级后,该代码无法工作)
现在我们可以使用新的 gganimate宏包了
library(gganimate)
# Record counts per university, year and subject category for the selected
# universities, ignoring records without a category.
ttb <- complete_set %>%
  filter(University_cn %in% some_univ, !is.na(Category_ESI_cn)) %>%
  count(University_cn, PY, Category_ESI_cn)
看看这个动画
# Animated bubble chart: university by subject category, sized by count,
# with one animation state per value of PY.
ttb %>%
  ggplot(aes(
    x = University_cn, y = Category_ESI_cn,
    colour = University_cn, fill = University_cn, size = n
  )) +
  geom_point(shape = 21) +
  scale_size(range = c(0, 10)) +
  # Here comes the gganimate code
  labs(title = "{closest_state}") +
  transition_states(PY, state_length = 2, transition_length = 1) +
  enter_fade() +
  exit_shrink() +
  ease_aes('linear') +
  xlab("") + ylab("") +
  theme(legend.position = "none")
又来一个
# Record counts per university and year for the selected universities.
tsb <- complete_set %>%
  count(University_cn, PY) %>%
  filter(University_cn %in% some_univ)

# Animated scatter plot of yearly counts; points from earlier frames are kept
# on screen.
tsb %>%
  ggplot(aes(PY, n, size = n, colour = University_cn)) +
  #geom_point(alpha = 0.7, show.legend = FALSE) +
  geom_point(alpha = 0.7, show.legend = TRUE) +
  scale_size(range = c(2, 12)) +
  # Here comes the gganimate specific bits
  labs(
    title = "我省部分省属高校近几年各学科论文情况Year: {frame_time}",
    subtitle = '(数据来源 Web of Science)',
    y = "", x = "Years",
    caption = "Source: Web of Science",
    colour = "学校"
  ) +
  transition_time(PY) +
  shadow_mark(past = TRUE) + # equivalent to cumulative = TRUE in the old package
  ease_aes('linear')
再来一个
library(RColorBrewer)
library("viridis")
# Entries 2 to 8 of the 9-colour "Oranges" Brewer palette.
mycols <- brewer.pal(9, "Oranges")[2:8]

# Make theme_bw() the default theme for the plots that follow.
theme_set(theme_bw())
# Heat map of `value` by CAS category (rows) and CAS tier (columns), with the
# rounded value printed in each tile.
# NOTE(review): `shapenumber` is not defined in this part of the file; it must
# provide the columns Category_CAS, Degree_CAS and value -- confirm its source.
shapenumber %>%
  filter(!is.na(Category_CAS)) %>%
  ggplot(aes(y = Category_CAS, x = Degree_CAS, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colours = mycols) +
  geom_text(
    aes(label = round(value, 2)),
    size = 6
  ) +
  labs(x = NULL, y = NULL) +
  ggtitle("四川师范大学近几年各学科论文发表情况", subtitle = '(数据来源 Web of Science)') +
  guides(fill = guide_legend(
    title = "数量"
    #reverse = TRUE
  )) +
  theme(
    plot.title = element_text(size = 18),
    plot.subtitle = element_text(size = 14, hjust = 0.01, face = "italic"),
    legend.title = element_text(size = 14),
    legend.position = "top",
    legend.justification = c(1, 0),
    legend.margin = margin(0, 0, 0, 0),
    axis.text.x = element_text(margin = margin(0.4, unit = "cm")),
    text = element_text(family = "fzxbsj", size = 20)
    #plot.margin = unit(c(0, 0, 0, 0), "cm") #top, right, bottom, left
  ) +
  scale_x_discrete(
    expand = c(0, 0),
    labels = c("degree1" = "一区", "degree2" = "二区",
               "degree3" = "三区", "degree4" = "四区")
  ) +
  scale_y_discrete(expand = c(0, 0))
2.12 更多
还可以做各种统计和可视化,在后面的章节一一展示吧