第 10 章 预测

预测不好写啊,这里包含发文量的预测和引文量的预测。先从学科相关性说起吧

10.1 学科发展的相关性

我们选择四川农业大学的数据来看看。

scau_set <- complete_set %>%
  filter(University == "Sichuan_Agr_Univ") %>%
  filter(!is.na(Category_ESI_cn))
scau_set %>% count(Category_ESI_cn, PY)
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
PY
<dbl>
n
<int>
材料科学20083
材料科学20092
材料科学20111
材料科学20122
材料科学20133
材料科学20145
材料科学20157
材料科学20168
地球科学20101
地球科学20123
df_scau <- scau_set %>%
  count(Category_ESI_cn, PY) %>%
  spread(Category_ESI_cn, n, fill = 0) %>%
  select(-PY)

# Correlation matrix
corr <- round(cor(df_scau), 1)
library(corrplot)
corrplot(corr, order = "hclust", tl.cex = 1, addrect = 8)

ESI学科分类对生命科学领域情有独钟啊,谁叫它那么重要!

10.2 发文量的预测

用机器学习的套路搞搞

train_set <- scau_set %>%
  filter(!PY %in% c("2016")) %>%
  count(Category_ESI_cn, PY) %>%
  rename(paper = n, year = PY) %>%
  group_by(Category_ESI_cn) %>%
  nest()

test_set <- scau_set %>%
  filter(PY %in% c("2016")) %>%
  count(Category_ESI_cn, PY) %>%
  rename(paper = n, year = PY) %>%
  group_by(Category_ESI_cn) %>%
  nest()

set <- left_join(train_set, test_set, by = "Category_ESI_cn")
set
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
data.x
<list>
data.y
<list>
材料科学<tibble[,2]><tibble[,2]>
地球科学<tibble[,2]><tibble[,2]>
分子生物学与遗传学<tibble[,2]><tibble[,2]>
工程学<tibble[,2]><tibble[,2]>
化学<tibble[,2]><tibble[,2]>
环境科学与生态学<tibble[,2]><tibble[,2]>
计算机科学<tibble[,2]><tibble[,2]>
经济与商业<tibble[,2]><NULL>
临床医学<tibble[,2]><tibble[,2]>
免疫学<tibble[,2]><tibble[,2]>
set %>% unnest(data.x)
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
year
<dbl>
paper
<int>
data.y
<list>
材料科学20083<tibble[,2]>
材料科学20092<tibble[,2]>
材料科学20111<tibble[,2]>
材料科学20122<tibble[,2]>
材料科学20133<tibble[,2]>
材料科学20145<tibble[,2]>
材料科学20157<tibble[,2]>
地球科学20101<tibble[,2]>
地球科学20123<tibble[,2]>
地球科学20132<tibble[,2]>

10.3 建模

library(broom)
library(modelr)

fit_model <- function(df) lm(paper ~ year, data = df)
get_rsq <- function(mod) glance(mod)$r.squared
get_output <- function(mod) augment(mod)


master <- set %>%
  mutate(
    model = map(data.x, fit_model),
    predictions = map2(data.y, model, add_predictions),
    resids = map2(data.x, model, add_residuals),
    glance = map(model, broom::glance),
    tidy = map(model, broom::tidy),
    augment = map(model, broom::augment)
  )

master
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
data.x
<list>
data.y
<list>
model
<list>
predictions
<list>
resids
<list>
glance
<list>
tidy
<list>
augment
<list>
材料科学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
地球科学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
分子生物学与遗传学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
工程学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
化学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
环境科学与生态学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
计算机科学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
经济与商业<tibble[,2]><NULL><S3: lm><named list [1]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
临床医学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
免疫学<tibble[,2]><tibble[,2]><S3: lm><tibble[,3]><tibble[,3]><tibble[,12]><tibble[,5]><tibble[,8]>
master %>%
  unnest(glance) %>%
  select(Category_ESI_cn, r.squared) %>%
  arrange(-r.squared)
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
r.squared
<dbl>
农业科学0.952580
化学0.941295
综合交叉学科0.935869
生物学与生物化学0.922594
植物学与动物学0.915566
药理学和毒理学0.893394
分子生物学与遗传学0.809984
地球科学0.779425
免疫学0.757739
环境科学与生态学0.714520
selected <- master %>%
  unnest(glance) %>%
  filter(r.squared >= 0.75)

master %>%
  unnest(glance) %>%
  ggplot(
    mapping = aes(x = Category_ESI_cn, y = r.squared, label = Category_ESI_cn)
  ) +
  geom_point() +
  geom_text(check_overlap = TRUE, na.rm = TRUE) +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    text = element_text(size = 14)
  ) +
  geom_point(data = selected, aes(x = Category_ESI_cn, y = r.squared), colour = "red")

绝大部分是线性的

selected <- master %>%
  unnest(glance) %>%
  filter(r.squared >= 0.75)
selected %>% unnest(data.x)
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
year
<dbl>
paper
<int>
data.y
<list>
model
<list>
predictions
<list>
resids
<list>
r.squared
<dbl>
adj.r.squared
<dbl>
sigma
<dbl>
地球科学20101<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.77940.66911.513
地球科学20123<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.77940.66911.513
地球科学20132<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.77940.66911.513
地球科学20157<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.77940.66911.513
分子生物学与遗传学20077<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.81000.78288.532
分子生物学与遗传学200819<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.81000.78288.532
分子生物学与遗传学200922<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.81000.78288.532
分子生物学与遗传学201032<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.81000.78288.532
分子生物学与遗传学201138<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.81000.78288.532
分子生物学与遗传学201238<tibble[,2]><S3: lm><tibble[,3]><tibble[,3]>0.81000.78288.532
selected %>%
  unnest(data.x) %>%
  ggplot(mapping = aes(x = year, y = paper, colour = Category_ESI_cn)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~Category_ESI_cn) +
  theme(legend.position = "none")

checkdata <- selected %>%
  unnest(predictions) %>%
  select(Category_ESI_cn, data.y, pred) %>%
  unnest(data.y, names_repair = "universal")

checkdata
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
year
<dbl>
paper
<int>
pred
<dbl>
地球科学201647.154
分子生物学与遗传学201611965.861
化学20165451.333
免疫学2016117.891
农业科学20169687.861
生物学与生物化学20168775.111
药理学和毒理学20162218.861
植物学与动物学2016187181.139
综合交叉学科20168160.286
compare <- checkdata %>%
  mutate(delta = abs((paper - pred) / paper)) %>%
  select(Category_ESI_cn, year, paper, pred, delta) %>%
  arrange(delta)

compare
ABCDEFGHIJ0123456789
Category_ESI_cn
<chr>
year
<dbl>
paper
<int>
pred
<dbl>
delta
<dbl>
植物学与动物学2016187181.1390.03134
化学20165451.3330.04938
农业科学20169687.8610.08478
生物学与生物化学20168775.1110.13665
药理学和毒理学20162218.8610.14268
综合交叉学科20168160.2860.25573
免疫学2016117.8910.28261
分子生物学与遗传学201611965.8610.44655
地球科学201647.1540.78846
library(showtext)
showtext_auto()

selected %>%
  unnest(data.x) %>%
  ggplot(mapping = aes(x = year, y = paper, colour = Category_ESI_cn)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  geom_point(data = compare, mapping = aes(x = year, y = paper), colour = "black") +
  geom_point(data = compare, mapping = aes(x = year, y = pred), colour = "red") +
  facet_wrap(~Category_ESI_cn, scales = "free") +
  labs(x = NULL, y = NULL, colour = "") +
  theme(legend.position = "none")

这种方法准确性比较还算高,而可以用来发文预测。

10.4 引文量的预测

论文发表后其学术影响力可以被预测?(我想影响引文量的因素很多,我也不知道有哪些。)

在学校规划中规划化学学科为ESI冲刺学科,那么很自然地一个问题是,化学学科未来需要多少科研产出才能保证其学术影响力达到该学科的ESI阈值?

一般来说,单篇论文发表后,其学术影响力(即被引情况)往往是随机的,但对于大量文献,其发表后受到关注的整体特征却是有规律可循的,它有一个从不为人知,到被发现被引用,再到新技术出现后趋于沉寂,完整的生命周期。

近年来文献5研究给出了论文发表后其学术影响力(被引用总次数)随时间变化的动力学模型,即WSB模型。 该模型一定程度上可以预测某一学科的科研论文未来若干年的被引用情况,模型的数学表达式如下:

cti=m[eβηjAΦ(lntμjσi)1]m[eλjΦ(lntμjσi)1]

其中Φ为累积分布函数

Φ(x)(2π)1/2xey2/2dy

等式中,λ为即时性因子,μ为相对重要性因子,σ为衰减因子,这三个重要参数决定了引文分布函数。显然,不同的学科、不同等级的文章具有不同的引文分布。

那么基于这一数学模型,不仅可以预测未来学科发展趋势,而且可以实现超前谋划和战略预判。 例如化学学科要冲刺ESI,那么近十年论文被引总量就必须达到一个阈值。 而完成这个引文阈值,以当前的科研发展规模和速度是不够的。因此必须提前谋划和重新布局,即化学学科未来五年至少要以怎么样的增速和结构持续发展,才能保障这一既定目标的完成。 而这里等式(10.1)可以给出实现ESI冲刺的科研产出下限。


  1. D. Wang, C. Song, A.-L. Barabási, Science 342, 127 - 132 (2013).↩︎