# 第 10 章 预测

## 10.1 学科发展的相关性

scau_set <- complete_set %>%
filter(University == "Sichuan_Agr_Univ") %>%
filter(!is.na(Category_ESI_cn))
scau_set %>% count(Category_ESI_cn, PY)
df_scau <- scau_set %>%
count(Category_ESI_cn, PY) %>%
spread(Category_ESI_cn, n, fill = 0) %>%
select(-PY)

# Correlation matrix
corr <- round(cor(df_scau), 1)
library(corrplot)
corrplot(corr, order = "hclust", tl.cex = 1, addrect = 8)

ESI学科分类对生命科学领域情有独钟啊，谁叫它那么重要！

## 10.2 发文量的预测

train_set <- scau_set %>%
filter(!PY %in% c("2016")) %>%
count(Category_ESI_cn, PY) %>%
rename(paper = n, year = PY) %>%
group_by(Category_ESI_cn) %>%
nest()

test_set <- scau_set %>%
filter(PY %in% c("2016")) %>%
count(Category_ESI_cn, PY) %>%
rename(paper = n, year = PY) %>%
group_by(Category_ESI_cn) %>%
nest()

set <- left_join(train_set, test_set, by = "Category_ESI_cn")
set
set %>% unnest(data.x)

## 10.3 建模

library(broom)
library(modelr)

fit_model <- function(df) lm(paper ~ year, data = df)
get_rsq <- function(mod) glance(mod)\$r.squared
get_output <- function(mod) augment(mod)

master <- set %>%
mutate(
model = map(data.x, fit_model),
glance = map(model, broom::glance),
tidy = map(model, broom::tidy),
augment = map(model, broom::augment)
)

master
master %>%
unnest(glance) %>%
select(Category_ESI_cn, r.squared) %>%
arrange(-r.squared)
selected <- master %>%
unnest(glance) %>%
filter(r.squared >= 0.75)

master %>%
unnest(glance) %>%
ggplot(
mapping = aes(x = Category_ESI_cn, y = r.squared, label = Category_ESI_cn)
) +
geom_point() +
geom_text(check_overlap = TRUE, na.rm = TRUE) +
theme(
axis.line = element_blank(),
axis.text.x = element_blank(),
text = element_text(size = 14)
) +
geom_point(data = selected, aes(x = Category_ESI_cn, y = r.squared), colour = "red")

selected <- master %>%
unnest(glance) %>%
filter(r.squared >= 0.75)
selected %>% unnest(data.x)
selected %>%
unnest(data.x) %>%
ggplot(mapping = aes(x = year, y = paper, colour = Category_ESI_cn)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
facet_wrap(~Category_ESI_cn) +
theme(legend.position = "none")

checkdata <- selected %>%
unnest(predictions) %>%
select(Category_ESI_cn, data.y, pred) %>%
unnest(data.y, names_repair = "universal")

checkdata
compare <- checkdata %>%
mutate(delta = abs((paper - pred) / paper)) %>%
select(Category_ESI_cn, year, paper, pred, delta) %>%
arrange(delta)

compare
library(showtext)
showtext_auto()

selected %>%
unnest(data.x) %>%
ggplot(mapping = aes(x = year, y = paper, colour = Category_ESI_cn)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
geom_point(data = compare, mapping = aes(x = year, y = paper), colour = "black") +
geom_point(data = compare, mapping = aes(x = year, y = pred), colour = "red") +
facet_wrap(~Category_ESI_cn, scales = "free") +
labs(x = NULL, y = NULL, colour = "") +
theme(legend.position = "none")

## 10.4 引文量的预测

$$$c^t_i = m\left[ e^{ \frac{\beta\eta_j}{A} \Phi\left( \frac{\ln t -\mu_j}{\sigma_i}\right)} -1\right] \equiv m\left[ e^{ \lambda_j \Phi\left( \frac{\ln t -\mu_j}{\sigma_i}\right)} -1\right] \tag{10.1}$$$

$\Phi(x) \equiv (2\pi)^{-1/2}\int_{-\infty}^x e^{-y^2/2} \,\mathrm{d} y$

1. D. Wang, C. Song, A.-L. Barabási, Science 342, 127 - 132 (2013).↩︎