第 79 章 探索性数据分析-奥林匹克
这是 Nature 期刊上的一篇文章（Nature. 2004 September 30; 431(7008)）。

79.1 导入数据
# Read the workbook used throughout this chapter; later code relies on its
# Olympic_year, Men_score and Women_score columns.
d <- read_excel(path = "./demo_data/olympics.xlsx")
79.2 可视化
# Scatter both score columns against the Olympic year on shared axes:
# Men_score in blue, Women_score in red.
ggplot(data = d) +
  geom_point(
    mapping = aes(x = Olympic_year, y = Men_score),
    color = "blue"
  ) +
  geom_point(
    mapping = aes(x = Olympic_year, y = Women_score),
    color = "red"
  )

# Reshape to long format: every column except Olympic_year is stacked, with
# the original column name stored in `sex` and its value in `winning_time`.
d1 <- d %>%
  pivot_longer(
    cols = -Olympic_year,
    names_to = "sex",
    values_to = "winning_time"
  )
# Plot winning time against year, coloured by `sex`. `sex` holds the original
# column names, hence the "Men_score" / "Women_score" keys in the palette.
d1 %>%
  ggplot(aes(x = Olympic_year, y = winning_time, color = sex)) +
  geom_point() +
  # geom_smooth(method = "lm") +
  scale_color_manual(
    values = c("Men_score" = "blue", "Women_score" = "red")
  ) +
  # One tick per four-year step between 1900 and 2004.
  scale_x_continuous(
    breaks = seq(1900, 2004, by = 4),
    labels = seq(1900, 2004, by = 4)
  ) +
  # Rotate the year labels so adjacent ticks do not overlap.
  theme(axis.text.x = element_text(
    size = 10, angle = 45, colour = "black",
    vjust = 1, hjust = 1
  ))

79.3 回归分析
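建立年份与成绩的线性关系：

$$
\text{score}_i = \alpha + \beta \times \text{year}_i + \epsilon_i, \qquad \epsilon_i \sim \text{Normal}(0, \sigma)
$$

使用 `lm(y ~ 1 + x, data = d)` 拟合模型，要求得的 $\alpha$ 和 $\beta$ 就是公式中 `1` 和 `x` 前对应的系数。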
# Fit the men's model; `fit_1` is reused by the prediction code further down.
fit_1 <- lm(Men_score ~ 1 + Olympic_year, data = d)
summary(fit_1)
## Call:
## lm(formula = Men_score ~ 1 + Olympic_year, data = d)
## Residuals:
## Min 1Q Median 3Q Max
## -0.263708 -0.052702 0.007381 0.080048 0.214559
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.8264525 1.6796428 18.95 4.11e-15 ***
## Olympic_year -0.0110056 0.0008593 -12.81 1.13e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 0.1347 on 22 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.8817, Adjusted R-squared: 0.8764
## F-statistic: 164 on 1 and 22 DF, p-value: 1.128e-11
# Fit the women's model; `fit_2` is reused by the prediction code further down.
fit_2 <- lm(Women_score ~ 1 + Olympic_year, data = d)
summary(fit_2)
## Call:
## lm(formula = Women_score ~ 1 + Olympic_year, data = d)
## Residuals:
## Min 1Q Median 3Q Max
## -0.37579 -0.08460 0.00929 0.08285 0.32234
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.347049 4.284251 10.35 1.70e-08 ***
## Olympic_year -0.016822 0.002176 -7.73 8.63e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 0.2104 on 16 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.7888, Adjusted R-squared: 0.7756
## F-statistic: 59.76 on 1 and 16 DF, p-value: 8.626e-07
79.4 预测
# Predict the 2020 value from `fit_1` using a one-row data frame.
# `fit_1` is expected to be the Men_score model from section 79.3: the printed
# result (9.595) agrees with that model's coefficients.
df <- data.frame(Olympic_year = 2020)
predict(object = fit_1, newdata = df)
## 1
## 9.595218
为了与图片保持一致，我们使用 1900 年到 2252 年的年份（`seq(1900, 2252, by = 4)`）进行预测。
# One row per four-year step from 1900 to 2252; used as `newdata` for the
# predictions below.
grid <- tibble(
  Olympic_year = as.numeric(seq(1900, 2252, by = 4))
)
# Attach one prediction column per model to the year grid.
tb <- grid %>%
  mutate(
    Predict_Men = predict(fit_1, newdata = grid),
    Predict_Women = predict(fit_2, newdata = grid)
  )
# The same two prediction columns, built with add_predictions() instead of
# predict(); the result is only printed, not assigned.
grid %>%
  add_predictions(model = fit_1, var = "Predict_Men") %>%
  add_predictions(model = fit_2, var = "Predict_Women")
79.5 再次可视化
# Stack the prediction columns into long format so they share the `sex` /
# `winning_time` layout used by `d1`.
tb1 <- tb %>%
  pivot_longer(
    cols = -Olympic_year,
    names_to = "sex",
    values_to = "winning_time"
  )
# Draw the predicted trend lines and overlay the observed points from `d1`.
# Both data frames use the same column names, so the observed-point layer
# inherits the aesthetics declared in ggplot().
tb1 %>%
  ggplot(aes(
    x = Olympic_year,
    y = winning_time,
    color = sex
  )) +
  # `size` on line geoms is deprecated in ggplot2 >= 3.4 in favour of
  # `linewidth`; it is kept so the code also runs on older ggplot2 versions.
  geom_line(size = 2) +
  geom_point(data = d1) +
  # One colour and one legend label per series (two observed, two predicted).
  scale_color_manual(
    values = c(
      "Men_score" = "blue",
      "Women_score" = "red",
      "Predict_Men" = "#588B8B",
      "Predict_Women" = "#C8553D"
    ),
    labels = c(
      "Men_score" = "Men score",
      "Women_score" = "Women score",
      "Predict_Men" = "Men Predict score",
      "Predict_Women" = "Women Predict score"
    )
  ) +
  # One tick every 16 years across the extended prediction range.
  scale_x_continuous(
    breaks = seq(1900, 2252, by = 16),
    labels = as.character(seq(1900, 2252, by = 16))
  ) +
  # Rotate the year labels so adjacent ticks do not overlap.
  theme(axis.text.x = element_text(
    size = 10, angle = 45, colour = "black",
    vjust = 1, hjust = 1
  ))

79.6 list_column
# Long-format copy of the raw data: one row per (Olympic_year, sex) pair,
# with the value in `winning_time`.
d1 <- d %>%
  pivot_longer(
    cols = -Olympic_year,
    names_to = "sex",
    values_to = "winning_time"
  )
# Fit a linear model of winning_time on Olympic_year.
#
# df: a data frame with `winning_time` and `Olympic_year` columns.
# Returns the fitted `lm` object.
fit_model <- function(df) {
  lm(formula = winning_time ~ Olympic_year, data = df)
}
# Nest the long data by `sex` (one row per group, rows stored in the `data`
# list-column) and fit one model per group into the `mod` list-column.
d2 <- d1 %>%
  group_nest(sex) %>%
  mutate(
    mod = map(data, fit_model)
  )
d2
## # A tibble: 2 × 3
## sex data mod
## <chr> <list<tibble[,2]>> <list>
## 1 Men_score [27 × 2] <lm>
## 2 Women_score [27 × 2] <lm>
# For each fitted model, add a `pred` column to the year grid, then unnest so
# every (sex, Olympic_year) prediction becomes an ordinary row.
tb4 <- d2 %>%
  mutate(
    predictions = map(mod, ~ add_predictions(grid, .))
  ) %>%
  select(sex, predictions) %>%
  unnest(cols = predictions)
# Plot the per-group predictions (points and lines) and overlay the observed
# values from `d1`. The overlay declares its own aesthetics because its
# y column is `winning_time`, not `pred`.
tb4 %>%
  ggplot(aes(
    x = Olympic_year,
    y = pred,
    group = sex,
    color = sex
  )) +
  geom_point() +
  # `size` on line geoms is deprecated in ggplot2 >= 3.4 in favour of
  # `linewidth`; it is kept so the code also runs on older ggplot2 versions.
  geom_line(size = 2) +
  geom_point(
    data = d1,
    aes(
      x = Olympic_year,
      y = winning_time,
      group = sex,
      color = sex
    )
  ) +
  # One tick every 16 years across the extended prediction range.
  scale_x_continuous(
    breaks = seq(1900, 2252, by = 16),
    labels = as.character(seq(1900, 2252, by = 16))
  ) +
  # Rotate the year labels so adjacent ticks do not overlap.
  theme(axis.text.x = element_text(
    size = 10, angle = 45, colour = "black",
    vjust = 1, hjust = 1
  ))