# 第 37 章 探索性数据分析-奥林匹克

## 37.1 导入数据

library(tidyverse)
library(readxl)
d <- read_excel("./demo_data/olympics.xlsx")
d
## # A tibble: 27 x 3
##    Olympic_year Men_score Women_score
##           <dbl>     <dbl>       <dbl>
##  1         1900      11          NA
##  2         1904      11          NA
##  3         1908      10.8        NA
##  4         1912      10.8        NA
##  5         1916      NA          NA
##  6         1920      10.8        NA
##  7         1924      10.6        NA
##  8         1928      10.8        12.2
##  9         1932      10.3        11.9
## 10         1936      10.3        11.5
## # ... with 17 more rows

## 37.2 可视化

d %>%
ggplot() +
geom_point(aes(x = Olympic_year, y = Men_score), color = "blue") +
geom_point(aes(x = Olympic_year, y = Women_score), color = "red")

d1 <- d %>%
pivot_longer(
cols = -Olympic_year,
names_to = "sex",
values_to = "winning_time"
)

d1
## # A tibble: 54 x 3
##    Olympic_year sex         winning_time
##           <dbl> <chr>              <dbl>
##  1         1900 Men_score           11
##  2         1900 Women_score         NA
##  3         1904 Men_score           11
##  4         1904 Women_score         NA
##  5         1908 Men_score           10.8
##  6         1908 Women_score         NA
##  7         1912 Men_score           10.8
##  8         1912 Women_score         NA
##  9         1916 Men_score           NA
## 10         1916 Women_score         NA
## # ... with 44 more rows

d1 %>%
ggplot(aes(x = Olympic_year, y = winning_time, color = sex)) +
geom_point() +
# geom_smooth(method = "lm") +
scale_color_manual(
values = c("Men_score" = "blue", "Women_score" = "red")
) +
scale_x_continuous(
breaks = seq(1900, 2004, by = 4),
labels = seq(1900, 2004, by = 4)
) +
theme(axis.text.x = element_text(
size = 10, angle = 45, colour = "black",
vjust = 1, hjust = 1
))

## 37.3 回归分析

fit_1 <- lm(Men_score ~ 1 + Olympic_year, data = d)

summary(fit_1)
##
## Call:
## lm(formula = Men_score ~ 1 + Olympic_year, data = d)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -0.26371 -0.05270  0.00738  0.08005  0.21456
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  31.826453   1.679643    18.9  4.1e-15 ***
## Olympic_year -0.011006   0.000859   -12.8  1.1e-11 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.135 on 22 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.882,  Adjusted R-squared:  0.876
## F-statistic:  164 on 1 and 22 DF,  p-value: 1.13e-11
fit_2 <- lm(Women_score ~ 1 + Olympic_year, data = d)

summary(fit_2)
##
## Call:
## lm(formula = Women_score ~ 1 + Olympic_year, data = d)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -0.3758 -0.0846  0.0093  0.0829  0.3223
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  44.34705    4.28425   10.35  1.7e-08 ***
## Olympic_year -0.01682    0.00218   -7.73  8.6e-07 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.21 on 16 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.789,  Adjusted R-squared:  0.776
## F-statistic: 59.8 on 1 and 16 DF,  p-value: 8.63e-07

## 37.4 预测

df <- data.frame(Olympic_year = 2020)

predict(fit_1, newdata = df)
##     1
## 9.595

grid <- tibble(
Olympic_year = as.numeric(seq(1900, 2252, by = 4))
)
grid
## # A tibble: 89 x 1
##    Olympic_year
##           <dbl>
##  1         1900
##  2         1904
##  3         1908
##  4         1912
##  5         1916
##  6         1920
##  7         1924
##  8         1928
##  9         1932
## 10         1936
## # ... with 79 more rows
tb <- grid %>%
mutate(
Predict_Men = predict(fit_1, newdata = grid),
Predict_Women = predict(fit_2, newdata = grid)
)
tb
## # A tibble: 89 x 3
##    Olympic_year Predict_Men Predict_Women
##           <dbl>       <dbl>         <dbl>
##  1         1900        10.9          12.4
##  2         1904        10.9          12.3
##  3         1908        10.8          12.3
##  4         1912        10.8          12.2
##  5         1916        10.7          12.1
##  6         1920        10.7          12.0
##  7         1924        10.7          12.0
##  8         1928        10.6          11.9
##  9         1932        10.6          11.8
## 10         1936        10.5          11.8
## # ... with 79 more rows

library(modelr)
grid %>%
add_predictions(fit_2, var = "Predict_Women")
## # A tibble: 89 x 3
##    Olympic_year Predict_Men Predict_Women
##           <dbl>       <dbl>         <dbl>
##  1         1900        10.9          12.4
##  2         1904        10.9          12.3
##  3         1908        10.8          12.3
##  4         1912        10.8          12.2
##  5         1916        10.7          12.1
##  6         1920        10.7          12.0
##  7         1924        10.7          12.0
##  8         1928        10.6          11.9
##  9         1932        10.6          11.8
## 10         1936        10.5          11.8
## # ... with 79 more rows

## 37.5 再次可视化

tb1 <- tb %>%
pivot_longer(
cols = -Olympic_year,
names_to = "sex",
values_to = "winning_time"
)
tb1
## # A tibble: 178 x 3
##    Olympic_year sex           winning_time
##           <dbl> <chr>                <dbl>
##  1         1900 Predict_Men           10.9
##  2         1900 Predict_Women         12.4
##  3         1904 Predict_Men           10.9
##  4         1904 Predict_Women         12.3
##  5         1908 Predict_Men           10.8
##  6         1908 Predict_Women         12.3
##  7         1912 Predict_Men           10.8
##  8         1912 Predict_Women         12.2
##  9         1916 Predict_Men           10.7
## 10         1916 Predict_Women         12.1
## # ... with 168 more rows
tb1 %>%
ggplot(aes(
x = Olympic_year,
y = winning_time,
color = sex
)) +
geom_line(size = 2) +
geom_point(data = d1) +
scale_color_manual(
name = "标记",
values = c(
"Men_score" = "blue",
"Women_score" = "red",
"Predict_Men" = "#588B8B",
"Predict_Women" = "#C8553D"
),
labels = c(
"Men_score" = "男性历史成绩",
"Women_score" = "女性历史成绩",
"Predict_Men" = "男性预测成绩",
"Predict_Women" = "女性预测成绩"
)
) +
scale_x_continuous(
breaks = seq(1900, 2252, by = 16),
labels = as.character(seq(1900, 2252, by = 16))
) +
theme(axis.text.x = element_text(
size = 10, angle = 45, colour = "black",
vjust = 1, hjust = 1
))

## 37.6 list_column

library(modelr)
d1 <- d %>%
pivot_longer(
cols = -Olympic_year,
names_to = "sex",
values_to = "winning_time"
)

fit_model <- function(df) lm(winning_time ~ Olympic_year, data = df)

d2 <- d1 %>%
group_nest(sex) %>%
mutate(
mod = map(data, fit_model)
)
d2
## # A tibble: 2 x 3
##   sex                       data mod
##   <chr>       <list<tbl_df[,2]>> <list>
## 1 Men_score             [27 x 2] <lm>
## 2 Women_score           [27 x 2] <lm>
# d2 %>% mutate(p = list(grid, grid))
# d3 <- d2 %>% mutate(p = list(grid, grid))
# d3
# d3 %>%
#   mutate(
#     predictions = map2(p, mod, add_predictions),
#   )

# or
tb4 <- d2 %>%
mutate(
predictions = map(mod, ~ add_predictions(grid, .))
) %>%
select(sex, predictions) %>%
unnest(predictions)

tb4 %>%
ggplot(aes(
x = Olympic_year,
y = pred,
group = sex,
color = sex
)) +
geom_point() +
geom_line(size = 2) +
geom_point(
data = d1,
aes(
x = Olympic_year,
y = winning_time,
group = sex,
color = sex
)
) +
scale_x_continuous(
breaks = seq(1900, 2252, by = 16),
labels = as.character(seq(1900, 2252, by = 16))
) +
theme(axis.text.x = element_text(
size = 10, angle = 45, colour = "black",
vjust = 1, hjust = 1
))

## 37.7 课后作业

• 探索数据，建立身高体重的线性模型