第 86 章 探索性数据分析-移民缺口

86.1 引言

今天看到一张图,觉得很不错,简单清晰。

数据是公开的,因此不难找到,我是在这里图中获取。

先观察这张图想表达的意思:

  • 蓝色的是历年移民人口真实数据

  • 依据前6个点(2011年到2016年)建立线性模型,并依此预测后5个点(2016到2021年)的情况,从而得到黄色的直线

  • 预测情况与实际情况的差,得到缺口总数210万

86.2 开始

library(tidyverse)
library(modelr)

df <- tibble::tribble(
  ~year,   ~num,
  2011,  795300,
  2012,  858740,
  2013,  849730,
  2014,  945640,
  2015, 1060000,
  2016, 1065000,
  2017,  948390,
  2018,  719870,
  2019,  568540,
  2020,  477030
) %>%
  mutate(num = num / 1000000)

df
## # A tibble: 10 × 2
##     year   num
##    <dbl> <dbl>
##  1  2011 0.795
##  2  2012 0.859
##  3  2013 0.850
##  4  2014 0.946
##  5  2015 1.06 
##  6  2016 1.06 
##  7  2017 0.948
##  8  2018 0.720
##  9  2019 0.569
## 10  2020 0.477

86.2.1 线性回归模型

依据2011到2016年的数据,建立线性模型

mod <- df %>%
  filter(year <= 2016) %>%
  lm(num ~ 1 + year, data = .)

86.2.2 预测

根据线性模型预测2016年到2020的情况

pred_df <- tibble(
  year = seq(2016, 2020, by = 1)
) %>%
  modelr::add_predictions(model = mod)

pred_df
## # A tibble: 5 × 2
##    year  pred
##   <dbl> <dbl>
## 1  2016  1.08
## 2  2017  1.13
## 3  2018  1.19
## 4  2019  1.25
## 5  2020  1.31

合并成新的数据框

combine_df <- df %>%
  left_join(pred_df, by = "year")
combine_df
## # A tibble: 10 × 3
##     year   num  pred
##    <dbl> <dbl> <dbl>
##  1  2011 0.795 NA   
##  2  2012 0.859 NA   
##  3  2013 0.850 NA   
##  4  2014 0.946 NA   
##  5  2015 1.06  NA   
##  6  2016 1.06   1.08
##  7  2017 0.948  1.13
##  8  2018 0.720  1.19
##  9  2019 0.569  1.25
## 10  2020 0.477  1.31
# 一个等价的方法
df %>%
  modelr::add_predictions(model = mod) %>%
  mutate(pred = if_else(year < 2016, NA_real_, pred))

86.2.3 可视化

  • 基本绘图,画折线图和散点图
combine_df %>%
  ggplot(aes(x = year, y = num)) +
  geom_point() +
  geom_line() +
  geom_line(aes(y = pred)) +
  geom_point(aes(y = pred))
## Warning: Removed 5 rows containing missing values (`geom_line()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
  • 调整坐标和配色
mycolor <- c("#3D81E0", "#f4a261")

scales::show_col(mycolor)
combine_df %>%
  ggplot(aes(x = year, y = num)) +
  geom_point(size = 4, fill = "#3D81E0", color = "#3D81E0", shape = 23) +
  geom_line(size = 2, color = "#3D81E0") +
  geom_line(aes(y = pred), size = 2, color = "#f4a261") +
  geom_point(aes(y = pred), size = 4, fill = "#f4a261", color = "#f4a261", shape = 23) +
  labs(
    title = "Figure2: Contribution of international migration to population growth",
    subtitle = "millions per year",
    caption = "Source: Census Bureau, J.P. Morgan",
    x = NULL,
    y = NULL
  ) +
  scale_y_continuous(
    limits = c(0, 1.5),
    breaks = seq(0, 1.5, by = 0.25),
    expand = c(0, 0)
  ) +
  scale_x_continuous(
    limits = c(2011, 2021),
    breaks = seq(2011, 2021, by = 1),
    expand = c(0.1, 0)
  ) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title.position = 'plot',
    plot.caption = element_text(hjust = 0)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 5 rows containing missing values (`geom_line()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
  • 添加标注
arrows <- tibble::tribble(
  ~x1, ~y1, ~x2, ~y2, ~color,
  2016, 0.54, 2017.5, 0.75, "a",
  2017, 1.42, 2018.0, 1.25, "b"
)



combine_df %>%
  ggplot(aes(x = year, y = num)) +
  geom_point(size = 4, fill = "#3D81E0", color = "#3D81E0", shape = 23) +
  geom_line(size = 2, color = "#3D81E0") +
  geom_line(aes(y = pred), size = 2, color = "#f4a261") +
  geom_point(aes(y = pred), size = 4, fill = "#f4a261", color = "#f4a261", shape = 23) +
  geom_ribbon(
    aes(ymin = num, ymax = pred),
    fill = "orange",
    alpha = 0.2
  ) +
  geom_segment(
    data = arrows,
    aes(x = x1, y = y1, xend = x2, yend = y2, color = color),
    arrow = arrow(length = unit(0.15, "inch")), size = 1.5
  ) +
  annotate("text",
    x = c(2017, 2016, 2021), y = c(1.47, 0.5, 0.9),
    size = 6, face = "bold",
    label = c("Pre-2017 trend", "Actual", "Shortfall:\n 2.1 million")
  ) +
  labs(
    title = "Figure2: Contribution of international migration to population growth",
    subtitle = "millions per year",
    caption = "Source: Census Bureau, J.P. Morgan",
    x = NULL,
    y = NULL
  ) +
  scale_y_continuous(
    limits = c(0, 1.5),
    breaks = seq(0, 1.5, by = 0.25),
    expand = c(0, 0)
  ) +
  scale_x_continuous(
    limits = c(2011, 2021),
    breaks = seq(2011, 2021, by = 1),
    expand = c(0.1, 0)
  ) +
  scale_color_manual(
    values = c(a = "#3D81E0", b = "#f4a261")
  ) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title.position = 'plot',
    plot.caption = element_text(hjust = 0)
  )
## Warning in annotate("text", x = c(2017, 2016, 2021), y = c(1.47, 0.5, 0.9), :
## Ignoring unknown parameters: `face`
## Warning: Removed 5 rows containing missing values (`geom_line()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
  • 保存
ggsave("migration.pdf", width = 8, height = 5)