# 第 30 章 多层线性模型

## 30.2 案例

create_data <- function() {
df <- tibble(
ids = 1:100,
department = rep(c("sociology", "biology", "english", "informatics", "statistics"), 20),
bases = rep(c(40000, 50000, 60000, 70000, 80000), 20) * runif(100, .9, 1.1),
experience = floor(runif(100, 0, 10)),
raises = rep(c(2000, 500, 500, 1700, 500), 20) * runif(100, .9, 1.1)
)

df <- df %>% mutate(
salary = bases + experience * raises
)
df
}
library(tidyverse)
library(lme4)
library(modelr)
library(broom)
library(broom.mixed)

df <- create_data()
df
## # A tibble: 100 x 6
##      ids department   bases experience raises salary
##    <int> <chr>        <dbl>      <dbl>  <dbl>  <dbl>
##  1     1 sociology   38729.          4  1886. 46273.
##  2     2 biology     47922.          2   472. 48866.
##  3     3 english     54097.          9   513. 58710.
##  4     4 informatics 68611.          0  1724. 68611.
##  5     5 statistics  74399.          2   480. 75359.
##  6     6 sociology   41570.          6  1811. 52439.
##  7     7 biology     45737.          1   459. 46197.
##  8     8 english     56917.          1   512. 57429.
##  9     9 informatics 70903.          0  1749. 70903.
## 10    10 statistics  82830.          1   530. 83360.
## # ... with 90 more rows

## 30.3 线性模型

$\hat{y} = \alpha + \beta_1x_1 + ... + \beta_nx_n$

# Model without respect to grouping
m1 <- lm(salary ~ experience, data = df)
m1
##
## Call:
## lm(formula = salary ~ experience, data = df)
##
## Coefficients:
## (Intercept)   experience
##       59861         1107
broom::tidy(m1)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   59861.     2522.     23.7  2.02e-42
## 2 experience     1107.      498.      2.22 2.85e- 2
df %>% modelr::add_predictions(m1)
## # A tibble: 100 x 7
##      ids department  bases experience raises salary
##    <int> <chr>       <dbl>      <dbl>  <dbl>  <dbl>
##  1     1 sociology  38729.          4  1886. 46273.
##  2     2 biology    47922.          2   472. 48866.
##  3     3 english    54097.          9   513. 58710.
##  4     4 informati~ 68611.          0  1724. 68611.
##  5     5 statistics 74399.          2   480. 75359.
##  6     6 sociology  41570.          6  1811. 52439.
##  7     7 biology    45737.          1   459. 46197.
##  8     8 english    56917.          1   512. 57429.
##  9     9 informati~ 70903.          0  1749. 70903.
## 10    10 statistics 82830.          1   530. 83360.
## # ... with 90 more rows, and 1 more variable:
## #   pred <dbl>
# Model without respect to grouping
df %>%
ggplot(aes(x = experience, y = salary)) +
geom_point() +
geom_line(aes(x = experience, y = pred)) +
labs(x = "Experience", y = "Predicted Salary") +
ggtitle("linear model Salary Prediction") +
scale_colour_discrete("Department")

## 30.4 变化的截距

$\hat{y_i} = \alpha_{j[i]} + \beta x_i$

# Model with varying intercept
m2 <- lmer(salary ~ experience + (1 | department), data = df)
m2
## Linear mixed model fit by REML ['lmerMod']
## Formula: salary ~ experience + (1 | department)
##    Data: df
## REML criterion at convergence: 1958
## Random effects:
##  Groups     Name        Std.Dev.
##  department (Intercept) 14532
##  Residual                4475
## Number of obs: 100, groups:  department, 5
## Fixed Effects:
## (Intercept)   experience
##       59506         1191
broom.mixed::tidy(m2, effects = "fixed")
## # A tibble: 2 x 5
##   effect term        estimate std.error statistic
##   <chr>  <chr>          <dbl>     <dbl>     <dbl>
## 1 fixed  (Intercept)   59506.     6552.      9.08
## 2 fixed  experience     1191.      167.      7.13
broom.mixed::tidy(m2, effects = "ran_vals")
## # A tibble: 5 x 6
##   effect   group   level    term     estimate std.error
##   <chr>    <chr>   <chr>    <chr>       <dbl>     <dbl>
## 1 ran_vals depart~ biology  (Interc~  -11998.      998.
## 2 ran_vals depart~ english  (Interc~   -3007.      998.
## 3 ran_vals depart~ informa~ (Interc~   13607.      998.
## 4 ran_vals depart~ sociolo~ (Interc~  -15137.      998.
## 5 ran_vals depart~ statist~ (Interc~   16535.      998.
df %>%
ggplot(aes(
x = experience, y = salary, group = department,
colour = department
)) +
geom_point() +
geom_line(aes(x = experience, y = pred)) +
labs(x = "Experience", y = "Predicted Salary") +
ggtitle("Varying Intercept Salary Prediction") +
scale_colour_discrete("Department")

## 30.5 变化的斜率

$\hat{y_i} = \alpha + \beta_{j[i]}x_i$

# Model with varying slope
m3 <- lmer(salary ~ experience + (0 + experience | department), data = df)
m3
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## salary ~ experience + (0 + experience | department)
##    Data: df
## REML criterion at convergence: 2059
## Random effects:
##  Groups     Name       Std.Dev.
##  department experience 2678
##  Residual              7667
## Number of obs: 100, groups:  department, 5
## Fixed Effects:
## (Intercept)   experience
##       59229         1246
broom.mixed::tidy(m3, effects = "fixed")
## # A tibble: 2 x 5
##   effect term        estimate std.error statistic
##   <chr>  <chr>          <dbl>     <dbl>     <dbl>
## 1 fixed  (Intercept)   59229.     1420.     41.7
## 2 fixed  experience     1246.     1232.      1.01
broom.mixed::tidy(m3, effects = "ran_vals")
## # A tibble: 5 x 6
##   effect   group    level    term    estimate std.error
##   <chr>    <chr>    <chr>    <chr>      <dbl>     <dbl>
## 1 ran_vals departm~ biology  experi~   -2799.      441.
## 2 ran_vals departm~ english  experi~    -714.      306.
## 3 ran_vals departm~ informa~ experi~    2938.      379.
## 4 ran_vals departm~ sociolo~ experi~   -2062.      288.
## 5 ran_vals departm~ statist~ experi~    2637.      325.
df %>%
ggplot(aes(
x = experience, y = salary, group = department,
colour = department
)) +
geom_point() +
geom_line(aes(x = experience, y = pred)) +
labs(x = "Experience", y = "Predicted Salary") +
ggtitle("Varying slope Salary Prediction") +
scale_colour_discrete("Department")

## 30.6 变化的斜率 + 变化的截距

$\hat{y_i} = \alpha_{j[i]} + \beta_{j[i]}x_i$ 具体来说，教师$$i$$，所在的学院$$j$$, 他的入职的起始收入表示为 ($$\alpha_{j[i]}$$)，年度增长率表示为($$\beta_{j[i]}$$).

# Model with varying slope and intercept
m4 <- lmer(salary ~ experience + (1 + experience | department), data = df)
m4
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## salary ~ experience + (1 + experience | department)
##    Data: df
## REML criterion at convergence: 1930
## Random effects:
##  Groups     Name        Std.Dev. Corr
##  department (Intercept) 14651
##             experience   1054    -0.04
##  Residual                3645
## Number of obs: 100, groups:  department, 5
## Fixed Effects:
## (Intercept)   experience
##       59647         1153
broom.mixed::tidy(m4, effects = "fixed")
## # A tibble: 2 x 5
##   effect term        estimate std.error statistic
##   <chr>  <chr>          <dbl>     <dbl>     <dbl>
## 1 fixed  (Intercept)   59647.     6588.      9.05
## 2 fixed  experience     1153.      492.      2.35
broom.mixed::tidy(m4, effects = "ran_vals")
## # A tibble: 10 x 6
##    effect  group    level    term    estimate std.error
##    <chr>   <chr>    <chr>    <chr>      <dbl>     <dbl>
##  1 ran_va~ departm~ biology  (Inter~   -9402.     1332.
##  2 ran_va~ departm~ english  (Inter~     558.     1442.
##  3 ran_va~ departm~ informa~ (Inter~    8357.     1334.
##  4 ran_va~ departm~ sociolo~ (Inter~  -18347.     1603.
##  5 ran_va~ departm~ statist~ (Inter~   18833.     1693.
##  6 ran_va~ departm~ biology  experi~    -853.      341.
##  7 ran_va~ departm~ english  experi~    -762.      257.
##  8 ran_va~ departm~ informa~ experi~    1467.      294.
##  9 ran_va~ departm~ sociolo~ experi~     631.      269.
## 10 ran_va~ departm~ statist~ experi~    -482.      320.
df %>%
ggplot(aes(
x = experience, y = salary, group = department,
colour = department
)) +
geom_point() +
geom_line(aes(x = experience, y = pred)) +
labs(x = "Experience", y = "Predicted Salary") +
ggtitle("Varying Intercept and Slopes Salary Prediction") +
scale_colour_discrete("Department")

## 30.7 信息池

### 30.7.1 提问

df %>%
group_by(department) %>%
group_modify(
~ broom::tidy(lm(salary ~ 1 + experience, data = .))
)
## # A tibble: 10 x 6
## # Groups:   department [5]
##    department term  estimate std.error statistic
##    <chr>      <chr>    <dbl>     <dbl>     <dbl>
##  1 biology    (Int~   50477.     1144.    44.1
##  2 biology    expe~     215.      298.     0.721
##  3 english    (Int~   60431.     1446.    41.8
##  4 english    expe~     342.      259.     1.32
##  5 informati~ (Int~   67628.     1431.    47.3
##  6 informati~ expe~    2732.      320.     8.54
##  7 sociology  (Int~   40860.     1196.    34.2
##  8 sociology  expe~    1859.      202.     9.20
##  9 statistics (Int~   78958.     2342.    33.7
## 10 statistics expe~     581.      447.     1.30
## # ... with 1 more variable: p.value <dbl>

### 30.7.2 信息共享

• 完全共享
broom::tidy(m1)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   59861.     2522.     23.7  2.02e-42
## 2 experience     1107.      498.      2.22 2.85e- 2
complete_pooling <-
broom::tidy(m1) %>%
dplyr::select(term, estimate) %>%
tidyr::pivot_wider(
names_from = term,
values_from = estimate
) %>%
dplyr::rename(Intercept = (Intercept), slope = experience) %>%
dplyr::mutate(pooled = "complete_pool") %>%
dplyr::select(pooled, Intercept, slope)

complete_pooling
## # A tibble: 1 x 3
##   pooled        Intercept slope
##   <chr>             <dbl> <dbl>
## 1 complete_pool    59861. 1107.
• 部分共享
fix_effect <- broom.mixed::tidy(m4, effects = "fixed")
fix_effect
fix_effect$estimate[1] fix_effect$estimate[2]
var_effect <- broom.mixed::tidy(m4, effects = "ran_vals")
var_effect
# random effects plus fixed effect parameters
partial_pooling <- var_effect %>%
dplyr::select(level, term, estimate) %>%
tidyr::pivot_wider(
names_from = term,
values_from = estimate
) %>%
dplyr::rename(Intercept = (Intercept), estimate = experience) %>%
dplyr::mutate(
Intercept = Intercept + fix_effect$estimate[1], estimate = estimate + fix_effect$estimate[2]
) %>%
dplyr::mutate(pool = "partial_pool") %>%
dplyr::select(pool, level, Intercept, estimate)

partial_pooling
partial_pooling <-
coef(m4)\$department %>%
tibble::rownames_to_column() %>%
dplyr::rename(level = rowname, Intercept = (Intercept), slope = experience) %>%
dplyr::mutate(pooled = "partial_pool") %>%
dplyr::select(pooled, level, Intercept, slope)

partial_pooling
##         pooled       level Intercept  slope
## 1 partial_pool     biology     50245  299.6
## 2 partial_pool     english     60205  390.8
## 3 partial_pool informatics     68003 2620.3
## 4 partial_pool   sociology     41300 1783.9
## 5 partial_pool  statistics     78480  670.9
• 不共享
no_pool <- df %>%
dplyr::group_by(department) %>%
dplyr::group_modify(
~ broom::tidy(lm(salary ~ 1 + experience, data = .))
)
no_pool
## # A tibble: 10 x 6
## # Groups:   department [5]
##    department term  estimate std.error statistic
##    <chr>      <chr>    <dbl>     <dbl>     <dbl>
##  1 biology    (Int~   50477.     1144.    44.1
##  2 biology    expe~     215.      298.     0.721
##  3 english    (Int~   60431.     1446.    41.8
##  4 english    expe~     342.      259.     1.32
##  5 informati~ (Int~   67628.     1431.    47.3
##  6 informati~ expe~    2732.      320.     8.54
##  7 sociology  (Int~   40860.     1196.    34.2
##  8 sociology  expe~    1859.      202.     9.20
##  9 statistics (Int~   78958.     2342.    33.7
## 10 statistics expe~     581.      447.     1.30
## # ... with 1 more variable: p.value <dbl>
un_pooling <- no_pool %>%
dplyr::select(department, term, estimate) %>%
tidyr::pivot_wider(
names_from = term,
values_from = estimate
) %>%
dplyr::rename(Intercept = (Intercept), slope = experience) %>%
dplyr::mutate(pooled = "no_pool") %>%
dplyr::select(pooled, level = department, Intercept, slope)

un_pooling
## # A tibble: 5 x 4
## # Groups:   level [5]
##   pooled  level       Intercept slope
##   <chr>   <chr>           <dbl> <dbl>
## 1 no_pool biology        50477.  215.
## 2 no_pool english        60431.  342.
## 3 no_pool informatics    67628. 2732.
## 4 no_pool sociology      40860. 1859.
## 5 no_pool statistics     78958.  581.

### 30.7.3 可视化

library(ggrepel)

un_pooling %>%
dplyr::bind_rows(partial_pooling) %>%
ggplot(aes(x = Intercept, y = slope)) +
purrr::map(
c(seq(from = 0.1, to = 0.9, by = 0.1)),
.f = function(level) {
stat_ellipse(
geom = "polygon", type = "norm",
size = 0, alpha = 1 / 10, fill = "gray10",
level = level
)
}
) +
geom_point(aes(group = pooled, color = pooled)) +
geom_line(aes(group = level), size = 1 / 4) +
# geom_point(data = complete_pooling, size = 4, color = "red") +
geom_text_repel(
data = . %>% filter(pooled == "no_pool"),
aes(label = level)
) +
scale_color_manual(
name = "信息池",
values = c(
"no_pool" = "black",
"partial_pool" = "red" # ,
# "complete_pool" = "#A65141"
),
labels = c(
"no_pool" = "不共享",
"partial_pool" = "部分共享" # ,
# "complete_pool" = "完全共享"
)
) #+

# theme_classic()

## 30.8 更多

• 解释模型的含义
lmer(salary ~ 1 + (0 + experience | department), data = df)
# vs
lmer(salary ~ 1 + experience + (0 + experience | department), data = df)
lmer(salary ~ 1 + (1 + experience | department), data = df)
## Linear mixed model fit by REML ['lmerMod']
## Formula: salary ~ 1 + (1 + experience | department)
##    Data: df
## REML criterion at convergence: 1948
## Random effects:
##  Groups     Name        Std.Dev. Corr
##  department (Intercept) 14706
##             experience   1483    -0.07
##  Residual                3644
## Number of obs: 100, groups:  department, 5
## Fixed Effects:
## (Intercept)
##       60573
# vs
lmer(salary ~ 1 + (1 | department) + (0 + experience | department), data = df)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## salary ~ 1 + (1 | department) + (0 + experience | department)
##    Data: df
## REML criterion at convergence: 1948
## Random effects:
##  Groups       Name        Std.Dev.
##  department   (Intercept) 14671
##  department.1 experience   1483
##  Residual                  3644
## Number of obs: 100, groups:  department, 5
## Fixed Effects:
## (Intercept)
##       59854
• 课后阅读文献，读完后大家一起分享