# 第 58 章 logistic回归模型

## 58.1 问题

library(tidyverse)
gredata
## # A tibble: 400 × 4
##    <dbl> <dbl> <dbl> <dbl>
##  1     0   380  3.61     3
##  2     1   660  3.67     3
##  3     1   800  4        1
##  4     1   640  3.19     4
##  5     0   520  2.93     4
##  6     1   760  3        2
##  7     1   560  2.98     1
##  8     0   400  3.08     2
##  9     1   540  3.39     3
## 10     0   700  3.92     2
## # ℹ 390 more rows

\begin{align*} \text{admit}_{i} &\sim \mathrm{Binomial}(1, p_{i}) \\ \text{logit}(p_{i}) &= \log\Big(\frac{p_{i}}{1 - p_{i}}\Big) = \alpha + \beta \cdot \text{gre}_{i} \\ \text{equivalent to,} \quad p_{i} &= \frac{1}{1 + \exp[- (\alpha + \beta \cdot \text{gre}_{i})]} \\ & = \frac{\exp(\alpha + \beta \cdot \text{gre}_{i})}{1 + \exp (\alpha + \beta \cdot\text{gre}_{i})} \\ \end{align*}

model_logit <- glm(admit ~ gre,
data = gredata,
)

summary(model_logit)
##
## Call:
##     data = gredata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.1623  -0.9052  -0.7547   1.3486   1.9879
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.901344   0.606038  -4.787 1.69e-06 ***
## gre          0.003582   0.000986   3.633  0.00028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 486.06  on 398  degrees of freedom
## AIC: 490.06
##
## Number of Fisher Scoring iterations: 4

coef(model_logit)[2]
##         gre
## 0.003582212

## 58.2 模型的输出

num scale formula
1 The log-odds scale $$log \frac{p}{1 - p}$$
2 The odds scale $$\frac{p}{1 -p}$$
3 The probability scale $$p$$

### 58.2.1 The log-odds scale

logit_log_odds <- broom::augment_columns(
model_logit,
data = gredata,
) %>%
rename(log_odds = .fitted) 
library(latex2exp)
## Warning: package 'latex2exp' was built under R version 4.2.2
logit_log_odds %>%
ggplot(aes(x = gre, y = log_odds)) +
geom_path(color = "#771C6D", size = 2) +
labs(title = "Log odds",
subtitle = "This is linear!",
x = NULL,
y = TeX("$log \\frac{p}{1 - p}$")) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold"),
axis.title.y = element_text(angle = 90)
)
## Warning: Using size aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use linewidth instead.
## This warning is displayed once every 8 hours.
## Call lifecycle::last_lifecycle_warnings() to see where this warning was
## generated.

### 58.2.2 The odds scale

exp(0.003582)
## [1] 1.003588

1.003588的含义是: GRE考试成绩每增加1个单位，那么odds(录取概率)就会增大1.003588倍；若增加2个单位，那么odds(录取概率)就会增大(1.003588 * 1.003588)倍，也就说是个乘法关系。

(exp(0.003582) - 1) x 100% = (1.003588 - 1) x 100% = 0.36%

logit_odds <- broom::augment_columns(
model_logit,
data = gredata,
) %>%
rename(log_odds = .fitted) %>%
mutate(odds_ratio = exp(log_odds))
logit_odds %>%
ggplot(aes(x = gre, y = odds_ratio)) +
geom_line(color = "#FB9E07", size = 2) +
labs(title = "Odds",
subtitle = "This is curvy, but it's a mathy transformation of a linear value",
x = NULL,
y = TeX("$\\frac{p}{1 - p}$")) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold"),
axis.title.y = element_text(angle = 90)
)

### 58.2.3 The probability scale.

logit_probs <- broom::augment_columns(
model_logit,
data = gredata,
type.predict = c("response")
) %>%
rename(pred_prob = .fitted)
logit_probs %>%
ggplot(aes(x = gre, y = pred_prob)) +
#geom_point(aes(x = gre, y = admit)) +
geom_line(color = "#CF4446", size = 2) +
labs(title = "Predicted probabilities",
sutitle = "Plug values of X into ",
x = "X (value of explanatory variable)",
y = TeX("\\hat{P(Y)} ")) +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))

• GRE分数200分左右，录取概率约0.1；
• GRE分数500分左右，录取概率约0.25；
• GRE分数800分左右，录取概率接近0.5；

• log_odds scale，预测因子与log_odds()的关系是一个固定值，具有可加性
• odds scale， 预测因子与odds()的关系是一个固定值，具有乘法性
• probability，预测因子与probability的关系不再是一个固定值了

## 58.3 预测

### 58.3.1 预测与拟合

gredata %>%
mutate(
pred = predict(model_logit),
fitted = fitted(model_logit)
)
## # A tibble: 400 × 6
##    admit   gre   gpa  rank    pred fitted
##    <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl>
##  1     0   380  3.61     3 -1.54    0.177
##  2     1   660  3.67     3 -0.537   0.369
##  3     1   800  4        1 -0.0356  0.491
##  4     1   640  3.19     4 -0.609   0.352
##  5     0   520  2.93     4 -1.04    0.261
##  6     1   760  3        2 -0.179   0.455
##  7     1   560  2.98     1 -0.895   0.290
##  8     0   400  3.08     2 -1.47    0.187
##  9     1   540  3.39     3 -0.967   0.275
## 10     0   700  3.92     2 -0.394   0.403
## # ℹ 390 more rows

$p = \exp(\alpha) / (1 + \exp(\alpha) )$

gredata %>%
mutate(
pred = predict(model_logit),
fitted = fitted(model_logit),
pred2 = exp(pred) / (1 + exp(pred) )
)
## # A tibble: 400 × 7
##    admit   gre   gpa  rank    pred fitted pred2
##    <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl> <dbl>
##  1     0   380  3.61     3 -1.54    0.177 0.177
##  2     1   660  3.67     3 -0.537   0.369 0.369
##  3     1   800  4        1 -0.0356  0.491 0.491
##  4     1   640  3.19     4 -0.609   0.352 0.352
##  5     0   520  2.93     4 -1.04    0.261 0.261
##  6     1   760  3        2 -0.179   0.455 0.455
##  7     1   560  2.98     1 -0.895   0.290 0.290
##  8     0   400  3.08     2 -1.47    0.187 0.187
##  9     1   540  3.39     3 -0.967   0.275 0.275
## 10     0   700  3.92     2 -0.394   0.403 0.403
## # ℹ 390 more rows

gredata %>%
mutate(
pred = predict(model_logit, type = "response"),
fitted = fitted(model_logit)
)
## # A tibble: 400 × 6
##    admit   gre   gpa  rank  pred fitted
##    <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1     0   380  3.61     3 0.177  0.177
##  2     1   660  3.67     3 0.369  0.369
##  3     1   800  4        1 0.491  0.491
##  4     1   640  3.19     4 0.352  0.352
##  5     0   520  2.93     4 0.261  0.261
##  6     1   760  3        2 0.455  0.455
##  7     1   560  2.98     1 0.290  0.290
##  8     0   400  3.08     2 0.187  0.187
##  9     1   540  3.39     3 0.275  0.275
## 10     0   700  3.92     2 0.403  0.403
## # ℹ 390 more rows

### 58.3.2 预测

newdata <- tibble(
gre = c(550, 660, 700, 780)
)
newdata
## # A tibble: 4 × 1
##     gre
##   <dbl>
## 1   550
## 2   660
## 3   700
## 4   780

• type = "link"，预测的是log_odds，实际上就是coef(model_logit)[1] + gre * coef(model_logit)[2]
newdata %>%
mutate(
pred_log_odds = predict(model_logit, newdata = newdata, type = "link"),
#
pred_log_odds2 = coef(model_logit)[1] + gre * coef(model_logit)[2]

)
## # A tibble: 4 × 3
##     gre pred_log_odds pred_log_odds2
##   <dbl>         <dbl>          <dbl>
## 1   550        -0.931         -0.931
## 2   660        -0.537         -0.537
## 3   700        -0.394         -0.394
## 4   780        -0.107         -0.107
• type = "response""，预测的是probabilities， 实际上就是exp(pred_log_odds) / (1 + exp(pred_log_odds) )
newdata %>%
mutate(
pred_log_odds = predict(model_logit, newdata = newdata, type = "link")
) %>%
mutate(
pred_prob = predict(model_logit, newdata = newdata, type = "response"),
#
pred_prob2 = exp(pred_log_odds) / (1 + exp(pred_log_odds) )
)
## # A tibble: 4 × 4
##     gre pred_log_odds pred_prob pred_prob2
##   <dbl>         <dbl>     <dbl>      <dbl>
## 1   550        -0.931     0.283      0.283
## 2   660        -0.537     0.369      0.369
## 3   700        -0.394     0.403      0.403
## 4   780        -0.107     0.473      0.473
• type = "terms"，返回一个矩阵，具体计算为：模型的设计矩阵中心化以后，与模型返回的系数相乘而得到的新矩阵10
predict(model_logit, gredata, type = 'terms') %>%
as_tibble() %>%
head()
## # A tibble: 6 × 1
##      gre
##    <dbl>
## 1 -0.744
## 2  0.259
## 3  0.761
## 4  0.187
## 5 -0.243
## 6  0.617

X <- model.matrix(admit ~ gre, data = gredata)

X %>%
as.data.frame() %>%
mutate(
across(everything(), ~ .x - mean(.x))
) %>%
transmute(
term = coef(model_logit)[2] * gre
) %>%
head()
##         term
## 1 -0.7440254
## 2  0.2589939
## 3  0.7605035
## 4  0.1873497
## 5 -0.2425157
## 6  0.6172151