# 第 70 章 贝叶斯分类模型

library(tidyverse)
library(tidybayes)
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

## 70.1 数据

# Load the career-choice data: observations of family income and a
# career outcome coded as integers (1/2/3).
df <- here::here("demo_data", "career.rds") %>%
  readr::read_rds()
df
## # A tibble: 500 × 2
##   family_income career
##           <dbl>  <int>
## 1         0.696      3
## 2         0.296      3
## 3         0.104      3
## 4         0.633      3
## 5         0.270      2
## 6         0.536      3
## # … with 494 more rows

career = 3 为基线（baseline），我们要估计下面公式中的四个参数：

\begin{align*}
\log\left(\frac{P(\text{career}=1)}{P(\text{career}=3)}\right) &= \alpha_{1} + \beta_{1}\,\text{income} \\
\log\left(\frac{P(\text{career}=2)}{P(\text{career}=3)}\right) &= \alpha_{2} + \beta_{2}\,\text{income}
\end{align*}

# Frequentist benchmark with nnet::multinom().  Reversing the factor
# levels makes career = 3 the reference category, matching the
# parameterization used in the Stan models below.
df %>%
  dplyr::mutate(
    career = forcats::fct_rev(forcats::as_factor(career))
  ) %>%
  nnet::multinom(formula = career ~ family_income, data = .)
## # weights:  9 (4 variable)
## initial  value 549.306144
## iter  10 value 338.904780
## final  value 337.833351
## converged
## Call:
## nnet::multinom(formula = career ~ family_income, data = .)
##
## Coefficients:
##   (Intercept) family_income
## 2     -0.3915        -1.844
## 1     -0.9066        -4.162
##
## Residual Deviance: 675.7
## AIC: 683.7

## 70.2 stan for multi-logit Regression

### 70.2.1 stan 1

# Multinomial logit, hand-rolled linear predictor per observation.
# Improvements over the draft: the data block now declares bounds so
# Stan validates the inputs (e.g. career must lie in 1..K), and the
# likelihood uses categorical_logit(), which applies the softmax
# internally on the log scale and is more numerically stable than
# softmax() followed by categorical().
stan_program <- "
data {
  int<lower=1> N;                   // number of observations
  int<lower=2> K;                   // number of outcome categories
  int<lower=1, upper=K> career[N];  // outcome, coded 1..K (K = baseline)
  real family_income[N];            // predictor
}
parameters {
  vector[K-1] a;                    // intercepts; category K pinned to 0
  vector[K-1] b;                    // slopes on family income
}
model {
  vector[K] s;
  a ~ normal(0, 5);
  b ~ normal(0, 5);
  for (i in 1:N) {
    for (j in 1:(K-1)) s[j] = a[j] + b[j] * family_income[i];
    s[K] = 0;                       // identifiability: baseline score fixed at 0
    career[i] ~ categorical_logit(s);
  }
}
"

# Assemble the data list expected by the Stan program, then sample.
stan_data <- list(
  N             = nrow(df),
  K             = 3L,
  career        = df$career,
  family_income = df$family_income
)

m1 <- stan(model_code = stan_program, data = stan_data)
m1
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
##         mean se_mean   sd    2.5%     25%     50%
## a[1]   -0.94    0.01 0.32   -1.59   -1.16   -0.94
## a[2]   -0.40    0.00 0.21   -0.82   -0.54   -0.39
## b[1]   -4.13    0.02 0.88   -5.96   -4.70   -4.10
## b[2]   -1.84    0.01 0.42   -2.68   -2.12   -1.84
## lp__ -340.22    0.03 1.37 -343.65 -340.91 -339.91
##          75%   97.5% n_eff Rhat
## a[1]   -0.72   -0.33  1955    1
## a[2]   -0.25    0.01  1962    1
## b[1]   -3.53   -2.49  1730    1
## b[2]   -1.57   -1.04  1842    1
## lp__ -339.21 -338.49  1631    1
##
## Samples were drawn using NUTS(diag_e) at Sun May  8 10:12:14 2022.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).

### 70.2.2 stan 2

# Matrix parameterization of the same multinomial logit: a design
# matrix x (N x D) multiplies a D x K coefficient matrix beta whose
# last column is pinned to zero via append_col(), so category K is the
# reference — equivalent to the a/b formulation in stan 1 above.
stan_program <- "
data {
int<lower = 2> K;
int<lower = 0> N;
int<lower = 1> D;
int<lower = 1, upper = K> y[N];
matrix[N, D] x;
}
transformed data {
vector[D] zeros = rep_vector(0, D);
}
parameters {
matrix[D, K - 1] beta_raw;
}
transformed parameters {
matrix[D, K] beta;
beta = append_col(beta_raw, zeros);
}
model {
matrix[N, K] x_beta = x * beta;

to_vector(beta_raw) ~ normal(0, 5);

for (n in 1:N)
y[n] ~ categorical_logit(to_vector(x_beta[n]));
}
"

# Data for the matrix parameterization.
# Fix: the draft had `df\$career` — the stray backslash is a syntax
# error in R; the correct extraction is `df$career`.
stan_data <- list(
  N = nrow(df),
  K = 3,
  D = 2,                                         # intercept + family_income
  y = df$career,
  x = model.matrix(~ 1 + family_income, data = df)
)

m2 <- stan(model_code = stan_program, data = stan_data)
m2
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
##                  mean se_mean   sd    2.5%     25%
## beta_raw[1,1]   -0.95    0.01 0.32   -1.59   -1.17
## beta_raw[1,2]   -0.41    0.00 0.21   -0.83   -0.55
## beta_raw[2,1]   -4.12    0.02 0.87   -5.83   -4.70
## beta_raw[2,2]   -1.83    0.01 0.41   -2.67   -2.11
## beta[1,1]       -0.95    0.01 0.32   -1.59   -1.17
## beta[1,2]       -0.41    0.00 0.21   -0.83   -0.55
## beta[1,3]        0.00     NaN 0.00    0.00    0.00
## beta[2,1]       -4.12    0.02 0.87   -5.83   -4.70
## beta[2,2]       -1.83    0.01 0.41   -2.67   -2.11
## beta[2,3]        0.00     NaN 0.00    0.00    0.00
## lp__          -340.24    0.03 1.38 -343.71 -340.95
##                   50%     75%   97.5% n_eff Rhat
## beta_raw[1,1]   -0.94   -0.74   -0.35  2022    1
## beta_raw[1,2]   -0.41   -0.27    0.00  2093    1
## beta_raw[2,1]   -4.09   -3.54   -2.48  1951    1
## beta_raw[2,2]   -1.83   -1.54   -1.03  2044    1
## beta[1,1]       -0.94   -0.74   -0.35  2022    1
## beta[1,2]       -0.41   -0.27    0.00  2093    1
## beta[1,3]        0.00    0.00    0.00   NaN  NaN
## beta[2,1]       -4.09   -3.54   -2.48  1951    1
## beta[2,2]       -1.83   -1.54   -1.03  2044    1
## beta[2,3]        0.00    0.00    0.00   NaN  NaN
## lp__          -339.93 -339.22 -338.52  1723    1
##
## Samples were drawn using NUTS(diag_e) at Sun May  8 10:13:42 2022.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).