# 第 69 章 贝叶斯混合模型

library(tidyverse)
library(tidybayes)
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

## 69.1 僧侣抄经书

rbinom(n = 100, size = 5, prob = 0.2)
##   [1] 0 3 1 0 1 2 0 1 2 0 1 0 2 1 0 1 0 1 1 2 1 2 0 1 2
##  [26] 1 2 2 1 1 0 1 0 0 2 0 1 0 2 2 0 0 0 1 2 1 2 4 1 2
##  [51] 1 0 1 2 0 0 1 1 0 0 0 1 1 2 0 2 1 2 0 1 2 2 1 0 1
##  [76] 1 2 2 1 1 1 1 0 0 1 3 1 1 1 2 0 1 0 0 0 0 0 0 2 3
• n = 100，表示试验100次，生成的向量会有100个元素
• size = 5， 这里有5个硬币(有正反面)
• prob = 0.2，每个硬币朝正面的概率是0.2

$\text{Poisson}(n|\lambda) = \frac{1}{n!} \, \lambda^n \, \exp(-\lambda).$

## 69.3 混合模型

• 每天喝酒没抄经书
• 工作但没有完成任何经书

• 观测到 $$y = 0$$ 的概率 \begin{align*} \text{Pr}(0 | p, \lambda) & = \text{Pr}(\text{drink} | p) + \text{Pr}(\text{work} | p) \times \text{Pr}(0 | \lambda) \\ & = p + (1 - p) \; \exp (- \lambda). \end{align*}

• 观测到 $$y > 0$$ 对应的似然函数

\begin{align*} \text{Pr}(y | p, \lambda) & = \text{Pr}(\text{drink} | p) (0) + \text{Pr}(\text{work} | p) \text{Pr}(y | \lambda) \\ & = (1 - p) \frac {\lambda^y \; \exp (- \lambda)}{y!} \end{align*}

### 69.3.1 zero-inflated Poisson模型的数学表达式

\begin{align*} y_i & \sim \operatorname{ZIPoisson}(p_i, \lambda_i)\\ \operatorname{logit}(p_i) & = \alpha_p + \beta_p x_i \\ \log (\lambda_i) & = \alpha_\lambda + \beta_\lambda x_i. \end{align*}

# define parameters
prob_drink <- 0.2  # 20% of days
rate_work  <- 1    # average 1 manuscript per day

# sample one year of production
n <- 365
# simulate days monks drink
set.seed(11)
drink <- rbinom(n, 1, prob_drink)
# simulate manuscripts completed
y <- (1 - drink) * rpois(n, rate_work)

library(ggthemes)

d <-
tibble(Y = y) %>%
arrange(Y) %>%
mutate(zeros = c(rep("zeros_drink", times = sum(drink)),
rep("zeros_work",  times = sum(y == 0 & drink == 0)),
rep("nope",        times = n - sum(y == 0)))
)

d %>%
ggplot(aes(x = Y)) +
geom_histogram(aes(fill = zeros),
binwidth = 1, size = 1/10, color = "grey92") +
scale_fill_manual(values = c(canva_pal("Green fields")(4)[1],
canva_pal("Green fields")(4)[2],
canva_pal("Green fields")(4)[1])) +
xlab("Manuscripts completed") +
theme_hc() +
theme(legend.position = "none")

### 69.3.2 Stan 代码

stan_program <- "
data {
int n;
int y[n];
}
parameters {
real ap;
real al;
}
model {
real p;
real lambda;
p = inv_logit(ap);
lambda = exp(al);

ap ~ normal(-1.5, 1);
al ~ normal(1, .5);

for ( i in 1:n ) {
if (y[i] == 0)
target += log_mix( p , 0 , poisson_lpmf(0 | lambda) );
if (y[i] > 0)
target += log1m( p ) + poisson_lpmf(y[i] | lambda );
}
}
"

stan_data <- list(
n = length(y),
y = y
)

fit <- stan(model_code = stan_program, data = stan_data)
summary(fit, c('al', 'ap'))\$summary
##        mean  se_mean      sd    2.5%      25%     50%
## al  0.09789 0.002231 0.08366 -0.0706  0.04401  0.1009
## ap -1.26929 0.008606 0.32481 -2.0237 -1.45500 -1.2292
##        75%   97.5% n_eff  Rhat
## al  0.1557  0.2546  1407 1.003
## ap -1.0465 -0.7282  1424 1.002

exp(0.09780912)
## [1] 1.103
inv_logit <- function(x) {
exp(x) / (1 + exp(x))
}

inv_logit(-1.25875323)
## [1] 0.2212

fit %>%
tidybayes::gather_draws(ap) %>%
tidybayes::mean_qi(.width = .89) %>%
ungroup()
## # A tibble: 1 × 7
##   .variable .value .lower .upper .width .point
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl> <chr>
## 1 ap         -1.27  -1.83 -0.822   0.89 mean
## # … with 1 more variable: .interval <chr>