# 第 69 章 贝叶斯混合模型

library(tidyverse)
library(tidybayes)
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

## 69.1 僧侣抄经书

rbinom(n = 100, size = 5, prob = 0.2)
##   [1] 1 2 3 2 2 1 0 1 2 2 1 0 1 0 2 0 2 1 1 3 3 2 0 0 0 0 0 2 1 1 2 0 1 1 1 0 1
##  [38] 0 1 0 0 0 2 2 2 0 1 3 1 0 3 1 0 1 3 2 0 1 2 1 1 2 2 2 1 1 0 2 3 0 0 0 1 2
##  [75] 1 2 0 1 1 2 1 1 1 1 3 1 2 1 2 1 0 0 1 0 2 1 1 3 3 0
• n = 100，表示试验100次，生成的向量会有100个元素
• size = 5， 这里有5个硬币(有正反面)
• prob = 0.2，每个硬币朝正面的概率是0.2

$\text{Poisson}(n|\lambda) = \frac{1}{n!} \, \lambda^n \, \exp(-\lambda).$

## 69.3 混合模型

• 每天喝酒没抄经书
• 工作但没有完成任何经书

• 观测到 $$y = 0$$ 的概率 \begin{align*} \text{Pr}(0 | p, \lambda) & = \text{Pr}(\text{drink} | p) + \text{Pr}(\text{work} | p) \times \text{Pr}(0 | \lambda) \\ & = p + (1 - p) \; \exp (- \lambda). \end{align*}

• 观测到 $$y > 0$$ 对应的似然函数

\begin{align*} \text{Pr}(y | p, \lambda) & = \text{Pr}(\text{drink} | p) (0) + \text{Pr}(\text{work} | p) \text{Pr}(y | \lambda) \\ & = (1 - p) \frac {\lambda^y \; \exp (- \lambda)}{y!} \end{align*}

### 69.3.1 zero-inflated Poisson模型的数学表达式

\begin{align*} y_i & \sim \operatorname{ZIPoisson}(p_i, \lambda_i)\\ \operatorname{logit}(p_i) & = \alpha_p + \beta_p x_i \\ \log (\lambda_i) & = \alpha_\lambda + \beta_\lambda x_i. \end{align*}

# define parameters
prob_drink <- 0.2  # 20% of days
rate_work  <- 1    # average 1 manuscript per day

# sample one year of production
n <- 365
# simulate days monks drink
set.seed(11)
drink <- rbinom(n, 1, prob_drink)
# simulate manuscripts completed
y <- (1 - drink) * rpois(n, rate_work)

library(ggthemes)

d <-
tibble(Y = y) %>%
arrange(Y) %>%
mutate(zeros = c(rep("zeros_drink", times = sum(drink)),
rep("zeros_work",  times = sum(y == 0 & drink == 0)),
rep("nope",        times = n - sum(y == 0)))
)

d %>%
ggplot(aes(x = Y)) +
geom_histogram(aes(fill = zeros),
binwidth = 1, size = 1/10, color = "grey92") +
scale_fill_manual(values = c(canva_pal("Green fields")(4)[1],
canva_pal("Green fields")(4)[2],
canva_pal("Green fields")(4)[1])) +
xlab("Manuscripts completed") +
theme_hc() +
theme(legend.position = "none")
## Warning: Using size aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use linewidth instead.
## This warning is displayed once every 8 hours.
## Call lifecycle::last_lifecycle_warnings() to see where this warning was
## generated.

### 69.3.2 Stan 代码

stan_program <- "
data {
int n;
int y[n];
}
parameters {
real ap;
real al;
}
model {
real p;
real lambda;
p = inv_logit(ap);
lambda = exp(al);

ap ~ normal(-1.5, 1);
al ~ normal(1, .5);

for ( i in 1:n ) {
if (y[i] == 0)
target += log_mix( p , 0 , poisson_lpmf(0 | lambda) );
if (y[i] > 0)
target += log1m( p ) + poisson_lpmf(y[i] | lambda );
}
}
"

stan_data <- list(
n = length(y),
y = y
)

fit <- stan(model_code = stan_program, data = stan_data)
summary(fit, c('al', 'ap'))\$summary
##           mean     se_mean         sd        2.5%         25%        50%
## al  0.09788969 0.002230667 0.08366158 -0.07059784  0.04401349  0.1009258
## ap -1.26929212 0.008606286 0.32480816 -2.02371056 -1.45499581 -1.2292388
##           75%      97.5%    n_eff     Rhat
## al  0.1557251  0.2545978 1406.639 1.002628
## ap -1.0464590 -0.7282135 1424.368 1.001551

exp(0.09780912)
## [1] 1.102752
inv_logit <- function(x) {
exp(x) / (1 + exp(x))
}

inv_logit(-1.25875323)
## [1] 0.2211886

fit %>%
tidybayes::gather_draws(ap) %>%
tidybayes::mean_qi(.width = .89) %>%
ungroup()
## # A tibble: 1 × 7
##   .variable .value .lower .upper .width .point .interval
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>
## 1 ap         -1.27  -1.83 -0.822   0.89 mean   qi