第 65 章 贝叶斯假设检验

## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'tidybayes' was built under R version 4.2.2
## Loading required package: StanHeaders
## Warning: package 'StanHeaders' was built under R version 4.2.3
## 
## rstan version 2.26.13 (Stan version 2.26.1)
## 
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)
## 
## Do not specify '-march=native' in 'LOCAL_CPPFLAGS' or a Makevars file
## 
## Attaching package: 'rstan'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
theme_set(bayesplot::theme_default())

65.1 人们会给爱情片打高分?

这是一个关于电影评分的数据。我们想看下爱情片与动作片的平均得分是否存在显著不同?

movies <- read_rds(here::here("demo_data", "movies.rds"))
movies
## # A tibble: 100 × 5
##    title                          year rating genre  genre_numeric
##    <chr>                         <int>  <dbl> <fct>          <dbl>
##  1 One of Our Spies Is Missing    1966    5.6 Action             1
##  2 Black Pirate, The              1926    7.5 Action             1
##  3 Caged Heat 3000                1995    2.9 Action             1
##  4 G.P.U.                         1942    8.8 Action             1
##  5 Goal Club                      2001    9.2 Action             1
##  6 Dead or Alive: Hanzaisha       1999    6.9 Action             1
##  7 Lost in Africa                 1994    4   Action             1
##  8 Guns of the Magnificent Seven  1969    4.6 Action             1
##  9 Kiler                          1997    6.9 Action             1
## 10 Steamboy                       2004    6.6 Action             1
## # ℹ 90 more rows

65.1.1 可视化探索

看下两种题材电影评分的分布

movies %>%
  ggplot(aes(x = genre, y = rating, color = genre)) +
  geom_boxplot() +
  geom_jitter() +
  scale_x_discrete(
    expand = expansion(mult = c(0.5, 0.5))
  ) +
  theme(legend.position = "none") 

65.1.2 计算均值差

统计两种题材电影评分的均值

group_diffs <- movies %>% 
  group_by(genre) %>% 
  summarize(avg_rating = mean(rating, na.rm = TRUE)) %>% 
  mutate(diff_means = avg_rating - lag(avg_rating))

group_diffs
## # A tibble: 2 × 3
##   genre   avg_rating diff_means
##   <fct>        <dbl>      <dbl>
## 1 Action        5.55      NA   
## 2 Romance       6.22       0.67

65.1.3 t检验

传统的t检验

t.test(
  rating ~ genre,
  data = movies,
  var.equal = FALSE
) 
## 
##  Welch Two Sample t-test
## 
## data:  rating by genre
## t = -2.2695, df = 83.737, p-value = 0.02581
## alternative hypothesis: true difference in means between group Action and group Romance is not equal to 0
## 95 percent confidence interval:
##  -1.25709829 -0.08290171
## sample estimates:
##  mean in group Action mean in group Romance 
##                  5.55                  6.22

65.2 stan 代码

65.2.1 normal分布

先假定rating评分,服从正态分布,同时不同的电影题材分组考虑

\[ \begin{aligned} \textrm{rating} & \sim \textrm{normal}(\mu_{\textrm{genre}}, \, \sigma _{\textrm{genre}}) \\ \mu &\sim \textrm{normal}(\textrm{mean_rating}, \, 2) \\ \sigma &\sim \textrm{cauchy}(0, \, 1) \end{aligned} \]

stan_program <- '
data {
  int<lower=1> N;                            
  int<lower=2> n_groups;                     
  vector[N] y;                               
  int<lower=1, upper=n_groups> group_id[N];  
}
transformed data {
  real mean_y;
  mean_y = mean(y); 
}
parameters {
  vector[2] mu;                    
  vector<lower=0>[2] sigma;        
}
model {
  mu ~ normal(mean_y, 2);
  sigma ~ cauchy(0, 1);

  for (n in 1:N){
    y[n] ~ normal(mu[group_id[n]], sigma[group_id[n]]);
  }
}

generated quantities {
  real mu_diff;
  mu_diff = mu[2] - mu[1];
}

'

stan_data <- movies %>% 
  select(genre, rating, genre_numeric) %>% 
  tidybayes::compose_data(
    N        = nrow(.), 
    n_groups = n_distinct(genre), 
    group_id = genre_numeric, 
    y        = rating
  )

stan_best_normal <- stan(model_code = stan_program, data = stan_data)
stan_best_normal
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=4000.
## 
##            mean se_mean   sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
## mu[1]      5.56    0.00 0.25   5.07   5.40   5.55   5.72   6.05  3420    1
## mu[2]      6.22    0.00 0.16   5.91   6.12   6.22   6.32   6.54  4526    1
## sigma[1]   1.78    0.00 0.18   1.47   1.65   1.76   1.89   2.18  3937    1
## sigma[2]   1.15    0.00 0.12   0.95   1.06   1.14   1.22   1.39  4259    1
## mu_diff    0.66    0.00 0.30   0.08   0.47   0.67   0.86   1.24  4013    1
## lp__     -86.82    0.03 1.44 -90.42 -87.51 -86.48 -85.75 -85.05  1783    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Jul 18 20:08:15 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
stan_best_normal %>% 
  tidybayes::spread_draws(mu_diff) %>%
  ggplot(aes(x = mu_diff)) +
  tidybayes::stat_halfeye() +
  geom_vline(xintercept = 0)
stan_best_normal %>% 
  tidybayes::spread_draws(mu_diff) %>%
  
    ggplot(aes(x = mu_diff)) +
  stat_eye(side = "right", 
           fill = "skyblue",
             point_interval = mode_hdi, 
             .width = c(0.5, 0.89),
             interval_colour = "red", 
             point_colour = "red",
             width = 15.5, 
             height = 0.1
             ) +
  geom_vline(xintercept = c(-0.1, 0.1), linetype = "dashed", size = 1) +

  coord_cartesian(xlim = c(-1, 2)) +
    labs(x = "mu_diff", y = NULL)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

65.2.2 等效检验

我们一般会采用实用等效区间 region of practical equivalence ROPE。实用等效区间,就是我们感兴趣值附近的一个区间,比如这里的均值差。频率学中的零假设是看均值差是否为0,贝叶斯则是看均值差有多少落入0附近的区间。具体方法就是,先算出后验分布的高密度区间,然后看这个高密度区间落在[-0.1, 0.1]的比例.

lower <- -0.1*sd(movies$rating)
upper <-  0.1*sd(movies$rating)

stan_best_normal %>% 
  tidybayes::spread_draws(mu_diff) %>%
  filter(
    mu_diff > ggdist::mean_hdi(mu_diff, .width = c(0.89))$ymin,
    mu_diff < ggdist::mean_hdi(mu_diff, .width = c(0.89))$ymax
  ) %>%
  summarise(
    percentage_in_rope = mean(between(mu_diff, lower, upper))
  )
## # A tibble: 1 × 1
##   percentage_in_rope
##                <dbl>
## 1                  0

在做假设检验的时候,我们内心是期望,后验概率的高密度区间落在实际等效区间的比例越小越小,如果小于2.5%,我们就可以拒绝零假设了;如果大于97.5%,我们就接受零假设。

stan_best_normal %>% 
  tidybayes::spread_draws(mu_diff) %>%
  pull(mu_diff) %>% 
  bayestestR::rope(x,
    range = c(-0.1, 0.1)*sd(movies$rating),
    ci = 0.89,
    ci_method = "HDI"
  )
## # Proportion of samples inside the ROPE [-0.15, 0.15]:
## 
## inside ROPE
## -----------
## 0.00 %

65.2.3 Student-t 分布

标准正态分布是t分布的极限分布

for (nu in c(1, seq(5, 50, by = 10))) {
 p <- tibble(x = seq(-5, 5, by=0.1)) %>% 
    ggplot(aes(x)) + 
    stat_function(fun = dnorm, color = 'gray') + 
    stat_function(fun = dt, args = list(df = nu), color = 'blue') +
    theme_classic() + 
    ylab("Density") + 
    xlab('Value') + 
    ggtitle(paste("df =", nu))
 
  print(p)
}

假定rating评分服从student-t分布,

\[ \begin{aligned} \textrm{rating} & \sim \textrm{student_t}(\nu, \,\mu_{\textrm{genre}}, \, \sigma _{\textrm{genre}}) \\ \mu &\sim \textrm{normal}(\textrm{mean_rating}, \, 2) \\ \sigma &\sim \textrm{cauchy}(0, \, 1) \\ \nu &\sim \textrm{exponential}(1.0/29) \end{aligned} \]

stan_program <- '
data {
  int<lower=1> N;                            
  int<lower=2> n_groups;                     
  vector[N] y;                               
  int<lower=1, upper=n_groups> group_id[N];  
}
transformed data {
  real mean_y;
  mean_y = mean(y); 
}
parameters {
  vector[2] mu;                    
  vector<lower=0>[2] sigma;        
  real<lower=0, upper=100> nu;     
}
model {
  mu ~ normal(mean_y, 2);
  sigma ~ cauchy(0, 1);
  nu ~ exponential(1.0/29);

  for (n in 1:N){
    y[n] ~ student_t(nu, mu[group_id[n]], sigma[group_id[n]]);
  }
}

generated quantities {
  real mu_diff;
  mu_diff = mu[2] - mu[1];
}

'

stan_data <- movies %>% 
  select(genre, rating, genre_numeric) %>% 
  tidybayes::compose_data(
    N        = nrow(.), 
    n_groups = n_distinct(genre), 
    group_id = genre_numeric, 
    y        = rating
  )

stan_best_student <- stan(model_code = stan_program, data = stan_data)
stan_best_student
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=4000.
## 
##             mean se_mean    sd    2.5%     25%     50%     75%   97.5% n_eff
## mu[1]       5.54    0.00  0.25    5.05    5.36    5.54    5.71    6.04  3803
## mu[2]       6.24    0.00  0.15    5.93    6.13    6.24    6.34    6.54  4129
## sigma[1]    1.71    0.00  0.19    1.37    1.57    1.69    1.83    2.12  4192
## sigma[2]    1.05    0.00  0.13    0.81    0.96    1.04    1.13    1.33  3362
## nu         27.73    0.43 20.68    5.02   12.11   21.10   37.86   83.38  2351
## mu_diff     0.70    0.00  0.30    0.10    0.50    0.71    0.90    1.28  3941
## lp__     -119.54    0.04  1.65 -123.61 -120.37 -119.22 -118.32 -117.39  1599
##          Rhat
## mu[1]       1
## mu[2]       1
## sigma[1]    1
## sigma[2]    1
## nu          1
## mu_diff     1
## lp__        1
## 
## Samples were drawn using NUTS(diag_e) at Tue Jul 18 20:09:15 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
stan_best_student %>% 
  tidybayes::spread_draws(mu_diff) %>%
  ggplot(aes(x = mu_diff)) +
  tidybayes::stat_halfeye() +
  geom_vline(xintercept = 0)
stan_best_student %>%
  as.data.frame() %>% 
  head()
##      mu[1]    mu[2] sigma[1]  sigma[2]       nu   mu_diff      lp__
## 1 5.118808 6.270142 1.549989 1.2949051 21.28581 1.1513343 -120.8732
## 2 5.967722 6.459934 1.790923 0.9558584 22.51517 0.4922120 -120.0670
## 3 5.950175 6.433393 1.854735 1.0242260 19.36858 0.4832173 -119.4204
## 4 5.245263 6.025418 2.052233 0.9888823 18.47944 0.7801542 -120.2714
## 5 5.464269 6.373180 1.861468 1.0654877 50.48844 0.9089110 -118.6317
## 6 5.648799 6.307200 1.775365 1.1710360 67.58613 0.6584016 -119.0480
stan_best_student %>%
  as.data.frame() %>%
  ggplot(aes(x = `mu[1]`)) +
  geom_density()
stan_best_student %>%
  tidybayes::gather_draws(mu[i], sigma[i]) %>%
  tidybayes::mean_hdi(.width = 0.89)
## # A tibble: 4 × 8
##       i .variable .value .lower .upper .width .point .interval
##   <int> <chr>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1     1 mu          5.54  5.16    5.97   0.89 mean   hdi      
## 2     1 sigma       1.71  1.41    2.00   0.89 mean   hdi      
## 3     2 mu          6.24  6.00    6.49   0.89 mean   hdi      
## 4     2 sigma       1.05  0.840   1.25   0.89 mean   hdi

65.3 小结

65.4 作业

  • 将上一章线性模型的stan代码应用到电影评分数据中

\[ \begin{aligned} \textrm{rating} & \sim \textrm{Normal}(\mu, \, \sigma) \\ \mu & = \alpha + \beta \, \textrm{genre} \\ \alpha &\sim \textrm{Normal}(0, \, 5) \\ \beta &\sim \textrm{Normal}(0, \, 1) \\ \sigma &\sim \textrm{Exponential}(1) \\ \end{aligned} \]

stan_program <- '
data {
  int<lower=1> N;            
  vector[N] y;        
  vector[N] x;           
}
parameters {
  real<lower=0> sigma;    
  real alpha;                     
  real beta;                     
}

model {
  y ~ normal(alpha + beta * x, sigma);  
  
  alpha ~ normal(0, 5);
  beta ~ normal(0, 1);
  sigma ~ exponential(1);  
}

'

stan_data <- list(
    N   = nrow(movies), 
    x   = as.numeric(movies$genre),  
    y   = movies$rating
  )

stan_linear <- stan(model_code = stan_program, data = stan_data)
stan_linear
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=4000.
## 
##         mean se_mean   sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
## sigma   1.48    0.00 0.10   1.29   1.41   1.48   1.55   1.69  1838    1
## alpha   4.93    0.01 0.46   4.04   4.63   4.93   5.23   5.86  1117    1
## beta    0.63    0.01 0.29   0.06   0.44   0.63   0.82   1.20  1127    1
## lp__  -91.23    0.04 1.23 -94.48 -91.81 -90.92 -90.30 -89.83  1062    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Jul 18 20:10:12 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
stan_linear %>% 
  tidybayes::spread_draws(beta) %>%
  ggplot(aes(x = beta)) +
  tidybayes::stat_halfeye() +
  geom_vline(xintercept = 0)
stan_linear %>%
  tidybayes::gather_draws(beta) %>%
  tidybayes::mean_hdi(.width = 0.89)
## # A tibble: 1 × 7
##   .variable .value .lower .upper .width .point .interval
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 beta       0.631  0.168   1.11   0.89 mean   hdi