第 65 章 贝叶斯假设检验
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'tidybayes' was built under R version 4.2.2
## Loading required package: StanHeaders
## Warning: package 'StanHeaders' was built under R version 4.2.3
##
## rstan version 2.26.13 (Stan version 2.26.1)
##
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)
##
## Do not specify '-march=native' in 'LOCAL_CPPFLAGS' or a Makevars file
##
## Attaching package: 'rstan'
##
## The following object is masked from 'package:tidyr':
##
## extract
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
theme_set(bayesplot::theme_default())
65.1 人们会给爱情片打高分?
这是一个关于电影评分的数据。我们想看下爱情片与动作片的平均得分是否存在显著不同?
## # A tibble: 100 × 5
## title year rating genre genre_numeric
## <chr> <int> <dbl> <fct> <dbl>
## 1 One of Our Spies Is Missing 1966 5.6 Action 1
## 2 Black Pirate, The 1926 7.5 Action 1
## 3 Caged Heat 3000 1995 2.9 Action 1
## 4 G.P.U. 1942 8.8 Action 1
## 5 Goal Club 2001 9.2 Action 1
## 6 Dead or Alive: Hanzaisha 1999 6.9 Action 1
## 7 Lost in Africa 1994 4 Action 1
## 8 Guns of the Magnificent Seven 1969 4.6 Action 1
## 9 Kiler 1997 6.9 Action 1
## 10 Steamboy 2004 6.6 Action 1
## # ℹ 90 more rows
65.1.1 可视化探索
看下两种题材电影评分的分布
movies %>%
ggplot(aes(x = genre, y = rating, color = genre)) +
geom_boxplot() +
geom_jitter() +
scale_x_discrete(
expand = expansion(mult = c(0.5, 0.5))
) +
theme(legend.position = "none")
65.1.2 计算均值差
统计两种题材电影评分的均值
group_diffs <- movies %>%
group_by(genre) %>%
summarize(avg_rating = mean(rating, na.rm = TRUE)) %>%
mutate(diff_means = avg_rating - lag(avg_rating))
group_diffs
## # A tibble: 2 × 3
## genre avg_rating diff_means
## <fct> <dbl> <dbl>
## 1 Action 5.55 NA
## 2 Romance 6.22 0.67
65.1.3 t检验
传统的t检验
t.test(
rating ~ genre,
data = movies,
var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: rating by genre
## t = -2.2695, df = 83.737, p-value = 0.02581
## alternative hypothesis: true difference in means between group Action and group Romance is not equal to 0
## 95 percent confidence interval:
## -1.25709829 -0.08290171
## sample estimates:
## mean in group Action mean in group Romance
## 5.55 6.22
65.2 stan 代码
65.2.1 normal分布
先假定rating评分,服从正态分布,同时不同的电影题材分组考虑
\[ \begin{aligned} \textrm{rating} & \sim \textrm{normal}(\mu_{\textrm{genre}}, \, \sigma _{\textrm{genre}}) \\ \mu &\sim \textrm{normal}(\textrm{mean_rating}, \, 2) \\ \sigma &\sim \textrm{cauchy}(0, \, 1) \end{aligned} \]
stan_program <- '
data {
int<lower=1> N;
int<lower=2> n_groups;
vector[N] y;
int<lower=1, upper=n_groups> group_id[N];
}
transformed data {
real mean_y;
mean_y = mean(y);
}
parameters {
vector[2] mu;
vector<lower=0>[2] sigma;
}
model {
mu ~ normal(mean_y, 2);
sigma ~ cauchy(0, 1);
for (n in 1:N){
y[n] ~ normal(mu[group_id[n]], sigma[group_id[n]]);
}
}
generated quantities {
real mu_diff;
mu_diff = mu[2] - mu[1];
}
'
stan_data <- movies %>%
select(genre, rating, genre_numeric) %>%
tidybayes::compose_data(
N = nrow(.),
n_groups = n_distinct(genre),
group_id = genre_numeric,
y = rating
)
stan_best_normal <- stan(model_code = stan_program, data = stan_data)
stan_best_normal
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## mu[1] 5.56 0.00 0.25 5.07 5.40 5.55 5.72 6.05 3420 1
## mu[2] 6.22 0.00 0.16 5.91 6.12 6.22 6.32 6.54 4526 1
## sigma[1] 1.78 0.00 0.18 1.47 1.65 1.76 1.89 2.18 3937 1
## sigma[2] 1.15 0.00 0.12 0.95 1.06 1.14 1.22 1.39 4259 1
## mu_diff 0.66 0.00 0.30 0.08 0.47 0.67 0.86 1.24 4013 1
## lp__ -86.82 0.03 1.44 -90.42 -87.51 -86.48 -85.75 -85.05 1783 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jul 18 20:08:15 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
stan_best_normal %>%
tidybayes::spread_draws(mu_diff) %>%
ggplot(aes(x = mu_diff)) +
tidybayes::stat_halfeye() +
geom_vline(xintercept = 0)
stan_best_normal %>%
tidybayes::spread_draws(mu_diff) %>%
ggplot(aes(x = mu_diff)) +
stat_eye(side = "right",
fill = "skyblue",
point_interval = mode_hdi,
.width = c(0.5, 0.89),
interval_colour = "red",
point_colour = "red",
width = 15.5,
height = 0.1
) +
geom_vline(xintercept = c(-0.1, 0.1), linetype = "dashed", size = 1) +
coord_cartesian(xlim = c(-1, 2)) +
labs(x = "mu_diff", y = NULL)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
65.2.2 等效检验
我们一般会采用实用等效区间 region of practical equivalence ROPE。实用等效区间,就是我们感兴趣值附近的一个区间,比如这里的均值差。频率学中的零假设是看均值差是否为0,贝叶斯则是看均值差有多少落入0附近的区间。具体方法就是,先算出后验分布的高密度区间,然后看这个高密度区间落在[-0.1, 0.1]的比例.
lower <- -0.1*sd(movies$rating)
upper <- 0.1*sd(movies$rating)
stan_best_normal %>%
tidybayes::spread_draws(mu_diff) %>%
filter(
mu_diff > ggdist::mean_hdi(mu_diff, .width = c(0.89))$ymin,
mu_diff < ggdist::mean_hdi(mu_diff, .width = c(0.89))$ymax
) %>%
summarise(
percentage_in_rope = mean(between(mu_diff, lower, upper))
)
## # A tibble: 1 × 1
## percentage_in_rope
## <dbl>
## 1 0
在做假设检验的时候,我们内心是期望,后验概率的高密度区间落在实际等效区间的比例越小越小,如果小于2.5%,我们就可以拒绝零假设了;如果大于97.5%,我们就接受零假设。
stan_best_normal %>%
tidybayes::spread_draws(mu_diff) %>%
pull(mu_diff) %>%
bayestestR::rope(x,
range = c(-0.1, 0.1)*sd(movies$rating),
ci = 0.89,
ci_method = "HDI"
)
## # Proportion of samples inside the ROPE [-0.15, 0.15]:
##
## inside ROPE
## -----------
## 0.00 %
65.2.3 Student-t 分布
标准正态分布是t分布的极限分布
for (nu in c(1, seq(5, 50, by = 10))) {
p <- tibble(x = seq(-5, 5, by=0.1)) %>%
ggplot(aes(x)) +
stat_function(fun = dnorm, color = 'gray') +
stat_function(fun = dt, args = list(df = nu), color = 'blue') +
theme_classic() +
ylab("Density") +
xlab('Value') +
ggtitle(paste("df =", nu))
print(p)
}
假定rating评分服从student-t分布,
\[ \begin{aligned} \textrm{rating} & \sim \textrm{student_t}(\nu, \,\mu_{\textrm{genre}}, \, \sigma _{\textrm{genre}}) \\ \mu &\sim \textrm{normal}(\textrm{mean_rating}, \, 2) \\ \sigma &\sim \textrm{cauchy}(0, \, 1) \\ \nu &\sim \textrm{exponential}(1.0/29) \end{aligned} \]
stan_program <- '
data {
int<lower=1> N;
int<lower=2> n_groups;
vector[N] y;
int<lower=1, upper=n_groups> group_id[N];
}
transformed data {
real mean_y;
mean_y = mean(y);
}
parameters {
vector[2] mu;
vector<lower=0>[2] sigma;
real<lower=0, upper=100> nu;
}
model {
mu ~ normal(mean_y, 2);
sigma ~ cauchy(0, 1);
nu ~ exponential(1.0/29);
for (n in 1:N){
y[n] ~ student_t(nu, mu[group_id[n]], sigma[group_id[n]]);
}
}
generated quantities {
real mu_diff;
mu_diff = mu[2] - mu[1];
}
'
stan_data <- movies %>%
select(genre, rating, genre_numeric) %>%
tidybayes::compose_data(
N = nrow(.),
n_groups = n_distinct(genre),
group_id = genre_numeric,
y = rating
)
stan_best_student <- stan(model_code = stan_program, data = stan_data)
stan_best_student
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
## mu[1] 5.54 0.00 0.25 5.05 5.36 5.54 5.71 6.04 3803
## mu[2] 6.24 0.00 0.15 5.93 6.13 6.24 6.34 6.54 4129
## sigma[1] 1.71 0.00 0.19 1.37 1.57 1.69 1.83 2.12 4192
## sigma[2] 1.05 0.00 0.13 0.81 0.96 1.04 1.13 1.33 3362
## nu 27.73 0.43 20.68 5.02 12.11 21.10 37.86 83.38 2351
## mu_diff 0.70 0.00 0.30 0.10 0.50 0.71 0.90 1.28 3941
## lp__ -119.54 0.04 1.65 -123.61 -120.37 -119.22 -118.32 -117.39 1599
## Rhat
## mu[1] 1
## mu[2] 1
## sigma[1] 1
## sigma[2] 1
## nu 1
## mu_diff 1
## lp__ 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jul 18 20:09:15 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
stan_best_student %>%
tidybayes::spread_draws(mu_diff) %>%
ggplot(aes(x = mu_diff)) +
tidybayes::stat_halfeye() +
geom_vline(xintercept = 0)
stan_best_student %>%
as.data.frame() %>%
head()
## mu[1] mu[2] sigma[1] sigma[2] nu mu_diff lp__
## 1 5.118808 6.270142 1.549989 1.2949051 21.28581 1.1513343 -120.8732
## 2 5.967722 6.459934 1.790923 0.9558584 22.51517 0.4922120 -120.0670
## 3 5.950175 6.433393 1.854735 1.0242260 19.36858 0.4832173 -119.4204
## 4 5.245263 6.025418 2.052233 0.9888823 18.47944 0.7801542 -120.2714
## 5 5.464269 6.373180 1.861468 1.0654877 50.48844 0.9089110 -118.6317
## 6 5.648799 6.307200 1.775365 1.1710360 67.58613 0.6584016 -119.0480
stan_best_student %>%
as.data.frame() %>%
ggplot(aes(x = `mu[1]`)) +
geom_density()
stan_best_student %>%
tidybayes::gather_draws(mu[i], sigma[i]) %>%
tidybayes::mean_hdi(.width = 0.89)
## # A tibble: 4 × 8
## i .variable .value .lower .upper .width .point .interval
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 mu 5.54 5.16 5.97 0.89 mean hdi
## 2 1 sigma 1.71 1.41 2.00 0.89 mean hdi
## 3 2 mu 6.24 6.00 6.49 0.89 mean hdi
## 4 2 sigma 1.05 0.840 1.25 0.89 mean hdi
65.4 作业
- 将上一章线性模型的stan代码应用到电影评分数据中
\[ \begin{aligned} \textrm{rating} & \sim \textrm{Normal}(\mu, \, \sigma) \\ \mu & = \alpha + \beta \, \textrm{genre} \\ \alpha &\sim \textrm{Normal}(0, \, 5) \\ \beta &\sim \textrm{Normal}(0, \, 1) \\ \sigma &\sim \textrm{Exponential}(1) \\ \end{aligned} \]
stan_program <- '
data {
int<lower=1> N;
vector[N] y;
vector[N] x;
}
parameters {
real<lower=0> sigma;
real alpha;
real beta;
}
model {
y ~ normal(alpha + beta * x, sigma);
alpha ~ normal(0, 5);
beta ~ normal(0, 1);
sigma ~ exponential(1);
}
'
stan_data <- list(
N = nrow(movies),
x = as.numeric(movies$genre),
y = movies$rating
)
stan_linear <- stan(model_code = stan_program, data = stan_data)
stan_linear
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## sigma 1.48 0.00 0.10 1.29 1.41 1.48 1.55 1.69 1838 1
## alpha 4.93 0.01 0.46 4.04 4.63 4.93 5.23 5.86 1117 1
## beta 0.63 0.01 0.29 0.06 0.44 0.63 0.82 1.20 1127 1
## lp__ -91.23 0.04 1.23 -94.48 -91.81 -90.92 -90.30 -89.83 1062 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jul 18 20:10:12 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
stan_linear %>%
tidybayes::spread_draws(beta) %>%
ggplot(aes(x = beta)) +
tidybayes::stat_halfeye() +
geom_vline(xintercept = 0)
stan_linear %>%
tidybayes::gather_draws(beta) %>%
tidybayes::mean_hdi(.width = 0.89)
## # A tibble: 1 × 7
## .variable .value .lower .upper .width .point .interval
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 beta 0.631 0.168 1.11 0.89 mean hdi