# Load packages
library(tidyverse)
library(cowplot)
library(ggridges)
library(ggforce)
library(patchwork)
library(viridis)
library(brms)
library(bayesplot)
library(tidybayes)
# Define function
beta_by_k <- function(k) {
  w <- 0.25
  tibble(x = seq(from = 0, to = 1, length.out = 1000)) |>
    mutate(theta = dbeta(x = x,
                         shape1 = w * (k - 2) + 1,
                         shape2 = (1 - w) * (k - 2) + 1))
}
# Data
tibble(k = seq(from = 10, to = 200, by = 10)) |>
mutate(theta = map(k, beta_by_k)) |>
unnest(theta) |>
# Plot
ggplot(aes(x = x, y = k,
height = theta,
group = k, fill = k)) +
geom_vline(xintercept = 0.25, color = "grey85", linewidth = 1/2) +
geom_ridgeline(color = "grey92", scale = 2, size = 1/5) +
scale_y_continuous(expression(kappa), breaks = 1:20 * 10) +
scale_fill_viridis_c(expression(kappa), option = "A") +
xlab(expression(theta)) +
theme_minimal_hgrid()
9 Hierarchical Models
As Kruschke put it, “There are many realistic situations that involve meaningful hierarchical structure. Bayesian modeling software makes it straightforward to specify and analyze complex hierarchical models” (2015, p. 221). IMO, brms makes it even easier than JAGS. Further down, we read:
The parameters at different levels in a hierarchical model are all merely parameters that coexist in a joint parameter space. We simply apply Bayes' rule to the joint parameter space, as we did for example when estimating two coin biases back in Figure 7.5, p. 167. To say it a little more formally with our parameters $\theta$ and $\omega$, Bayes' rule applies to the joint parameter space:

$$p(\theta, \omega \mid D) = \frac{p(D \mid \theta, \omega) \, p(\theta, \omega)}{p(D)}.$$

What is special to hierarchical models is that the terms on the right-hand side can be factored into a chain of dependencies, like this:

$$\begin{align} p(D \mid \theta, \omega) \, p(\theta, \omega) & = p(D \mid \theta, \omega) \, p(\theta \mid \omega) \, p(\omega) \\ & = p(D \mid \theta) \, p(\theta \mid \omega) \, p(\omega). \end{align}$$

The refactoring in the second line means that the data depend only on the value of $\theta$, in the sense that when the value $\theta$ is set then the data are independent of all other parameter values. Moreover, the value of $\theta$ depends on the value of $\omega$, and the value of $\theta$ is conditionally independent of all other parameters. Any model that can be factored into a chain of dependencies like [this] is a hierarchical model. (pp. 222–223)
9.1 A single coin from a single mint
Recall from the last chapter that our likelihood is the Bernoulli distribution,

$$y_i \sim \operatorname{Bernoulli}(\theta).$$

We'll use the beta density for our prior distribution for $\theta$,

$$\theta \sim \operatorname{Beta}(a, b).$$

And we can re-express $a$ and $b$ in terms of the mode $\omega$ and concentration $\kappa$,

$$a = \omega(\kappa - 2) + 1 \quad \text{and} \quad b = (1 - \omega)(\kappa - 2) + 1.$$

As a consequence, we can re-express our prior for $\theta$ as

$$\theta \sim \operatorname{Beta}\big(\omega(\kappa - 2) + 1,\ (1 - \omega)(\kappa - 2) + 1\big).$$

On page 224, Kruschke wrote: "The value of $\kappa$ governs how near $\theta$ is to $\omega$, with larger values of $\kappa$ generating values of $\theta$ more concentrated near $\omega$." Holding $\omega$ constant at 0.25, the plot at the top of this chapter showed just that: as $\kappa$ increases, the beta density concentrates ever more tightly around $\omega$.
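As a quick check not found in the text, we can verify that the re-expressed shape parameters really do put the mode of the beta density at $\omega$. The mode of $\operatorname{Beta}(a, b)$ is $(a - 1) / (a + b - 2)$, which here reduces to $\omega(\kappa - 2) / (\kappa - 2) = \omega$:

# Not from the text: the mode of the re-expressed beta density is omega
omega <- 0.25
kappa <- 50
a <- omega * (kappa - 2) + 1
b <- (1 - omega) * (kappa - 2) + 1
(a - 1) / (a + b - 2)  # Returns 0.25, i.e., omega

With that out of the way, here's our version of Kruschke's model diagram (his Figure 9.1).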
p1 <- tibble(x = seq(from = 0.01, to = 0.99, by = 0.01),
             d = (dbeta(x, 2, 2)) / max(dbeta(x, 2, 2))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = "grey67") +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "beta",
           size = 7) +
  annotate(geom = "text",
           x = 0.5, y = 0.6,
           label = "italic(A[omega])*', '*italic(B[omega])",
           family = "Times", parse = TRUE, size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
p2 <- tibble(
  x = c(0.5, 0.475, 0.26, 0.08, 0.06, 0.5, 0.55, 0.85, 1.15, 1.2),
  y = c(1, 0.7, 0.6, 0.5, 0.2, 1, 0.7, 0.6, 0.5, 0.2),
  line = rep(letters[2:1], each = 5)) |>
  ggplot(aes(x = x, y = y)) +
  geom_bspline(aes(color = line),
               linewidth = 2/3, show.legend = FALSE) +
  annotate(geom = "text",
           x = 0, y = 0.125,
           label = "omega(italic(K)-2)+1*', '*(1-omega)(italic(K)-2)+1",
           family = "Times", hjust = 0, parse = TRUE, size = 7) +
  annotate(geom = "text",
           x = 1/3, y = 0.7,
           label = "'~'",
           family = "Times", parse = TRUE, size = 10) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 2)) +
  scale_color_manual(values = c("grey75", "black")) +
  ylim(0, 1) +
  theme_void()
p3 <- tibble(x = seq(from = 0.01, to = 0.99, by = 0.01),
             d = (dbeta(x, 2, 2)) / max(dbeta(x, 2, 2))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = "grey67") +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "beta",
           size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# An annotated arrow
# Save our custom arrow settings
my_arrow <- arrow(angle = 20, length = unit(0.35, "cm"), type = "closed")

p4 <- tibble(x = 0.5,
             y = 1,
             xend = 0.5,
             yend = 0) |>
  ggplot(aes(x = x, xend = xend,
             y = y, yend = yend)) +
  geom_segment(arrow = my_arrow) +
  annotate(geom = "text",
           x = 0.375, y = 1/3,
           label = "'~'",
           family = "Times", parse = TRUE, size = 10) +
  xlim(0, 1) +
  theme_void()
# Bar plot of Bernoulli data
p5 <- tibble(x = 0:1,
             d = (dbinom(x, size = 1, prob = 0.6)) / max(dbinom(x, size = 1, prob = 0.6))) |>
  ggplot(aes(x = x, y = d)) +
  geom_col(fill = "grey67", width = 0.4) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "Bernoulli",
           size = 7) +
  annotate(geom = "text",
           x = 0.5, y = 0.94,
           label = "theta",
           family = "Times", parse = TRUE, size = 7) +
  xlim(-0.75, 1.75) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# Another annotated arrow
p6 <- tibble(x = c(0.375, 0.625),
             y = c(1/3, 1/3),
             label = c("'~'", "italic(i)")) |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
  geom_segment(x = 0.5, xend = 0.5,
               y = 1, yend = 0,
               arrow = my_arrow) +
  xlim(0, 1) +
  theme_void()
# Some text
p7 <- tibble(x = 0.5,
             y = 0.5,
             label = "italic(y[i])") |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = 7) +
  xlim(0, 1) +
  theme_void()
# Define the layout
layout <- c(
  area(t = 1, b = 2, l = 1, r = 1),
  area(t = 4, b = 5, l = 1, r = 1),
  area(t = 3, b = 4, l = 1, r = 2),
  area(t = 6, b = 6, l = 1, r = 1),
  area(t = 7, b = 8, l = 1, r = 1),
  area(t = 9, b = 9, l = 1, r = 1),
  area(t = 10, b = 10, l = 1, r = 1))
# Combine, adjust, and plot!
(p1 + p3 + p2 + p4 + p5 + p6 + p7) +
  plot_layout(design = layout) &
  ylim(0, 1) &
  theme(plot.margin = margin(0, 5.5, 0, 5.5))
Note that whereas this model includes a hierarchical prior for $\theta$ by way of $\omega$, the top-level values $A_\omega$, $B_\omega$, and $K$ are all fixed constants rather than estimated parameters.
9.1.1 Posterior via grid approximation.
When the parameters extend over a finite domain, and there are not too many of them, then we can approximate the posterior via grid approximation. In our present situation, we have the parameters $\theta$ and $\omega$ that both have finite domains, namely the interval $[0, 1]$. Therefore, a grid approximation is tractable and the distributions can be readily graphed. (p. 226)
Given $\alpha = 2$ and $\beta = 2$, we can compute the mode of the beta density with the formula $(\alpha - 1) / (\alpha + \beta - 2)$.

alpha <- 2
beta  <- 2

(alpha - 1) / (alpha + beta - 2)

[1] 0.5

That is, the mode of our $\operatorname{Beta}(2, 2)$ prior for $\omega$ is 0.5.
We won’t be able to make the wireframe plots on the left of Figure 9.2, but we can make the others. We’ll make the initial data following Kruschke’s (p. 226) formulas.
First, we'll make a custom function, make_prior(), based on the formulas.
make_prior <- function(theta, omega, alpha, beta, kappa) {
  
  # p(theta | omega)
  t <- dbeta(x = theta,
             shape1 = omega * (kappa - 2) + 1,
             shape2 = (1 - omega) * (kappa - 2) + 1)
  
  # p(omega)
  o <- dbeta(x = omega,
             shape1 = alpha,
             shape2 = beta)
  
  # p(theta, omega) = p(theta | omega) * p(omega)
  t * o
}
Next we’ll define the parameter space as a tightly-spaced sequence of values ranging from 0 to 1.
parameter_space <- seq(from = 0, to = 1, by = 0.01)
Now we'll use parameter_space to define the ranges for the two variables, theta and omega, which we'll save in a tibble. We'll then feed those theta and omega values into our make_prior() function while manually specifying the desired values for alpha, beta, and kappa.
# Define the grid for our grid approximation
d <- crossing(theta = parameter_space,
              omega = parameter_space) |>
  # Compute the joint prior
  mutate(prior = make_prior(theta, omega, alpha = 2, beta = 2, kappa = 100)) |>
  # Convert the prior from the density metric to the probability mass metric
  mutate(prior = prior / sum(prior))

head(d)
# A tibble: 6 × 3
theta omega prior
<dbl> <dbl> <dbl>
1 0 0 0
2 0 0.01 0
3 0 0.02 0
4 0 0.03 0
5 0 0.04 0
6 0 0.05 0
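Because we divided prior by its sum, the grid now holds probability masses rather than densities. As a quick sanity check not in the text, the masses should sum to 1 across the 101 × 101 grid cells:

# Not from the text: confirm the normalization and grid size
d |>
  summarise(total_mass = sum(prior),  # Should be 1
            n_cells = n())            # Should be 101^2 = 10201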
Now we’re ready to plot the top middle panel of Figure 9.2.
d |>
  ggplot(aes(x = theta, y = omega, fill = prior)) +
  geom_raster(interpolate = TRUE) +
  scale_x_continuous(expression(theta), expand = c(0, 0), limits = c(0, 1)) +
  scale_y_continuous(expression(omega), expand = c(0, 0), limits = c(0, 1)) +
  scale_fill_viridis_c(option = "A") +
  coord_equal() +
  theme_minimal_grid() +
  theme(legend.position = "none")
You could also make this with geom_tile(), but geom_raster() with interpolate = TRUE smooths the color transitions. Since we are going to be making a lot of plots like this in this chapter, we should consider streamlining our plotting code. In Chapter 19 of Wickham's (2016) ggplot2: Elegant graphics for data analysis, we learn how to make a custom geom. Here we'll use those skills to wrap the bulk of the plot code from above into a single geom we'll call geom_2dd(), for 2D-density plots.
geom_2dd <- function(...) {
  list(
    geom_raster(interpolate = TRUE),
    scale_x_continuous(breaks = 0:5 / 5, expand = c(0, 0), limits = c(0, 1)),
    scale_y_continuous(breaks = 0:5 / 5, expand = c(0, 0), limits = c(0, 1)),
    scale_fill_viridis_c(option = "A"),
    coord_equal(),
    theme_minimal_grid(...),
    theme(legend.position = "none")
  )
}
Try it out.
d |>
  ggplot(aes(x = theta, y = omega, fill = prior)) +
  geom_2dd() +
  labs(x = expression(theta),
       y = expression(omega))
If we collapse the joint prior across $\theta$ (i.e., group_by(omega) and then sum(prior)), we can plot the marginal prior distribution for $\omega$.
a_purple <- viridis_pal(option = "A")(9)[4]

d |>
  group_by(omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = omega, y = prior)) +
  geom_area(fill = a_purple) +
  scale_x_continuous(expression(omega), expand = c(0, 0), limits = c(0, 1), breaks = 0:5 / 5) +
  scale_y_continuous(expression(Marginal~p(omega)), expand = c(0, 0), limits = c(0, 0.035)) +
  coord_flip() +
  theme_cowplot() +
  panel_border() +
  theme(axis.line = element_blank())
Note how we loaded the viridis package (Garnier, 2021). That gave us access to the viridis_pal()
function, which will allow us to discretize the viridis palettes and save the color names as objects. In our case, we discretized the "A"
palette into nine colors and saved the fourth as a_purple
. Here’s the color name.
a_purple
[1] "#822681FF"
We’ll use that color in many of the plots to follow. It’ll be something of a signature color for this chapter.
Anyway, since we are going to be making a lot of plots like this in this chapter, we’ll make another custom geom called geom_marginal()
.
geom_marginal <- function(ul, ...) {
  list(
    geom_area(fill = viridis_pal(option = "A")(9)[4]),
    scale_x_continuous(expand = c(0, 0), limits = c(0, 1), breaks = 0:5 / 5),
    scale_y_continuous(expand = c(0, 0), limits = c(0, ul)),
    theme_cowplot(...),
    panel_border(),
    theme(axis.line = element_blank())
  )
}
Try it out.
d |>
  group_by(omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = omega, y = prior)) +
  geom_marginal(ul = 0.035) +
  labs(x = expression(omega),
       y = expression(Marginal~p(omega))) +
  coord_flip()
We’ll follow a similar procedure to get the marginal probability distribution for theta
.
d |>
  group_by(theta) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta, y = prior)) +
  geom_marginal(ul = 0.035) +
  labs(x = expression(theta),
       y = expression(Marginal~p(theta)))
We'll use the filter() function to take two slices from the prior grid. Since we're taking slices, we're no longer working with the joint probability distribution. As such, our two sliced prior distributions for theta no longer sum to 1, which means they're no longer in a probability metric. No worries. After we group by omega, we can simply divide prior by the sum() of prior, which renormalizes the two slices "so that they are individually proper probability densities that sum to $1.0$" (p. 227).
d |>
  filter(omega %in% c(0.25, 0.75)) |>
  group_by(omega) |>
  mutate(prior = prior / sum(prior)) |>
  mutate(label = factor(str_c("omega==", omega),
                        levels = c("omega==0.75", "omega==0.25"))) |>
  ggplot(aes(x = theta, y = prior)) +
  geom_marginal(ul = 0.095) +
  labs(x = expression(theta),
       y = expression(p(theta*"|"*omega))) +
  facet_wrap(~ label, ncol = 1, labeller = label_parsed)
As Kruschke pointed out at the top of page 228, these are indeed beta densities. Here’s proof.
# We'll want this for the annotation
d_text <- tibble(theta = c(0.75, 0.25),
                 y = 10,
                 label = c("Beta(74.5, 25.5)", "Beta(25.5, 74.5)"),
                 omega = letters[1:2])
# Here's the primary data for the plot
tibble(theta = rep(parameter_space, times = 2),
alpha = rep(c(74.5, 25.5), each = 101),
beta = rep(c(25.5, 74.5), each = 101),
omega = rep(letters[1:2], each = 101)) |>
# The plot
ggplot(aes(x = theta, fill = omega)) +
geom_area(aes(y = dbeta(x = theta, shape1 = alpha, shape2 = beta))) +
geom_text(data = d_text,
aes(y = y, label = label, color = omega)) +
scale_x_continuous(expression(theta), breaks = 0:5 / 5,
expand = c(0, 0), limits = c(0, 1)) +
scale_y_continuous("density", expand = c(0, 0), limits = c(0, 11)) +
scale_fill_viridis_d(option = "A", begin = 2/9, end = 6/9) +
scale_color_viridis_d(option = "A", begin = 2/9, end = 6/9) +
theme_cowplot() +
panel_border() +
theme(axis.line = element_blank(),
legend.position = "none")
But back on track, we need the dbern()
function for the lower three rows of Figure 9.2.
dbern <- function(x, z = NULL, n = NULL) {
  x^z * (1 - x)^(n - z)
}
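Note that dbern() returns the Bernoulli likelihood of z heads out of n flips as a function of the bias x, omitting the binomial coefficient. That coefficient is constant with respect to the parameters, so it cancels once we normalize over the grid. As a quick check not found in the text, dbern() matches dbinom() after dividing out choose(n, z):

# Not from the text: dbern() is dbinom() without the choose(n, z) constant
dbern(x = 0.6, z = 9, n = 12)
dbinom(x = 9, size = 12, prob = 0.6) / choose(12, 9)
# Both return 0.000645 (to three significant digits)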
Time to feed theta
and our data into the dbern()
function, which will allow us to make the 2-dimensional density plot in the middle of Figure 9.2.
# Define the data
n <- 12
z <- 9

# Compute the likelihood
d <- d |>
  mutate(likelihood = dbern(x = theta, z = z, n = n))

# Plot
d |>
  ggplot(aes(x = theta, y = omega, fill = likelihood)) +
  geom_2dd() +
  labs(x = expression(theta),
       y = expression(omega))
Note how this plot demonstrates how the likelihood is solely dependent on $\theta$ and constant across $\omega$. That is, in the second line of the chain of dependencies from above, the probability of $D$ given both $\theta$ and $\omega$ reduces to $p(D \mid \theta)$.
From Formula 9.1, the posterior is the product of the likelihood and the prior, divided by the evidence $p(D)$, which for our grid is just the sum of those products. We already have the joint prior $p(\theta \mid \omega) \, p(\omega)$ in our prior column. So we just need to multiply prior by likelihood and divide by their sum.
Our first depiction will be the middle panel of the second row from the bottom.
d <- d |>
  mutate(posterior = (likelihood * prior) / sum(likelihood * prior))

d |>
  ggplot(aes(x = theta, y = omega, fill = posterior)) +
  geom_2dd() +
  labs(x = expression(theta),
       y = expression(omega))
Although the likelihood was orthogonal to $\omega$, the joint posterior is not. That's because the hierarchical prior tied $\theta$ to $\omega$, so information from the data flows up to $\omega$ as well.
Making the marginal plots for posterior
is much like when making them for prior
, above.
# For omega
d |>
  group_by(omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = omega, y = posterior)) +
  geom_marginal(ul = 0.035) +
  labs(x = expression(omega),
       y = expression(Marginal~p(omega*"|"*D))) +
  coord_flip()
# For theta
d |>
  group_by(theta) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta, y = posterior)) +
  geom_marginal(ul = 0.035) +
  labs(x = expression(theta),
       y = expression(Marginal~p(theta*"|"*D)))
Note that after we slice with filter(), the next two wrangling lines renormalize those posterior slices into probability metrics. That is, when we take a slice through the joint posterior at a particular value of $\omega$, the result is proportional to, but not itself, the conditional distribution $p(\theta \mid \omega, D)$; dividing each slice by its sum puts it back into a proper probability metric.
d |>
  filter(omega %in% c(0.25, 0.75)) |>
  group_by(omega) |>
  mutate(posterior = posterior / sum(posterior)) |>
  mutate(label = factor(str_c("omega==", omega),
                        levels = c("omega==0.75", "omega==0.25"))) |>
  ggplot(aes(x = theta, y = posterior)) +
  geom_marginal(ul = 0.1) +
  labs(x = expression(theta),
       y = expression(p(theta*"|"*omega))) +
  facet_wrap(~ label, ncol = 1, labeller = label_parsed, scales = "free")
In the next example, depicted in Figure 9.3, we consider what happens when we combine the same data of 9 heads out of 12 trials with the same Bernoulli likelihood, but a different prior: one expressing greater certainty about $\omega$ ($A_\omega = 20$, $B_\omega = 20$) and less certainty about the dependence of $\theta$ on $\omega$ ($K = 6$).
To repeat the process for Figure 9.3, we’ll first compute the new joint prior.
d <- crossing(theta = parameter_space,
              omega = parameter_space) |>
  mutate(prior = make_prior(theta, omega, alpha = 20, beta = 20, kappa = 6)) |>
  mutate(prior = prior / sum(prior))
Here’s the initial data and the 2-dimensional density plot for the prior, the middle plot in the top row of Figure 9.3.
d |>
  ggplot(aes(x = theta, y = omega, fill = prior)) +
  geom_2dd() +
  labs(x = expression(theta),
       y = expression(omega))
That higher certainty in omega shows up as a tight horizontal band in the joint prior. As before, we average over one dimension at a time to plot the marginal prior distributions for omega and theta.
# For omega
d |>
  group_by(omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = omega, y = prior)) +
  geom_marginal(ul = 0.052) +
  labs(x = expression(omega),
       y = expression(Marginal~p(omega))) +
  coord_flip()
# For theta
d |>
  group_by(theta) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta, y = prior)) +
  geom_marginal(ul = 0.039) +
  labs(x = expression(theta),
       y = expression(Marginal~p(theta)))
Here are the two short plots in the right panel of the second row from the top of Figure 9.3.
d |>
  filter(omega %in% c(0.25, 0.75)) |>
  group_by(omega) |>
  mutate(prior = prior / sum(prior)) |>
  mutate(label = factor(str_c("omega == ", omega),
                        levels = c("omega == 0.75", "omega == 0.25"))) |>
  ggplot(aes(x = theta, y = prior)) +
  geom_marginal(ul = 0.039) +
  labs(x = expression(theta),
       y = expression(p(theta*"|"*omega))) +
  facet_wrap(~ label, ncol = 1, labeller = label_parsed)
Now we’re ready for the likelihood.
# Compute
d <- d |>
  mutate(likelihood = dbern(x = theta, z = z, n = n))

# Plot
d |>
  ggplot(aes(x = theta, y = omega, fill = likelihood)) +
  geom_2dd() +
  labs(x = expression(theta),
       y = expression(omega))
Now on to the posterior. Our first depiction will be the middle panel of the second row from the bottom of Figure 9.3. This will be the joint posterior, $p(\theta, \omega \mid D)$.
# Compute the posterior
d <- d |>
  mutate(posterior = (likelihood * prior) / sum(likelihood * prior))

# Plot
d |>
  ggplot(aes(x = theta, y = omega, fill = posterior)) +
  geom_2dd() +
  labs(x = expression(theta),
       y = expression(omega))
Here are the marginal plots for the two dimensions in our posterior
.
# For omega
d |>
  group_by(omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = omega, y = posterior)) +
  geom_marginal(ul = 0.052) +
  labs(x = expression(omega),
       y = expression(Marginal~p(omega*"|"*D))) +
  coord_flip()
# For theta
d |>
  group_by(theta) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta, y = posterior)) +
  geom_marginal(ul = 0.039) +
  labs(x = expression(theta),
       y = expression(Marginal~p(theta*"|"*D)))
And we’ll finish off with the plots of Figure 9.3’s lower right panel.
d |>
  filter(omega %in% c(0.25, 0.75)) |>
  group_by(omega) |>
  mutate(posterior = posterior / sum(posterior)) |>
  mutate(label = factor(str_c("omega==", omega),
                        levels = c("omega==0.75", "omega==0.25"))) |>
  ggplot(aes(x = theta, y = posterior)) +
  geom_marginal(ul = 0.039) +
  labs(x = expression(theta),
       y = expression(p(theta*"|"*omega))) +
  facet_wrap(~ label, ncol = 1, labeller = label_parsed, scales = "free")
In summary, Bayesian inference in a hierarchical model is merely Bayesian inference on a joint parameter space, but we look at the joint distribution (e.g., $p(\theta, \omega \mid D)$) in terms of its marginal on a subset of parameters (e.g., $p(\omega \mid D)$) and its conditional distribution for other parameters (e.g., $p(\theta \mid \omega, D)$). We do this primarily because it is meaningful in the context of particular models. (p. 230)
9.2 Multiple coins from a single mint
What if we collect data from more than one coin created by the mint? If each coin has its own distinct bias $\theta_s$, then we are estimating a distinct parameter value for each coin, and using all the data to estimate $\omega$. (p. 230)
Kruschke broke down a model of this form with his diagram in Figure 9.4. Here’s our version of that figure.
p1 <- tibble(x = seq(from = 0.01, to = 0.99, by = 0.01),
             d = (dbeta(x, 2, 2)) / max(dbeta(x, 2, 2))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = a_purple) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "beta",
           size = 7) +
  annotate(geom = "text",
           x = 0.5, y = 0.6,
           label = "italic(A[omega])*', '*italic(B[omega])",
           family = "Times", parse = TRUE, size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
p2 <- tibble(
  x = c(0.5, 0.475, 0.26, 0.08, 0.06, 0.5, 0.55, 0.85, 1.15, 1.2),
  y = c(1, 0.7, 0.6, 0.5, 0.2, 1, 0.7, 0.6, 0.5, 0.2),
  line = rep(letters[2:1], each = 5)) |>
  ggplot(aes(x = x, y = y)) +
  geom_bspline(aes(color = line),
               linewidth = 2/3, show.legend = FALSE) +
  annotate(geom = "text",
           x = 0, y = 0.125,
           label = "omega(italic(K)-2)+1*', '*(1-omega)(italic(K)-2)+1",
           family = "Times", hjust = 0, parse = TRUE, size = 7) +
  annotate(geom = "text",
           x = 1/3, y = 0.7,
           label = "'~'",
           family = "Times", parse = TRUE, size = 10) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 2)) +
  scale_color_manual(values = c("grey75", "black")) +
  ylim(0, 1) +
  theme_void()
p3 <- tibble(x = seq(from = 0.01, to = 0.99, by = 0.01),
             d = (dbeta(x, 2, 2)) / max(dbeta(x, 2, 2))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = a_purple) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "beta",
           size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# An annotated arrow
p4 <- tibble(x = c(0.35, 0.65),
             y = c(1/3, 1/3),
             label = c("'~'", "italic(s)")) |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
  geom_segment(x = 0.5, xend = 0.5,
               y = 1, yend = 0,
               arrow = my_arrow) +
  xlim(0, 1) +
  theme_void()
# Bar plot of Bernoulli data
p5 <- tibble(x = 0:1,
             d = (dbinom(x, size = 1, prob = 0.6)) / max(dbinom(x, size = 1, prob = 0.6))) |>
  ggplot(aes(x = x, y = d)) +
  geom_col(fill = a_purple, width = 0.4) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "Bernoulli",
           size = 7) +
  annotate(geom = "text",
           x = 0.5, y = 0.92,
           label = "theta[italic(s)]",
           family = "Times", parse = TRUE, size = 7) +
  xlim(-0.75, 1.75) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# Another annotated arrow
p6 <- tibble(x = c(0.35, 0.65),
             y = c(1/3, 1/3),
             label = c("'~'", "italic(i)*'|'*italic(s)")) |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
  geom_segment(x = 0.5, xend = 0.5,
               y = 1, yend = 0,
               arrow = my_arrow) +
  xlim(0, 1) +
  theme_void()
# Some text
p7 <- tibble(x = 1,
             y = 0.5,
             label = "italic(y)[italic(i)*'|'*italic(s)]") |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = 7) +
  xlim(0, 2) +
  theme_void()
# Define the layout
layout <- c(
  area(t = 1, b = 2, l = 1, r = 1),
  area(t = 4, b = 5, l = 1, r = 1),
  area(t = 3, b = 4, l = 1, r = 2),
  area(t = 6, b = 6, l = 1, r = 1),
  area(t = 7, b = 8, l = 1, r = 1),
  area(t = 9, b = 9, l = 1, r = 1),
  area(t = 10, b = 10, l = 1, r = 1))
# Plot!
(p1 + p3 + p2 + p4 + p5 + p6 + p7) +
  plot_layout(design = layout) &
  ylim(0, 1) &
  theme(plot.margin = margin(0, 5.5, 0, 5.5))
The diagram accounts for multiple coins with the $s$ subscripts: each coin $s$ gets its own bias $\theta_s$ drawn from a common beta distribution with mode $\omega$, and each trial $i$ is nested within a coin, hence the $i \vert s$ notation.
9.2.1 Posterior via grid approximation.
Now we have two coins, which means

the full prior distribution is a joint distribution over three parameters: $\theta_1$, $\theta_2$, and $\omega$. In a grid approximation, the prior is specified as a three-dimensional (3D) array that holds the prior probability at various grid points in the 3D space. (p. 233)
The biases for both coins, $\theta_1$ and $\theta_2$, depend on the same $\omega$. To compute their joint prior, we'll need to extend our make_prior() function. It was originally designed to handle two dimensions, $\theta$ and $\omega$; now it handles three.
make_prior <- function(theta1, theta2, omega, alpha, beta, kappa) {
  
  # p(theta_1 | omega)
  t1 <- dbeta(x = theta1,
              shape1 = omega * (kappa - 2) + 1,
              shape2 = (1 - omega) * (kappa - 2) + 1)
  
  # p(theta_2 | omega)
  t2 <- dbeta(x = theta2,
              shape1 = omega * (kappa - 2) + 1,
              shape2 = (1 - omega) * (kappa - 2) + 1)
  
  # p(omega)
  o <- dbeta(x = omega,
             shape1 = alpha,
             shape2 = beta)
  
  # p(theta1, theta2, omega) = p(theta1 | omega) * p(theta2 | omega) * p(omega)
  t1 * t2 * o
}
Let’s make our new data object, d
.
d <- crossing(theta_1 = parameter_space,
              theta_2 = parameter_space,
              omega = parameter_space) |>
  mutate(prior = make_prior(theta_1, theta_2, omega, alpha = 2, beta = 2, kappa = 5)) |>
  # Here we normalize
  mutate(prior = prior / sum(prior))

glimpse(d)
Rows: 1,030,301
Columns: 4
$ theta_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ theta_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ omega <dbl> 0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.…
$ prior <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
Unlike what Kruschke said in the text (p. 233), we're not using a 3D data array. Rather, we're just using a tibble in which prior has been computed across all possible combinations of the three indexing variables: theta_1, theta_2, and omega. As you can see from the 'Rows' count, above, this makes for a very long tibble.
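As a small aside not in the text, you can confirm that the row count is the full Cartesian product of three 101-point combs, and get a rough sense of the memory footprint:

# Not from the text: one row per grid point in the 3D parameter space
length(parameter_space)^3             # 1030301, matching the 'Rows' count above
format(object.size(d), units = "MB")  # Roughly 30 MB of doubles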
“Because the parameter space is 3D, a distribution on it cannot easily be displayed on a 2D page. Instead, Figure 9.5 shows various marginal distributions” (p. 234). The consequence of that is when we marginalize, we’ll have to group by the two variables we’d like to retain for the plot. For example, the plots in the left and middle columns of the top row are the same save for their indices. So let’s just do the plot for theta_1
. In order to marginalize over theta_2
, we’ll need to group_by(theta_1, omega)
and then summarise(prior = sum(prior))
.
d |>
  group_by(theta_1, omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta_1, y = omega, fill = prior)) +
  geom_2dd() +
  labs(x = expression(theta[1]),
       y = expression(omega))
But we just have to average over omega
and theta_1
to plot their marginal prior distributions.
# For omega
d |>
  group_by(omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = omega, y = prior)) +
  geom_marginal(ul = 0.041) +
  labs(x = expression(omega),
       y = expression(p(omega))) +
  coord_flip()
# For theta_1
d |>
  group_by(theta_1) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta_1, y = prior)) +
  geom_marginal(ul = 0.041) +
  labs(x = expression(theta[1]),
       y = expression(p(theta[1])))
Before we make the plots in the middle row of Figure 9.5, we need to add the likelihoods. Recall that we're presuming the coin flips contained in $D_1$ and $D_2$ are independent across the two coins. In Kruschke's words,

independence of the data across the two coins means that the data from coin 1 depend only on the bias in coin 1, and the data from coin 2 depend only on the bias in coin 2, which can be expressed formally as $p(D_1 \mid \theta_1, \theta_2, \omega) = p(D_1 \mid \theta_1)$ and $p(D_2 \mid \theta_1, \theta_2, \omega) = p(D_2 \mid \theta_2)$. (p. 164)

The likelihood function for our two series of coin flips is then

$$p(D \mid \theta_1, \theta_2) = \theta_1^{z_1} (1 - \theta_1)^{(n_1 - z_1)} \cdot \theta_2^{z_2} (1 - \theta_2)^{(n_2 - z_2)}.$$

The upshot is we can compute the likelihoods for $\theta_1$ and $\theta_2$ one at a time with dbern() and then multiply them together.
# D1: 3 heads, 12 tails
n1 <- 15
z1 <- 3

# D2: 4 heads, 1 tail
n2 <- 5
z2 <- 4

d <- d |>
  mutate(likelihood_1 = dbern(x = theta_1, z = z1, n = n1),
         likelihood_2 = dbern(x = theta_2, z = z2, n = n2)) |>
  mutate(likelihood = likelihood_1 * likelihood_2)

head(d)
# A tibble: 6 × 7
theta_1 theta_2 omega prior likelihood_1 likelihood_2 likelihood
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 0
2 0 0 0.01 0 0 0 0
3 0 0 0.02 0 0 0 0
4 0 0 0.03 0 0 0 0
5 0 0 0.04 0 0 0 0
6 0 0 0.05 0 0 0 0
Now after a little group_by()
followed by summarise()
we can plot the two marginal likelihoods, the two plots in the middle row of Figure 9.5.
# likelihood_1
d |>
  group_by(theta_1, omega) |>
  summarise(likelihood = sum(likelihood)) |>
  ggplot(aes(x = theta_1, y = omega, fill = likelihood)) +
  geom_2dd() +
  labs(x = expression(theta[1]),
       y = expression(omega))

# likelihood_2
d |>
  group_by(theta_2, omega) |>
  summarise(likelihood = sum(likelihood)) |>
  ggplot(aes(x = theta_2, y = omega, fill = likelihood)) +
  geom_2dd() +
  labs(x = expression(theta[2]),
       y = expression(omega))
The likelihoods look good. Next we compute the posterior in the same way we’ve done before: multiply the prior and the likelihood and then divide by their sum in order to convert the results to a probability metric.
# Compute
d <- d |>
  mutate(posterior = (prior * likelihood) / sum(prior * likelihood))

# posterior_1
d |>
  group_by(theta_1, omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_1, y = omega, fill = posterior)) +
  geom_2dd() +
  labs(x = expression(theta[1]),
       y = expression(omega))

# posterior_2
d |>
  group_by(theta_2, omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_2, y = omega, fill = posterior)) +
  geom_2dd() +
  labs(x = expression(theta[2]),
       y = expression(omega))
Here's the right plot on the second row from the bottom, the marginal posterior distribution for $\omega$.
# For omega
d |>
  group_by(omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = omega, y = posterior)) +
  geom_marginal(ul = 0.041) +
  labs(x = expression(omega),
       y = expression(p(omega*"|"*D))) +
  coord_flip()
Now here are the marginal posterior plots on the bottom row of Figure 9.5.
# For theta_1
d |>
  group_by(theta_1) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_1, y = posterior)) +
  geom_marginal(ul = 0.041) +
  labs(x = expression(theta[1]),
       y = expression(p(theta[1]*"|"*D)))

# For theta_2
d |>
  group_by(theta_2) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_2, y = posterior)) +
  geom_marginal(ul = 0.041) +
  labs(x = expression(theta[2]),
       y = expression(p(theta[2]*"|"*D)))
We'll do this dog and pony one more time for Figure 9.6. Keeping the data and likelihood constant, we now recompute the joint prior with a larger kappa and save the results as d.
d <- crossing(theta_1 = parameter_space,
              theta_2 = parameter_space,
              omega = parameter_space) |>
  mutate(prior = make_prior(theta_1, theta_2, omega, alpha = 2, beta = 2, kappa = 75)) |>
  mutate(prior = prior / sum(prior))
Again, note how the only thing we changed from the last time was increasing kappa
to 75. Also like last time, the plots in the left and middle columns of the top row are the same save for their indices. But unlike last time, we’ll make both in preparation for a grand plotting finale. You’ll see.
One more step: for the 2D density plots in this section, we’ll omit the coord_equal()
line from our custom geom_2dd()
geom. This will help us with the formatting of our final plot.
geom_2dd <- function(...) {
  list(
    geom_raster(interpolate = TRUE),
    scale_fill_viridis_c(option = "A"),
    scale_x_continuous(breaks = 0:5 / 5, expand = c(0, 0), limits = c(0, 1)),
    scale_y_continuous(breaks = 0:5 / 5, expand = c(0, 0), limits = c(0, 1)),
    # coord_equal(),
    theme_minimal_grid(...),
    theme(legend.position = "none")
  )
}
Okay, here are our two 2D prior density plots.
p11 <- d |>
  group_by(theta_1, omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta_1, y = omega, fill = prior)) +
  geom_2dd(font_size = 10) +
  annotate(geom = "text", x = 0.05, y = 0.925,
           label = "p(list(theta[1], omega))",
           color = "white", hjust = 0, parse = TRUE) +
  labs(x = expression(theta[1]),
       y = expression(omega))

p12 <- d |>
  group_by(theta_2, omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta_2, y = omega, fill = prior)) +
  geom_2dd(font_size = 10) +
  annotate(geom = "text", x = 0.05, y = 0.925,
           label = "p(list(theta[2], omega))",
           color = "white", hjust = 0, parse = TRUE) +
  labs(x = expression(theta[2]),
       y = expression(omega))

p11
p12
Now we’ll average over omega
and theta
to plot their marginal prior distributions.
# For omega
p13 <- d |>
  group_by(omega) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = omega, y = prior)) +
  geom_marginal(ul = 0.041, font_size = 10) +
  labs(x = expression(omega),
       y = expression(p(omega))) +
  coord_flip()

# For theta_1
p21 <- d |>
  group_by(theta_1) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta_1, y = prior)) +
  geom_marginal(ul = 0.041, font_size = 10) +
  labs(x = expression(theta[1]),
       y = expression(p(theta[1])))

# For theta_2
p22 <- d |>
  group_by(theta_2) |>
  summarise(prior = sum(prior)) |>
  ggplot(aes(x = theta_2, y = prior)) +
  geom_marginal(ul = 0.041, font_size = 10) +
  labs(x = expression(theta[2]),
       y = expression(p(theta[2])))

p13
p21
p22
Let’s get those likelihoods in there and plot.
# D1: 3 heads, 12 tails
n1 <- 15
z1 <- 3

# D2: 4 heads, 1 tail
n2 <- 5
z2 <- 4

# Compute the likelihoods
d <- d |>
  mutate(likelihood_1 = dbern(x = theta_1, z = z1, n = n1),
         likelihood_2 = dbern(x = theta_2, z = z2, n = n2)) |>
  mutate(likelihood = likelihood_1 * likelihood_2)

# Plot likelihood_1
p31 <- d |>
  group_by(theta_1, omega) |>
  summarise(likelihood = sum(likelihood)) |>
  ggplot(aes(x = theta_1, y = omega, fill = likelihood)) +
  geom_2dd(font_size = 10) +
  labs(x = expression(theta[1]),
       y = expression(omega))

# Plot likelihood_2
p32 <- d |>
  group_by(theta_2, omega) |>
  summarise(likelihood = sum(likelihood)) |>
  ggplot(aes(x = theta_2, y = omega, fill = likelihood)) +
  geom_2dd(font_size = 10) +
  labs(x = expression(theta[2]),
       y = expression(omega))

p31
p32
Compute the posterior and make the left and middle plots of the second row to the bottom of Figure 9.6.
d <- d |>
  mutate(posterior = (prior * likelihood) / sum(prior * likelihood))

# posterior_1
p41 <- d |>
  group_by(theta_1, omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_1, y = omega, fill = posterior)) +
  geom_2dd(font_size = 10) +
  annotate(geom = "text", x = 0.05, y = 0.925,
           label = expression(p(list(theta[1], omega)*"|"*D)),
           color = "white", hjust = 0, parse = TRUE) +
  labs(x = expression(theta[1]),
       y = expression(omega))

# posterior_2
p42 <- d |>
  group_by(theta_2, omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_2, y = omega, fill = posterior)) +
  geom_2dd(font_size = 10) +
  annotate(geom = "text", x = 0.05, y = 0.925,
           label = expression(p(list(theta[2], omega)*"|"*D)),
           color = "white", hjust = 0, parse = TRUE) +
  labs(x = expression(theta[2]),
       y = expression(omega))

p41
p42
Here's the right plot on the same row, the marginal posterior distribution for $\omega$.
# For omega
p43 <- d |>
  group_by(omega) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = omega, y = posterior)) +
  geom_marginal(ul = 0.041, font_size = 10) +
  labs(x = expression(omega),
       y = expression(p(omega*"|"*D))) +
  coord_flip()

p43
Finally, here are the marginal posterior plots on the bottom row of Figure 9.6.
# For theta_1
p51 <- d |>
  group_by(theta_1) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_1, y = posterior)) +
  geom_marginal(ul = 0.041, font_size = 10) +
  labs(x = expression(theta[1]),
       y = expression(p(theta[1]*"|"*D)))

# For theta_2
p52 <- d |>
  group_by(theta_2) |>
  summarise(posterior = sum(posterior)) |>
  ggplot(aes(x = theta_2, y = posterior)) +
  geom_marginal(ul = 0.041, font_size = 10) +
  labs(x = expression(theta[2]),
       y = expression(p(theta[2]*"|"*D)))

p51
p52
Did you notice how we saved each plot from this last batch as an object? For the grand finale of this subsection, we'll be stitching all those subplots together using syntax from the patchwork package. But before we do, we need to define three more subplots: the subplots with the annotation.
d_text <- tibble(x = 1,
                 y = 10:8,
                 label = c("Prior", "list(A[omega]==2, B[omega]==2)", "K==75"),
                 size = c(2, 1, 1))

p23 <- d_text |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(aes(size = size),
            hjust = 0, parse = TRUE, show.legend = FALSE) +
  scale_size_continuous(range = c(3.5, 5.5)) +
  coord_cartesian(xlim = c(1, 2),
                  ylim = c(4, 11)) +
  theme_cowplot(font_size = 10) +
  theme(axis.line = element_blank(),
        axis.text = element_text(color = "white"),
        axis.ticks = element_blank(),
        text = element_text(color = "white"))

d_text <- tibble(x = 1,
                 y = 10:8,
                 label = c("Likelihood", "D1: 3 heads, 12 tails", "D2: 4 heads, 1 tail"),
                 size = c(2, 1, 1))

p33 <- d_text |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(aes(size = size),
            hjust = 0, show.legend = FALSE) +
  scale_size_continuous(range = c(3.5, 5.5)) +
  coord_cartesian(xlim = c(1, 2),
                  ylim = c(4, 11)) +
  theme_cowplot(font_size = 10) +
  theme(axis.line = element_blank(),
        axis.text = element_text(color = "white"),
        axis.ticks = element_blank(),
        text = element_text(color = "white"))

p53 <- ggplot() +
  annotate(geom = "text", x = 1, y = 10,
           label = "Posterior", hjust = 0, size = 6) +
  coord_cartesian(xlim = c(1, 2),
                  ylim = c(3, 11)) +
  theme_cowplot(font_size = 10) +
  theme(axis.line = element_blank(),
        axis.text = element_text(color = "white"),
        axis.ticks = element_blank(),
        text = element_text(color = "white"))
Okay, let’s make the full version of Figure 9.6.
(p11 / p21 / p31 / p41 / p51) | (p12 / p22 / p32 / p42 / p52) | (p13 / p23 / p33 / p43 / p53)
Oh mamma!
The grid approximation displayed in Figures 9.5 and 9.6 used combs of only 50 points on each parameter ($\theta_1$, $\theta_2$, and $\omega$). This means that the 3D grid had $50^3 = 125{,}000$ points, which is a size that can be handled easily on an ordinary desktop computer of the early 21st century. It is interesting to remind ourselves that the grid approximation displayed in Figures 9.5 and 9.6 would have been on the edge of computability a few decades ago, and would have been impossible a few decades before that. The number of points in a grid approximation can get hefty in a hurry. If we were to expand the example by including a third coin, with its parameter $\theta_3$, then the grid would have $50^4 = 6{,}250{,}000$ points, which already strains small computers. Include a fourth coin, and the grid contains over $0.3$ billion points. Grid approximation is not a viable approach to even modestly large problems, which we encounter next. (p. 235)
In case you didn't catch it, we used different numbers of points to evaluate each parameter than Kruschke did. Whereas Kruschke indicated in the text he only used 50, we used 101. That value of 101 came from how we defined our parameter_space with the code seq(from = 0, to = 1, by = 0.01). The reason we used a more densely-packed parameter space was to get smoother-looking 2D density plots.
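To make the combinatorial explosion concrete, here's a bit of arithmetic not in the text, comparing how the grid grows with the number of coins for both 50-point and 101-point combs. Each coin adds one $\theta_s$ parameter alongside the shared $\omega$.

# Not from the text: grid size as a function of the number of coins
tibble(coins = 2:4) |>
  mutate(parameters = coins + 1) |>
  mutate(`50-point grid` = 50^parameters,
         `101-point grid` = 101^parameters)

With four coins, even our 101-point comb would require over 10 billion grid cells.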
9.2.2 A realistic model with MCMC.
In this section, Kruschke freed up the previously fixed value of $\kappa$, giving it a gamma prior so it can be estimated from the data. Here's our version of the model diagram in Figure 9.7.
# A beta density
p1 <- tibble(x = seq(from = 0.01, to = 0.99, by = 0.01),
             d = (dbeta(x, 2, 2)) / max(dbeta(x, 2, 2))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = a_purple) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "beta",
           size = 7) +
  annotate(geom = "text",
           x = 0.5, y = 0.6,
           label = "italic(A[omega])*', '*italic(B[omega])",
           family = "Times", parse = TRUE, size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# A gamma density
p2 <- tibble(x = seq(from = 0, to = 5, by = 0.01),
             d = (dgamma(x, 1.75, 0.85) / max(dgamma(x, 1.75, 0.85)))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = a_purple) +
  annotate(geom = "text",
           x = 2.5, y = 0.2,
           label = "gamma",
           size = 7) +
  annotate(geom = "text",
           x = 2.5, y = 0.6,
           label = "list(italic(S)[kappa], italic(R)[kappa])",
           family = "Times", parse = TRUE, size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
p3 <- tibble(
  x = c(0.5, 0.475, 0.26, 0.08, 0.06, 0.5, 0.55, 0.85, 1.15, 1.175,
        1.5, 1.4, 1, 0.25, 0.2, 1.5, 1.49, 1.445, 1.4, 1.39),
  y = c(1, 0.7, 0.6, 0.5, 0.2, 1, 0.7, 0.6, 0.5, 0.2,
        1, 0.7, 0.6, 0.5, 0.2, 1, 0.75, 0.6, 0.45, 0.2),
  line = rep(letters[2:1], each = 5) |> rep(times = 2),
  plot = rep(1:2, each = 10)) |>
  ggplot(aes(x = x, y = y, group = interaction(plot, line))) +
  geom_bspline(aes(color = line),
               linewidth = 2/3, show.legend = FALSE) +
  annotate(geom = "text",
           x = 0, y = 0.1,
           label = "omega(kappa-2)+1*', '*(1-omega)(kappa-2)+1",
           family = "Times", hjust = 0, parse = TRUE, size = 7) +
  annotate(geom = "text",
           x = c(1/3, 1.15), y = 0.7,
           label = "'~'",
           family = "Times", parse = TRUE, size = 10) +
  scale_color_manual(values = c("grey75", "black")) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 2)) +
  ylim(0, 1) +
  theme_void()
# Another beta density
p4 <- tibble(x = seq(from = 0.01, to = 0.99, by = 0.01),
             d = (dbeta(x, 2, 2)) / max(dbeta(x, 2, 2))) |>
  ggplot(aes(x = x, y = d)) +
  geom_area(fill = a_purple) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "beta",
           size = 7) +
  scale_x_continuous(expand = c(0, 0)) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# An annotated arrow
p5 <- tibble(x = c(0.375, 0.625),
             y = c(1/3, 1/3),
             label = c("'~'", "italic(s)")) |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
  geom_segment(x = 0.5, xend = 0.5,
               y = 1, yend = 0,
               arrow = my_arrow) +
  xlim(0, 1) +
  theme_void()
# Bar plot of Bernoulli data
p6 <- tibble(x = 0:1,
             d = (dbinom(x, size = 1, prob = 0.6)) / max(dbinom(x, size = 1, prob = 0.6))) |>
  ggplot(aes(x = x, y = d)) +
  geom_col(fill = a_purple, width = 0.4) +
  annotate(geom = "text",
           x = 0.5, y = 0.2,
           label = "Bernoulli",
           size = 7) +
  annotate(geom = "text",
           x = 0.5, y = 0.94,
           label = "theta",
           family = "Times", parse = TRUE, size = 7) +
  xlim(-0.75, 1.75) +
  theme_void() +
  theme(axis.line.x = element_line(linewidth = 0.5))
# Another annotated arrow
p7 <- tibble(x = c(0.35, 0.65),
             y = c(1/3, 1/3),
             label = c("'~'", "italic(i)*'|'*italic(s)")) |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
  geom_segment(x = 0.5, xend = 0.5,
               y = 1, yend = 0,
               arrow = my_arrow) +
  xlim(0, 1) +
  theme_void()
# Some text
p8 <- tibble(x = 0.5,
             y = 0.5,
             label = "italic(y[i])['|'][italic(s)]") |>
  ggplot(aes(x = x, y = y, label = label)) +
  geom_text(family = "Times", parse = TRUE, size = 7) +
  xlim(0, 1) +
  theme_void()
# Define the layout
layout <- c(
  area(t = 1, b = 2, l = 1, r = 1),
  area(t = 1, b = 2, l = 2, r = 2),
  area(t = 4, b = 5, l = 1, r = 1),
  area(t = 3, b = 4, l = 1, r = 2),
  area(t = 6, b = 6, l = 1, r = 1),
  area(t = 7, b = 8, l = 1, r = 1),
  area(t = 9, b = 9, l = 1, r = 1),
  area(t = 10, b = 10, l = 1, r = 1))
# Plot!
(p1 + p2 + p4 + p3 + p5 + p6 + p7 + p8) +
  plot_layout(design = layout) &
  ylim(0, 1) &
  theme(plot.margin = margin(0, 5.5, 0, 5.5))
Because the value of $\kappa$ must be positive, Kruschke opted for a gamma prior for it. The gamma distribution is a probability density over the non-negative real numbers, governed by a shape parameter $s$ and a rate parameter $r$. To get a sense of it, here's our version of the four gamma densities Kruschke displayed in Figure 9.8.
# How many points do you want in your sequence of x values?
length <- 150

# Wrangle
tibble(shape = c(0.01, 1.56, 1, 6.25),
       rate = c(0.01, 0.0312, 0.02, 0.125)) |>
  expand_grid(x = seq(from = 0, to = 200, length.out = length)) |>
  mutate(mean = shape * 1 / rate,
         sd = sqrt(shape * (1 / rate)^2)) |>
  mutate(label = str_c("shape = ", shape, ", rate = ", rate,
                       "\nmean = ", mean, ", sd = ", round(sd, 4))) |>
  # Plot
  ggplot(aes(x = x, y = dgamma(x = x, shape = shape, rate = rate))) +
  geom_area(aes(fill = label)) +
  scale_x_continuous(expression(kappa), expand = expansion(mult = c(0, 0.05))) +
  scale_y_continuous(expression(p(kappa*"|"*s*","*r)), breaks = c(0, 0.01, 0.02),
                     expand = expansion(mult = c(0, 0.05))) +
  scale_fill_viridis_d(option = "A", end = 0.9, breaks = NULL) +
  coord_cartesian(xlim = c(0, 150)) +
  theme_cowplot(line_size = 0) +
  panel_border() +
  facet_wrap(~ label)
You can find the formulas for the mean and standard deviation of the gamma density in the mutate() statement for the data-prep stage of that last figure. For shape $s$ and rate $r$, they are

$$\mu = \frac{s}{r} \quad \text{and} \quad \sigma = \sqrt{s \left( \frac{1}{r} \right)^2} = \frac{\sqrt{s}}{r}.$$

Using those formulas, we can solve for $s$ and $r$ in terms of a desired mean and standard deviation,

$$s = \frac{\mu^2}{\sigma^2} \quad \text{and} \quad r = \frac{\mu}{\sigma^2},$$

or in terms of a desired mode $\omega$ and standard deviation,

$$r = \frac{\omega + \sqrt{\omega^2 + 4\sigma^2}}{2\sigma^2} \quad \text{and} \quad s = 1 + \omega r.$$

With those in hand, we can follow Kruschke's DBDA2E-utilities.R file to make a couple convenience functions.
gamma_s_and_r_from_mean_sd <- function(mean, sd) {
  if (mean <= 0) stop("`mean` must be > 0")
  if (sd <= 0) stop("`sd` must be > 0")
  shape <- mean^2 / sd^2
  rate  <- mean / sd^2
  list(shape = shape, rate = rate)
}

gamma_s_and_r_from_mode_sd <- function(mode, sd) {
  if (mode <= 0) stop("`mode` must be > 0")
  if (sd <= 0) stop("`sd` must be > 0")
  rate  <- (mode + sqrt(mode^2 + 4 * sd^2)) / (2 * sd^2)
  shape <- 1 + mode * rate
  list(shape = shape, rate = rate)
}
They’re easy to put to use:
gamma_s_and_r_from_mean_sd(mean = 10, sd = 100)
$shape
[1] 0.01
$rate
[1] 0.001
gamma_s_and_r_from_mode_sd(mode = 10, sd = 100)
$shape
[1] 1.105125
$rate
[1] 0.01051249
Here’s a more detailed look at the structure of their output.
gamma_param <- gamma_s_and_r_from_mode_sd(mode = 10, sd = 100)

str(gamma_param)
List of 2
$ shape: num 1.11
$ rate : num 0.0105
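As a quick simulation-based check on the mean/sd conversion, which isn't something Kruschke did in the text, we can draw from a gamma with the returned parameters and confirm the moments come back out:

# Not from the text: verify the conversion by simulation
set.seed(9)
params <- gamma_s_and_r_from_mean_sd(mean = 10, sd = 100)
x <- rgamma(n = 1e6, shape = params$shape, rate = params$rate)
mean(x)  # Should land near 10
sd(x)    # Should land near 100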
9.2.3 Doing it with brms.
Unlike JAGS, the brms formula
will not correspond as closely to the schematic in Figure 9.7. You’ll see in just a bit.
9.2.4 Example: Therapeutic touch.
Load the data from the TherapeuticTouchData.csv
file (see Rosa et al., 1998).
my_data <- read_csv("data.R/TherapeuticTouchData.csv")

glimpse(my_data)
Rows: 280
Columns: 2
$ y <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
$ s <chr> "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01"…
Here are what the data look like.
my_data |>
  mutate(y = y |> as.character()) |>
  ggplot(aes(y = y)) +
  geom_bar(aes(fill = after_stat(count))) +
  scale_fill_viridis_c(option = "A", end = 0.7, breaks = NULL) +
  scale_x_continuous(breaks = 0:4 * 2, expand = c(0, NA), limits = c(0, 9)) +
  theme_minimal_vgrid() +
  panel_border() +
  facet_wrap(~ s, ncol = 7)
And here’s our Figure 9.9.
my_data |>
  group_by(s) |>
  summarize(mean = mean(y)) |>
  ggplot(aes(x = mean)) +
  geom_histogram(color = "white", fill = a_purple,
                 linewidth = 0.2, binwidth = 0.1) +
  scale_x_continuous("Proportion Correct", limits = c(0, 1)) +
  scale_y_continuous("# Practitioners", expand = c(0, NA)) +
  theme_minimal_hgrid()
In applied statistics, the typical way to model a Bernoulli variable is with logistic regression. So rather than going through the pain of setting up a model in brms that mirrors the one in the text, I'm going to set up a hierarchical logistic regression model instead.

Note the family = bernoulli(link = logit) argument. In work-a-day regression with vanilla Gaussian variables, the prediction space is unbounded. But when we want to model the probability of a success for a Bernoulli variable (i.e., $\theta$), we need the predictions bounded between 0 and 1. The logit link accomplishes that by modeling $\theta$ on the unbounded log-odds scale.

In case you're curious, the logit of $\theta$ is defined as

$$\operatorname{logit}(\theta) = \log \left( \frac{\theta}{1 - \theta} \right),$$

and its inverse, the logistic function,

$$\theta = \frac{1}{1 + e^{-x}},$$

maps values from the log-odds scale back onto the probability scale.

But anyway, we'll be doing logistic regression using the logit link. Kruschke covered this in detail in Chapter 21.
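In base R, the logit and its inverse are available as qlogis() and plogis(), respectively. As a quick aside not in the text, here's how the pair behaves; we'll lean on plogis() throughout the rest of the chapter.

# Not from the text: qlogis() is the logit, plogis() the inverse logit
qlogis(0.5)           # 0; a 50% probability is 0 on the log-odds scale
plogis(0)             # 0.5
plogis(qlogis(0.75))  # 0.75; the round trip recovers the probability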
The next new part of our syntax is (1 | s)
. As in the popular frequentist lme4 package (Bates et al., 2015, 2022), you specify random effects or group-level parameters with the (|)
syntax in brms. On the left side of the |
, you tell brms what parameters you’d like to make random (i.e., vary by group). On the right side of the |
, you tell brms what variable you want to group the parameters by. In our case, we want the intercepts to vary over the grouping variable s
.
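Before fitting, you can ask brms which parameter classes that formula implies, and which priors it would use by default. This get_prior() step is optional and not part of the text's workflow, but it can be a helpful sanity check before overriding the defaults:

# Not from the text: inspect the default priors implied by our formula
get_prior(y ~ 1 + (1 | s),
          data = my_data,
          family = bernoulli(link = logit))

Now we're ready to fit the model.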
fit9.1 <- brm(
  data = my_data,
  family = bernoulli(link = logit),
  y ~ 1 + (1 | s),
  prior = c(prior(normal(0, 1.5), class = Intercept),
            prior(normal(0, 1), class = sd)),
  iter = 20000, warmup = 1000, thin = 10, chains = 4, cores = 4,
  seed = 9,
  file = "fits/fit09.01")
As it turns out, our normal(0, 1) prior for the hierarchical standard deviation is just one of several defensible choices; we might have used a half-Cauchy, such as the cauchy(0, 5), instead. See the prior wiki by the Stan team for more ideas on priors.
Here are the trace plots and posterior densities of the main parameters.
plot(fit9.1, widths = 2:3)
The trace plots indicate no problems with convergence. We’ll need to extract the posterior draws with as_draws_df()
.
draws <- as_draws_df(fit9.1)
One of the nice things about bayesplot is it returns ggplot2 objects. As such, we can amend their theme settings to be consistent with our other ggplot2 plots. Here we’ll amend bayesplot::mcmc_acf()
to the theme_cowplot()
theme.
draws |>
  mutate(chain = .chain) |>
  mcmc_acf(pars = vars(b_Intercept, sd_s__Intercept), lags = 10) +
  theme_cowplot()
It appears fit9.1 had very low autocorrelations. Here we'll examine the ratios of the effective number of posterior draws to the total number of post-warmup draws with bayesplot::mcmc_neff().
neff_ratio(fit9.1) |>
mcmc_neff() +
theme_cowplot(font_size = 12)
The ratios all hover around 0.9 or higher, suggesting the draws are nearly independent. Here's the model summary.
print(fit9.1)
Family: bernoulli
Links: mu = logit
Formula: y ~ 1 + (1 | s)
Data: my_data (Number of observations: 280)
Draws: 4 chains, each with iter = 20000; warmup = 1000; thin = 10;
total post-warmup draws = 7600
Multilevel Hyperparameters:
~s (Number of levels: 28)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.28 0.18 0.01 0.68 1.00 7103 7350
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.25 0.14 -0.54 0.02 1.00 7885 7200
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
We'll need the base R plogis() function to convert the model parameters from the log-odds metric back to the probability metric, which will let us make our version of the histograms in Figure 9.10.
# Wrangle
draws_small <- draws |>
  # Convert the linear model parameters to the probability space with `plogis()`
  mutate(`theta[1]` = (b_Intercept + `r_s[S01,Intercept]`) |> plogis(),
         `theta[14]` = (b_Intercept + `r_s[S14,Intercept]`) |> plogis(),
         `theta[28]` = (b_Intercept + `r_s[S28,Intercept]`) |> plogis()) |>
  # Make the difference distributions
  mutate(`theta[1] - theta[14]` = `theta[1]` - `theta[14]`,
         `theta[1] - theta[28]` = `theta[1]` - `theta[28]`,
         `theta[14] - theta[28]` = `theta[14]` - `theta[28]`) |>
  select(starts_with("theta"))
draws_small |>
  pivot_longer(cols = everything()) |>
  # This line is unnecessary, but will help order the plots
  mutate(name = factor(name, levels = c("theta[1]", "theta[14]", "theta[28]", "theta[1] - theta[14]",
                                        "theta[1] - theta[28]", "theta[14] - theta[28]"))) |>
  ggplot(aes(x = value, y = 0)) +
  stat_histinterval(point_interval = mode_hdi, .width = 0.95,
                    breaks = 40, fill = a_purple, normalize = "panels") +
  scale_y_continuous(NULL, breaks = NULL) +
  xlab(NULL) +
  theme_minimal_hgrid() +
  facet_wrap(~ name, ncol = 3, scales = "free")
If you wanted the specific values of the posterior modes and 95% HDIs, you could execute this.
draws_small |>
  pivot_longer(cols = everything()) |>
  group_by(name) |>
  mode_hdi(value) |>
  mutate_if(is.double, .funs = round, digits = 3)
# A tibble: 6 × 7
name value .lower .upper .width .point .interval
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 theta[1] 0.421 0.206 0.521 0.95 mode hdi
2 theta[1] - theta[14] 0 -0.274 0.12 0.95 mode hdi
3 theta[1] - theta[28] -0.002 -0.428 0.068 0.95 mode hdi
4 theta[14] 0.437 0.289 0.579 0.95 mode hdi
5 theta[14] - theta[28] -0.001 -0.328 0.105 0.95 mode hdi
6 theta[28] 0.463 0.363 0.71 0.95 mode hdi
And here are the Figure 9.10 scatter plots.
p1 <- draws_small |>
  ggplot(aes(x = `theta[1]`, y = `theta[14]`)) +
  geom_abline(linetype = 2) +
  geom_point(alpha = 1/8, color = a_purple, size = 1/8)

p2 <- draws_small |>
  ggplot(aes(x = `theta[1]`, y = `theta[28]`)) +
  geom_abline(linetype = 2) +
  geom_point(alpha = 1/8, color = a_purple, size = 1/8)

p3 <- draws_small |>
  ggplot(aes(x = `theta[14]`, y = `theta[28]`)) +
  geom_abline(linetype = 2) +
  geom_point(alpha = 1/8, color = a_purple, size = 1/8)

(p1 + p2 + p3) &
  coord_cartesian(xlim = c(0, 1),
                  ylim = c(0, 1)) &
  theme_minimal_grid()
This is the posterior distribution for the population-level estimate of $\theta$, based on the grand mean, b_Intercept, converted back to the probability metric.
# This part makes it easier to set the break points in `scale_x_continuous()`
labels <- draws |>
  transmute(theta = b_Intercept |> plogis()) |>
  mode_hdi() |>
  pivot_longer(cols = theta:.upper) |>
  mutate(label = value |> round(digits = 3) |> as.character())
draws |>
  mutate(theta = b_Intercept |> plogis()) |>
  ggplot(aes(x = theta, y = 0)) +
  stat_histinterval(point_interval = mode_hdi, .width = 0.95,
                    breaks = 40, fill = a_purple) +
  scale_x_continuous(expression(theta),
                     breaks = labels$value,
                     labels = labels$label) +
  scale_y_continuous(NULL, breaks = NULL) +
  theme_minimal_hgrid()
I'm not aware there's a straight conversion from our model's sd_s__Intercept parameter, which lives on the log-odds scale, to a probability-scale summary of the between-practitioner variability. But we can compute one by hand. We can use coef() to "extract [the] model coefficients, which are the sum of population-level effects and corresponding group-level effects" (Bürkner, 2022a, p. 58). With the model coefficient draws in hand, you can index them by posterior iteration, group them by that index, convert them to the probability metric, and compute the iteration-level standard deviation across the 28 practitioners.
# The tibble of the primary data
sigmas <- coef(fit9.1, summary = FALSE)$s |>
  as_tibble() |>
  mutate(iter = row_number()) |>
  group_by(iter) |>
  pivot_longer(cols = -iter) |>
  mutate(theta = plogis(value)) |>
  summarise(sd = sd(theta))
# This, again, is just to customize `scale_x_continuous()`
labels <- sigmas |>
  mode_hdi(sd) |>
  pivot_longer(cols = sd:.upper) |>
  mutate(label = value |> round(3) |> as.character())
# The plot
sigmas |>
  ggplot(aes(x = sd, y = 0)) +
  stat_histinterval(point_interval = mode_hdi, .width = 0.95,
                    breaks = 40, fill = a_purple) +
  scale_x_continuous(expression(paste(sigma, " of ", theta, " in a probability metric")),
                     breaks = labels$value,
                     labels = labels$label) +
  scale_y_continuous(NULL, breaks = NULL) +
  theme_minimal_hgrid()
And now that you have a sense of how to do all of that by hand, note that bayesplot::mcmc_pairs() offers a fairly quick way to get a good portion of Figure 9.10.
color_scheme_set("purple")
bayesplot_theme_set(theme_default() + theme_minimal_grid())
coef(fit9.1, summary = FALSE)$s |>
plogis() |>
data.frame() |>
rename(`theta[1]` = S01.Intercept,
`theta[14]` = S14.Intercept,
`theta[28]` = S28.Intercept) |>
select(`theta[1]`, `theta[14]`, `theta[28]`) |>
mcmc_pairs(off_diag_args = list(size = 1/8, alpha = 1/8))
Did you see how we slipped in the color_scheme_set() and bayesplot_theme_set() lines at the top? Usually, the plots made with bayesplot are easy to modify with ggplot2 syntax. Plots made with the mcmc_pairs() function are one notable exception. On the back end, these are made by combining multiple ggplots into a grid, a down-the-line result of which is that they are difficult to modify. Happily, one can make some modifications beforehand by altering the global settings with the color_scheme_set() and bayesplot_theme_set() functions. You can learn more in the discussion on issue #128 on the bayesplot GitHub repo.
Kruschke used a flat beta(1, 1) prior for $\omega$ in his version of the model. We can get a sense of what that looks like by simulating a large number of draws from it.
set.seed(1)
tibble(prior = rbeta(n = 1e5, 1, 1)) |>
ggplot(aes(x = prior)) +
geom_histogram(binwidth = 0.05, boundary = 0, color = "white",
fill = a_purple, linewidth = 0.2) +
scale_x_continuous(expression(omega), labels = c(0, ".25", ".5", ".75", 1)) +
scale_y_continuous(NULL, breaks = NULL) +
coord_cartesian(xlim = c(0, 1)) +
theme_minimal_hgrid()
You’ll note that plot corresponds to the upper right panel of Figure 9.11.
Recall that we used a logistic regression model with a normal(0, 1.5)
prior on the intercept. If you sample from normal(0, 1.5)
and then convert the draws using plogis()
, you’ll discover that our normal(0, 1.5)
prior was virtually flat on the probability scale. Here we’ll show the consequence of a variety of zero-mean Gaussian priors for the intercept of a logistic regression model.
# Define a function
r_norm <- function(i, n = 1e4) {
  set.seed(1)
  rnorm(n = n, mean = 0, sd = i) |>
    plogis()
}
# Simulate and wrangle
tibble(sd = seq(from = 0.25, to = 3, by = 0.25)) |>
group_by(sd) |>
mutate(prior = map(sd, r_norm)) |>
unnest(prior) |>
ungroup() |>
mutate(sd = str_c("sd = ", sd)) |>
# Plot!
ggplot(aes(x = prior)) +
geom_histogram(binwidth = 0.05, boundary = 0, color = "white",
fill = a_purple, linewidth = 0.2) +
scale_x_continuous(labels = c(0, ".25", ".5", ".75", 1)) +
scale_y_continuous(NULL, breaks = NULL) +
coord_cartesian(xlim = c(0, 1)) +
theme_minimal_hgrid() +
panel_border() +
facet_wrap(~ sd)
It appears that as the sd of the Gaussian prior increases, the prior on the probability scale morphs from peaked around 0.5 to piling up near the 0 and 1 boundaries. Around sd = 1.5, it is nearly uniform, which is why we chose normal(0, 1.5) for our intercept.
Next, Kruschke examined the prior distribution. There are a few ways to do this. Like in the last chapter, the one we’ll explore involves adding the sample_prior = "only"
argument to the brm()
function. When you do so, the results of the model are just the prior. That is, brm()
leaves out the likelihood. This returns a bunch of draws from the prior predictive distribution.
fit9.1_prior <- brm(
  data = my_data,
  family = bernoulli(link = logit),
  y ~ 1 + (1 | s),
  prior = c(prior(normal(0, 1.5), class = Intercept),
            prior(normal(0, 1), class = sd)),
  iter = 20000, warmup = 1000, thin = 10, chains = 4, cores = 4,
  seed = 9,
  sample_prior = "only",
  file = "fits/fit09.01_prior")
If we feed fit9.1_prior
into the as_draws_df()
function, we’ll get back a data frame of draws from the prior, but with the same parameter names we’d get from the posterior.
prior_draws <- as_draws_df(fit9.1_prior)

head(prior_draws)
# A draws_df: 6 iterations, 1 chains, and 33 variables
b_Intercept sd_s__Intercept Intercept r_s[S01,Intercept] r_s[S02,Intercept]
1 -0.96 0.91 -0.96 -0.010 0.97
2 0.86 1.29 0.86 0.043 -0.12
3 -0.72 0.41 -0.72 0.261 -0.04
4 0.92 1.42 0.92 -1.748 0.81
5 -2.03 0.99 -2.03 0.699 -0.65
6 -0.84 0.88 -0.84 -0.353 1.69
r_s[S03,Intercept] r_s[S04,Intercept] r_s[S05,Intercept]
1 -0.63 -0.265 1.18
2 -2.43 -0.419 0.70
3 -0.70 0.324 -0.35
4 0.88 0.048 0.56
5 1.08 -0.848 0.81
6 0.10 -0.012 0.67
# ... with 25 more variables
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
And here we’ll take a subset of the columns in prior_draws
, transform the results to the probability metric, and save.
prior_draws <- prior_draws |>
  transmute(`theta[1]` = b_Intercept + `r_s[S01,Intercept]`,
            `theta[14]` = b_Intercept + `r_s[S14,Intercept]`,
            `theta[28]` = b_Intercept + `r_s[S28,Intercept]`) |>
  mutate_all(.funs = plogis)

head(prior_draws)
# A tibble: 6 × 3
`theta[1]` `theta[14]` `theta[28]`
<dbl> <dbl> <dbl>
1 0.275 0.413 0.764
2 0.712 0.526 0.278
3 0.387 0.170 0.180
4 0.305 0.858 0.295
5 0.209 0.123 0.0741
6 0.233 0.141 0.579
Now we can use our prior_draws
object to make the diagonal of the lower grid of Figure 9.11.
prior_draws |>
  pivot_longer(cols = everything()) |>
  ggplot(aes(x = value)) +
  geom_histogram(binwidth = 0.05, boundary = 0, color = "white",
                 fill = a_purple, linewidth = 0.2) +
  scale_x_continuous(labels = c(0, ".25", ".5", ".75", 1)) +
  scale_y_continuous(NULL, breaks = NULL) +
  coord_cartesian(xlim = c(0, 1)) +
  theme_minimal_hgrid() +
  panel_border() +
  facet_wrap(~ name)
With a little subtraction, we can reproduce the plots in the upper triangle.
prior_draws |>
  mutate(`theta[1] - theta[14]` = `theta[1]` - `theta[14]`,
         `theta[1] - theta[28]` = `theta[1]` - `theta[28]`,
         `theta[14] - theta[28]` = `theta[14]` - `theta[28]`) |>
  pivot_longer(cols = contains("-")) |>
  ggplot(aes(x = value)) +
  geom_histogram(binwidth = 0.05, boundary = 0, color = "white",
                 fill = a_purple, linewidth = 0.2) +
  scale_y_continuous(NULL, breaks = NULL) +
  theme_minimal_hgrid() +
  panel_border() +
  facet_wrap(~ name)
Those plots clarify that our hierarchical logistic regression model was a little more regularizing than Kruschke's: our priors imposed greater shrinkage of the differences toward zero. The prose in the next section of the text clarifies this isn't necessarily a bad thing.
Finally, here are the plots for the lower triangle in Figure 9.11.
p1 <- prior_draws |>
  ggplot(aes(x = `theta[1]`, y = `theta[14]`)) +
geom_point(alpha = 1/8, color = a_purple, size = 1/8)
p2 <- prior_draws |>
  ggplot(aes(x = `theta[1]`, y = `theta[28]`)) +
geom_point(alpha = 1/8, color = a_purple, size = 1/8)
p3 <- prior_draws |>
  ggplot(aes(x = `theta[14]`, y = `theta[28]`)) +
geom_point(alpha = 1/8, color = a_purple, size = 1/8)
(p1 + p2 + p3) &
  geom_abline(linetype = 2) &
coord_cartesian(xlim = c(0, 1),
ylim = c(0, 1)) &
theme_minimal_grid()
In case you were curious, here are the Pearson’s correlation coefficients among the priors.
cor(prior_draws) |> round(digits = 2)
theta[1] theta[14] theta[28]
theta[1] 1.00 0.72 0.73
theta[14] 0.72 1.00 0.71
theta[28] 0.73 0.71 1.00
9.3 Shrinkage in hierarchical models
“In typical hierarchical models, the estimates of low-level parameters are pulled closer together than they would be if there were not a higher-level distribution. This pulling together is called shrinkage of the estimates” (p. 245, emphasis in the original)
Further,
shrinkage is a rational implication of hierarchical model structure, and is (usually) desired by the analyst because the shrunken parameter estimates are less affected by random sampling noise than estimates derived without hierarchical structure. Intuitively, shrinkage occurs because the estimate of each low-level parameter is influenced from two sources: (1) the subset of data that are directly dependent on the low-level parameter, and (2) the higher-level parameters on which the low-level parameter depends. The higher-level parameters are affected by all the data, and therefore the estimate of a low-level parameter is affected indirectly by all the data, via their influence on the higher-level parameters. (p. 247)
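This isn't from Kruschke, but here's a toy sketch of that intuition with made-up counts. Each subject's no-pooling estimate z/n gets pulled toward the grand mean, and the pull is weaker for subjects with more data. The weight formula n / (n + 20) is an arbitrary stand-in for the precision weighting a real hierarchical model would estimate from the data.

# Made-up data: z successes in n trials for three subjects
toy <- tibble(s = factor(1:3),
              z = c(9, 3, 30),
              n = c(10, 10, 60))

toy |>
  mutate(no_pooling = z / n,
         grand_mean = sum(z) / sum(n),
         # Arbitrary shrinkage weight: more data, less shrinkage
         w = n / (n + 20),
         partial_pooling = w * no_pooling + (1 - w) * grand_mean)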
Recall Formula 9.4 from page 223,

$$\theta \sim \operatorname{Beta}\big(\omega(\kappa - 2) + 1,\ (1 - \omega)(\kappa - 2) + 1\big),$$

with which we can express dbeta()'s shape1 and shape2 arguments in terms of $\omega$ and $\kappa$.
omega  <- 0.5
kappa1 <- 2.1
kappa2 <- 15.8
tibble(x = seq(from = 0, to = 1, by = 0.001)) |>
mutate(`kappa==2.1` = dbeta(x = x,
shape1 = omega * (kappa1 - 2) + 1,
shape2 = (1 - omega) * (kappa1 - 2) + 1),
`kappa==15.8` = dbeta(x = x,
shape1 = omega * (kappa2 - 2) + 1,
shape2 = (1 - omega) * (kappa2 - 2) + 1)) |>
pivot_longer(cols = -x) |>
mutate(name = factor(name, levels = c("kappa==2.1", "kappa==15.8"))) |>
ggplot(aes(x = x, y = value)) +
geom_area(fill = a_purple) +
scale_y_continuous(expression(dbeta(theta*"|"*omega*", "*kappa)), breaks = NULL) +
xlab(expression(Data~Proportion~or~theta~value)) +
theme_minimal_hgrid() +
panel_border() +
facet_wrap(~ name, labeller = label_parsed)
This isn’t in the text, but it might help if we gave a sense of multilevel shrinkage by plotting the phenomena using the results from our model fit9.1
.
my_data |>
  group_by(s) |>
summarise(p = mean(y)) |>
mutate(theta = coef(fit9.1)$s[, 1, "Intercept"] |> plogis()) |>
pivot_longer(cols = -s) |>
# Add a little jitter to reduce the overplotting
mutate(value = value + runif(n = n(), min = -0.02, max = 0.02),
name = if_else(name == "p", "italic(z/N)", "theta")) |>
ggplot(aes(x = value, y = name, group = s)) +
geom_point(color = alpha(a_purple, 1/2)) +
geom_line(linewidth = 1/3, alpha = 1/3) +
scale_x_continuous(breaks = 0:5 / 5, expand = c(0.01, 0.01), limits = 0:1) +
scale_y_discrete(NULL, labels = ggplot2:::parse_safe) +
labs(x = "data proportion or theta value",
title = "Multilevel shrinkage in fit9.1") +
theme_minimal_hgrid() +
panel_border()
The dots are grouped by s, the grouping variable in the my_data data: one row of the plot shows the observed proportions z/N, the other shows the model-based theta estimates, with a line connecting the two values for each level of s. You'll note that we jittered the values for both within the second mutate() line to help reduce the overplotting. If you don't understand what that means, run the code without that line or set the values within runif() closer to zero. You'll see. Anyway, for more on multilevel shrinkage and for plots of this kind, check out Efron and Morris's classic (1977) paper, Stein's paradox in statistics, and my blog post walking out one of their examples in brms.
9.4 Speeding up JAGS [brms]
Here we’ll compare the time it takes to fit fit1
as either bernoulli(link = logit)
or binomial(link = logit)
.
# Bernoulli
start_time_bernoulli <- proc.time()

brm(data = my_data,
    family = bernoulli(link = logit),
    y ~ 1 + (1 | s),
    prior = c(prior(normal(0, 1.5), class = Intercept),
              prior(normal(0, 1), class = sd)),
    iter = 20000, warmup = 1000, thin = 10, chains = 4, cores = 4,
    seed = 9)

stop_time_bernoulli <- proc.time()

# Binomial
start_time_binomial <- proc.time()

brm(data = my_data,
    family = binomial(link = logit),
    y | trials(1) ~ 1 + (1 | s),
    prior = c(prior(normal(0, 1.5), class = Intercept),
              prior(normal(0, 1), class = sd)),
    iter = 20000, warmup = 1000, thin = 10, chains = 4, cores = 4,
    seed = 9)

stop_time_binomial <- proc.time()
See how we’re using proc.time()
to record when we began and finished evaluating our brm()
code? The last time we covered that was way back in Section 3.7.5. In that section, we also learned how subtracting the former from the latter yields the total elapsed time.
stop_time_bernoulli - start_time_bernoulli
user system elapsed
20.863 1.269 31.873
stop_time_binomial - start_time_binomial
user system elapsed
20.602 1.138 32.010
These times are based on my current laptop (a 2023 MacBook Pro). Your mileage may vary. If you wanted to be rigorous about this, you could do this multiple times in a mini simulation.
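Here's a minimal sketch of what such a mini simulation might look like, assuming fit9.1 from earlier in the chapter is still in your workspace. Because update() re-uses the compiled model, these timings mostly reflect sampling rather than compilation. The five replicates are an arbitrary choice.

# Time five refits of the Bernoulli model; collect the elapsed seconds
timings <- map_dbl(1:5, \(i) {
  system.time(
    update(fit9.1, seed = i, cores = 4)
  )["elapsed"]
})

summary(timings)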
As to the issue of parallel processing, we’ve been doing this all along. Note our chains = 4, cores = 4
arguments.
Since Kruschke wrote his text, we have other options for speeding up your brms models related to within-chain parallelization and the backend = "cmdstanr"
option. For all the details, see Weber & Bürkner’s (2022) vignette, Running brms models with within-chain parallelization.
9.5 Extending the hierarchy: Subjects within categories
Many data structures invite hierarchical descriptions that may have multiple levels. Software such as
JAGS[brms] makes it easy to implement hierarchical models, and Bayesian inference makes it easy to interpret the parameter estimates, even for complex nonlinear hierarchical models. Here, we take a look at one type of extended hierarchical model. (p. 251)
As we will address below, our version of Figure 9.13 will look rather different from Kruschke's. It's something of a combination of the sensibilities from Figures 20.2 and 21.10. Even still, the diagram is of a three-level model that shares many similarities with Kruschke's and, as we will see, yields very similar results.
# Half-normal density
p1 <- tibble(x = seq(from = 0, to = 3, by = 0.01)) |>
  ggplot(aes(x = x, y = (dnorm(x)) / max(dnorm(x)))) +
geom_area(fill = a_purple) +
annotate(geom = "text",
x = 1.5, y = 0.2,
label = "half-normal",
size = 7) +
annotate(geom = "text",
x = 1.5, y = 0.6,
label = "0*','*~italic(S)[italic(s)*'|'*italic(c)]",
family = "Times", parse = TRUE, size = 7) +
scale_x_continuous(expand = c(0, 0)) +
theme_void() +
theme(axis.line.x = element_line(linewidth = 0.5))
# Second half-normal density
p2 <- tibble(x = seq(from = 0, to = 3, by = 0.01)) |>
  ggplot(aes(x = x, y = (dnorm(x)) / max(dnorm(x)))) +
geom_area(fill = a_purple) +
annotate(geom = "text",
x = 1.5, y = 0.2,
label = "half-normal",
size = 7) +
annotate(geom = "text",
x = 1.5, y = 0.6,
label = "0*','*~italic(S)[italic(c)]",
family = "Times", parse = TRUE, size = 7) +
scale_x_continuous(expand = c(0, 0)) +
theme_void() +
theme(axis.line.x = element_line(linewidth = 0.5))
# Annotated arrow
p3 <- tibble(x = 0.85,
             y = 1,
xend = 0.5,
yend = 0.25) |>
ggplot(aes(x = x, xend = xend,
y = y, yend = yend)) +
geom_segment(arrow = my_arrow) +
annotate(geom = "text",
x = 0.54, y = 0.6, label = "'~'",
family = "Times", parse = TRUE, size = 10) +
xlim(0, 1) +
theme_void()
# Normal density
p4 <- tibble(x = seq(from = -3, to = 3, by = 0.1)) |>
  ggplot(aes(x = x, y = (dnorm(x)) / max(dnorm(x)))) +
geom_area(fill = a_purple) +
annotate(geom = "text",
x = 0, y = 0.2,
label = "normal",
size = 7) +
annotate(geom = "text",
x = c(0, 1.45), y = 0.6,
hjust = c(0.5, 0),
label = c("italic(M)[0]", "italic(S)[0]"),
family = "Times", parse = TRUE, size = 7) +
scale_x_continuous(expand = c(0, 0)) +
theme_void() +
theme(axis.line.x = element_line(linewidth = 0.5))
# Second normal density
p5 <- tibble(x = seq(from = -3, to = 3, by = 0.1)) |>
  ggplot(aes(x = x, y = (dnorm(x)) / max(dnorm(x)))) +
  geom_area(fill = a_purple) +
annotate(geom = "text",
x = 0, y = 0.2,
label = "normal",
size = 7) +
annotate(geom = "text",
x = c(0, 1.45), y = 0.6,
hjust = c(0.5, 0),
label = c("0", "sigma[italic(s)*'|'*italic(c)]"),
family = "Times", parse = TRUE, size = 7) +
scale_x_continuous(expand = c(0, 0)) +
theme_void() +
theme(axis.line.x = element_line(linewidth = 0.5))
# Third normal density
p6 <- tibble(x = seq(from = -3, to = 3, by = 0.1)) |>
  ggplot(aes(x = x, y = (dnorm(x)) / max(dnorm(x)))) +
geom_area(fill = a_purple) +
annotate(geom = "text",
x = 0, y = 0.2,
label = "normal",
size = 7) +
annotate(geom = "text",
x = c(0, 1.45), y = 0.6,
hjust = c(0.5, 0),
label = c("0", "sigma[italic(c)]"),
family = "Times", parse = TRUE, size = 7) +
scale_x_continuous(expand = c(0, 0)) +
theme_void() +
theme(axis.line.x = element_line(linewidth = 0.5))
# Three annotated arrows
p7 <- tibble(x = c(0.09, 0.48, 0.9),
             y = 1,
xend = c(0.2, 0.425, 0.775),
yend = 0.2) |>
ggplot(aes(x = x, xend = xend,
y = y, yend = yend)) +
geom_segment(arrow = my_arrow) +
annotate(geom = "text",
x = c(0.10, 0.42, 0.49, 0.81, 0.87), y = 0.6,
label = c("'~'", "'~'", "italic(s)*'|'*italic(c)", "'~'", "italic(c)"),
family = "Times", parse = TRUE, size = c(10, 10, 7, 10, 7)) +
xlim(0, 1) +
theme_void()
# Likelihood formula
p8 <- tibble(x = 0.5,
             y = 0.5,
label = "logistic(beta[0]+sum()[italic(s)*'|'*italic(c)]*beta['['*italic(s)*'|'*italic(c)*']']*italic(x)['['*italic(s)*'|'*italic(c)*']'](italic(i))+sum()[italic(c)]*beta['['*italic(c)*']']*italic(x)['['*italic(c)*']'](italic(i)))") |>
ggplot(aes(x = x, y = y, label = label)) +
geom_text(family = "Times", parse = TRUE, size = 7) +
scale_x_continuous(expand = c(0, 0), limits = c(0, 1)) +
ylim(0, 1) +
theme_void()
# A second annotated arrow
p9 <- tibble(x = c(0.375, 0.5),
             y = c(0.75, 0.3),
label = c("'='", "mu[italic(i)*'|'*italic(sc)]")) |>
ggplot(aes(x = x, y = y, label = label)) +
geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
geom_segment(x = 0.5, xend = 0.5,
y = 1, yend = 0.4,
arrow = my_arrow) +
xlim(0, 1) +
theme_void()
# Binomial density
p10 <- tibble(x = 0:7) |>
  ggplot(aes(x = x,
y = (dbinom(x, size = 7, prob = 0.625)) / max(dbinom(x, size = 7, prob = 0.625)))) +
geom_col(fill = a_purple, width = 0.4) +
annotate(geom = "text",
x = 3.5, y = 0.2,
label = "binomial",
size = 7) +
annotate(geom = "text",
x = 7, y = 0.85,
label = "italic(N)[italic(i)*'|'*italic(s)]",
family = "Times", parse = TRUE, size = 7) +
coord_cartesian(xlim = c(-1, 8),
ylim = c(0, 1.2)) +
theme_void() +
theme(axis.line.x = element_line(linewidth = 0.5))
# Another annotated arrow
p11 <- tibble(x = c(0.375, 0.7),
              y = c(1/3, 1/3),
label = c("'~'", "italic(i)*'|'*italic(sc)")) |>
ggplot(aes(x = x, y = y, label = label)) +
geom_text(family = "Times", parse = TRUE, size = c(10, 7)) +
geom_segment(x = 0.5, xend = 0.5,
y = 1, yend = 0,
arrow = my_arrow) +
xlim(0, 1) +
theme_void()
# Some text
p12 <- tibble(x = 1,
              y = 0.5,
label = "italic(y)[italic(i)*'|'*italic(sc)]") |>
ggplot(aes(x = x, y = y, label = label)) +
geom_text(family = "Times", parse = TRUE, size = 7) +
xlim(0, 2) +
theme_void()
# Define the layout
layout <- c(
  area(t = 1, b = 2, l = 7, r = 9),
area(t = 1, b = 2, l = 11, r = 13),
area(t = 4, b = 5, l = 1, r = 3),
area(t = 4, b = 5, l = 5, r = 7),
area(t = 4, b = 5, l = 9, r = 11),
area(t = 3, b = 4, l = 6, r = 8),
area(t = 3, b = 4, l = 10, r = 12),
area(t = 7, b = 8, l = 1, r = 11),
area(t = 6, b = 7, l = 1, r = 11),
area(t = 11, b = 12, l = 5, r = 7),
area(t = 9, b = 11, l = 5, r = 7),
area(t = 13, b = 14, l = 5, r = 7),
area(t = 15, b = 15, l = 5, r = 7))
# Combine and plot!
(p1 + p2 + p4 + p5 + p6 + p3 + p3 + p8 + p7 + p10 + p9 + p11 + p12) +
  plot_layout(design = layout) &
ylim(0, 1) &
theme(plot.margin = margin(0, 5.5, 0, 5.5))
Though we will be fitting a hierarchical model with subjects nested within categories, the details of our approach will differ a bit from Kruschke's, as we'll see in the example below.
9.5.1 Example: Baseball batting abilities by position.
Here are the batting average data.
my_data <- read_csv("data.R/BattingAverage.csv")
glimpse(my_data)
Rows: 948
Columns: 6
$ Player <chr> "Fernando Abad", "Bobby Abreu", "Tony Abreu", "Dustin Ack…
$ PriPos <chr> "Pitcher", "Left Field", "2nd Base", "2nd Base", "1st Bas…
$ Hits <dbl> 1, 53, 18, 137, 21, 0, 0, 2, 150, 167, 0, 128, 66, 3, 1, …
$ AtBats <dbl> 7, 219, 70, 607, 86, 1, 1, 20, 549, 576, 1, 525, 275, 12,…
$ PlayerNumber <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
$ PriPosNumber <dbl> 1, 7, 4, 4, 3, 1, 1, 3, 3, 4, 1, 5, 4, 2, 7, 4, 6, 8, 9, …
In his footnote #6, Kruschke indicated he retrieved the data from http://www.baseball-reference.com/leagues/MLB/2012-standard-batting.shtml in December of 2012. To give a sense of the data, here are the counts of players by primary position, PriPos, along with their median at-bat (AtBats) values.
my_data |>
  group_by(PriPos) |>
summarise(n = n(),
median = median(AtBats)) |>
arrange(desc(n))
# A tibble: 9 × 3
PriPos n median
<chr> <int> <dbl>
1 Pitcher 324 4
2 Catcher 103 170
3 Left Field 103 164
4 1st Base 81 265
5 3rd Base 75 267
6 2nd Base 72 228.
7 Center Field 67 259
8 Shortstop 63 205
9 Right Field 60 340.
As these data are aggregated, we'll fit with an aggregated binomial model. This is still logistic regression; the Bernoulli distribution is a special case of the binomial distribution in which the number of trials in each data point is 1 (see Bürkner, 2022b for details). Since our data are aggregated, the information encoded in Hits is a combination of multiple trials, which requires us to jump up to the more general binomial likelihood. Note the Hits | trials(AtBats) syntax. With that bit, we instructed brms that our criterion, Hits, is an aggregate of multiple trials and that the number of trials is encoded in AtBats.
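As a quick check of that special-case relation (this isn't in the text), the binomial probability mass function with size = 1 reduces to the Bernoulli pmf:

# With a single trial, the binomial pmf is the Bernoulli pmf
dbinom(x = c(0, 1), size = 1, prob = 0.3)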
Also note the (1 | PriPos) + (1 | PriPos:Player)
syntax. In this model, we have two grouping factors, PriPos
and Player
. Thus we have two (|)
arguments. But since players are themselves nested within positions, we have encoded that nesting with the (1 | PriPos:Player)
syntax. For more on this style of syntax, see Kristoffer Magnusson’s handy blog post, Using R and lme/lmer to fit different two- and three-level longitudinal models. Since brms syntax is based on that from the earlier lme4 package, the basic syntax rules apply. Bürkner (2022a), of course, also covered these topics in the brmsformula
subsection of the brms reference manual.
fit9.2 <- brm(
  data = my_data,
  family = binomial(link = logit),
  Hits | trials(AtBats) ~ 1 + (1 | PriPos) + (1 | PriPos:Player),
  prior = c(prior(normal(0, 1.5), class = Intercept),
prior(normal(0, 1), class = sd)),
iter = 3500, warmup = 500, chains = 3, cores = 3,
control = list(adapt_delta = 0.99),
seed = 9,
file = "fits/fit09.02")
The chains look good.
plot(fit9.2, widths = 2:3)
Note how our color_scheme_set("purple") line from back up in the mcmc_pairs() section has affected the color scheme of brms::plot().
We might examine the autocorrelations within the chains.
draws <- as_draws_df(fit9.2)

draws |>
  mutate(chain = .chain) |>
mcmc_acf(pars = vars(b_Intercept:`sd_PriPos:Player__Intercept`),
lags = 8) +
theme_minimal_hgrid()
Here’s a histogram of the
fit9.2 |>
  neff_ratio() |>
mcmc_neff_hist(binwidth = 0.1) +
yaxis_text() +
theme_minimal_hgrid()
Happily, most have a very favorable ratio. Here’s a numeric summary of the primary model parameters.
print(fit9.2)
Family: binomial
Links: mu = logit
Formula: Hits | trials(AtBats) ~ 1 + (1 | PriPos) + (1 | PriPos:Player)
Data: my_data (Number of observations: 948)
Draws: 3 chains, each with iter = 3500; warmup = 500; thin = 1;
total post-warmup draws = 9000
Multilevel Hyperparameters:
~PriPos (Number of levels: 9)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.33 0.10 0.19 0.59 1.00 2786 4735
~PriPos:Player (Number of levels: 948)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.14 0.01 0.12 0.15 1.00 4243 6049
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -1.16 0.12 -1.39 -0.93 1.00 783 1842
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
As far as I’m aware, brms offers three major ways to get the group-level parameters for a hierarchical model: using one of the as_draws
functions, coef()
, or fitted()
. We’ll cover each, beginning with as_draws
. In order to look at the autocorrelation plots, above, we already saved the results from as_draws_df(fit9.2)
as draws
. Let’s look at its structure with head()
.
head(draws)
# A draws_df: 6 iterations, 1 chains, and 963 variables
b_Intercept sd_PriPos__Intercept sd_PriPos:Player__Intercept Intercept
1 -1.3 0.45 0.12 -1.3
2 -1.2 0.41 0.14 -1.2
3 -1.2 0.40 0.12 -1.2
4 -1.3 0.34 0.14 -1.3
5 -1.2 0.23 0.12 -1.2
6 -1.2 0.32 0.14 -1.2
r_PriPos[1st.Base,Intercept] r_PriPos[2nd.Base,Intercept]
1 0.22 0.20
2 0.11 0.11
3 0.14 0.12
4 0.19 0.16
5 0.11 0.10
6 0.15 0.11
r_PriPos[3rd.Base,Intercept] r_PriPos[Catcher,Intercept]
1 0.27 0.107
2 0.18 0.060
3 0.19 0.087
4 0.20 0.126
5 0.11 0.045
6 0.15 0.050
# ... with 955 more variables
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
In the text, Kruschke described the model as having 968 parameters. Our draws
data frame has one vector for each, with a couple others tacked onto the end. In the hierarchical logistic regression model, the group-specific parameters for the levels of PriPos
are additive combinations of the global intercept vector, b_Intercept
and each position-specific vector, r_PriPos[i.Base,Intercept]
, where i
is a fill-in for the position of interest. And recall that since the linear model is of the logit of the criterion, we’ll need to use plogis()
to convert that to the probability space.
draws_small <- draws |>
  transmute(`1st Base` = (b_Intercept + `r_PriPos[1st.Base,Intercept]`),
Catcher = (b_Intercept + `r_PriPos[Catcher,Intercept]`),
Pitcher = (b_Intercept + `r_PriPos[Pitcher,Intercept]`)) |>
mutate_all(.funs = plogis) |>
# Here we compute our difference distributions
mutate(`Pitcher - Catcher` = Pitcher - Catcher,
`Catcher - 1st Base` = Catcher - `1st Base`)
head(draws_small)
# A tibble: 6 × 5
`1st Base` Catcher Pitcher `Pitcher - Catcher` `Catcher - 1st Base`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.255 0.234 0.127 -0.106 -0.0212
2 0.250 0.240 0.137 -0.103 -0.00990
3 0.253 0.243 0.123 -0.120 -0.00980
4 0.254 0.242 0.131 -0.112 -0.0118
5 0.255 0.242 0.137 -0.105 -0.0125
6 0.257 0.238 0.126 -0.112 -0.0193
If you take a glance at Figures 9.14 through 9.16 in the text, we’ll be making a lot of histograms of the same basic structure. To streamline our code a bit, we can make a custom histogram plotting function.
make_histogram <- function(data, mapping, title, xlim, ...) {
  ggplot(data, mapping) +
geom_histogram(fill = viridis::viridis_pal(option = "A")(9)[4],
bins = 30, color = "white", linewidth = 0.2, ...) +
stat_pointinterval(aes(y = 0),
point_interval = mode_hdi, .width = 0.95) +
scale_y_continuous(NULL, breaks = NULL) +
labs(x = expression(theta),
title = title) +
coord_cartesian(xlim = xlim) +
theme_minimal_hgrid() +
panel_border()
}
We’ll do the same thing for the correlation plots.
make_point <- function(data, mapping, limits, ...) {
  ggplot(data, mapping) +
geom_abline(linetype = 3, color = "grey50") +
geom_point(color = viridis::viridis_pal(option = "A")(9)[4],
alpha = 1/20, size = 1/10, ...) +
coord_cartesian(xlim = limits,
ylim = limits) +
theme_minimal_grid(line_size = 0) +
panel_border()
}
To learn more about wrapping custom plots into custom functions, check out Chapter 19 of Wickham’s (2016) ggplot2: Elegant graphics for data analysis.
Now that we have our make_histogram() and make_point() functions, we'll use patchwork syntax to paste together the left half of Figure 9.14.
p1 <- make_histogram(
  data = draws_small,
aes(x = Pitcher),
title = "Pitcher",
xlim = c(0.1, 0.25))
p2 <- make_histogram(
  data = draws_small,
aes(x = `Pitcher - Catcher`),
title = "Pitcher - Catcher",
xlim = c(-0.15, 0))
p3 <- make_point(
  data = draws_small,
aes(x = Pitcher, y = Catcher),
limits = c(0.12, 0.25))
p4 <- make_histogram(
  data = draws_small,
aes(x = Catcher),
title = "Catcher",
xlim = c(0.1, 0.25))
p1 + p2 + p3 + p4
We could follow the same procedure to make the right portion of Figure 9.14. But instead, let’s switch gears and explore the second way brms affords us for plotting group-level parameters. This time, we’ll use coef()
.
Up in Section 9.2.4, we learned that we can use coef()
to “extract [the] model coefficients, which are the sum of population-level effects and corresponding group-level effects” (Bürkner, 2022a, p. 58). The grouping level we’re interested in is PriPos
, so we’ll use that to index the information returned by coef()
. Since coef()
returns a matrix, we’ll use as_tibble()
to convert it to a tibble.
coef_primary_position <- coef(fit9.2, summary = FALSE)$PriPos |>
  as_tibble()
str(coef_primary_position)
tibble [9,000 × 9] (S3: tbl_df/tbl/data.frame)
$ 1st Base.Intercept : num [1:9000] -1.07 -1.1 -1.08 -1.08 -1.07 ...
$ 2nd Base.Intercept : num [1:9000] -1.1 -1.1 -1.1 -1.1 -1.08 ...
$ 3rd Base.Intercept : num [1:9000] -1.02 -1.03 -1.03 -1.06 -1.08 ...
$ Catcher.Intercept : num [1:9000] -1.19 -1.15 -1.14 -1.14 -1.14 ...
$ Center Field.Intercept: num [1:9000] -1.04 -1.04 -1.01 -1.03 -1.07 ...
$ Left Field.Intercept : num [1:9000] -1.08 -1.08 -1.02 -1.04 -1.08 ...
$ Pitcher.Intercept : num [1:9000] -1.92 -1.84 -1.96 -1.9 -1.84 ...
$ Right Field.Intercept : num [1:9000] -1.05 -1.07 -1.04 -1.01 -1.05 ...
$ Shortstop.Intercept : num [1:9000] -1.13 -1.16 -1.09 -1.12 -1.14 ...
Keep in mind that coef() returns the values on the logit scale when used for logistic regression models. So we'll have to use plogis() to convert the estimates to the probability metric. After we're done converting the estimates, we'll then make the difference distributions.
coef_small <- coef_primary_position |>
  select(`1st Base.Intercept`, Catcher.Intercept, Pitcher.Intercept) |>
transmute(`1st Base` = `1st Base.Intercept`,
Catcher = Catcher.Intercept,
Pitcher = Pitcher.Intercept) |>
mutate_all(.funs = plogis) |>
# Here we make the difference distributions
mutate(`Pitcher - Catcher` = Pitcher - Catcher,
`Catcher - 1st Base` = Catcher - `1st Base`)
head(coef_small)
# A tibble: 6 × 5
`1st Base` Catcher Pitcher `Pitcher - Catcher` `Catcher - 1st Base`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.255 0.234 0.127 -0.106 -0.0212
2 0.250 0.240 0.137 -0.103 -0.00990
3 0.253 0.243 0.123 -0.120 -0.00980
4 0.254 0.242 0.131 -0.112 -0.0118
5 0.255 0.242 0.137 -0.105 -0.0125
6 0.257 0.238 0.126 -0.112 -0.0193
Now we’re ready for the right half of Figure 9.14.
p1 <- make_histogram(
  data = coef_small,
aes(x = Catcher),
title = "Catcher",
xlim = c(0.22, 0.27))
p2 <- make_histogram(
  data = coef_small,
aes(x = `Catcher - 1st Base`),
title = "Catcher - 1st Base",
xlim = c(-0.04, 0.01))
p3 <- make_point(
  data = coef_small,
aes(x = Catcher, y = `1st Base`),
limits = c(0.22, 0.27))
p4 <- make_histogram(
  data = coef_small,
aes(x = `1st Base`),
title = "1st Base",
xlim = c(0.22, 0.27))
p1 + p2 + p3 + p4
And if you wanted the posterior modes and HDIs, you’d use mode_hdi()
after a little wrangling.
coef_small |>
  pivot_longer(cols = everything()) |>
group_by(name) |>
mode_hdi(value) |>
mutate_if(is.double, .funs = round, digits = 3)
# A tibble: 5 × 7
name value .lower .upper .width .point .interval
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1st Base 0.254 0.245 0.262 0.95 mode hdi
2 Catcher 0.241 0.233 0.25 0.95 mode hdi
3 Catcher - 1st Base -0.012 -0.024 -0.001 0.95 mode hdi
4 Pitcher 0.13 0.12 0.139 0.95 mode hdi
5 Pitcher - Catcher -0.111 -0.124 -0.097 0.95 mode hdi
While we’re at it, we should capitalize on the opportunity to show how these results are the same as those derived from our as_draws_df()
approach, above.
draws_small |>
  pivot_longer(cols = everything()) |>
group_by(name) |>
mode_hdi(value) |>
mutate_if(is.double, .funs = round, digits = 3)
# A tibble: 5 × 7
name value .lower .upper .width .point .interval
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1st Base 0.254 0.245 0.262 0.95 mode hdi
2 Catcher 0.241 0.233 0.25 0.95 mode hdi
3 Catcher - 1st Base -0.012 -0.024 -0.001 0.95 mode hdi
4 Pitcher 0.13 0.12 0.139 0.95 mode hdi
5 Pitcher - Catcher -0.111 -0.124 -0.097 0.95 mode hdi
Success!
For Figures 9.15 and 9.16, Kruschke drilled down further into the posterior. To drill along with him, we’ll take the opportunity to showcase fitted()
, the third way brms affords us for plotting group-level parameters.
# This will make life easier. Just go with it
<- c("Kyle Blanks", "Bruce Chen", "ShinSoo Choo", "Ichiro Suzuki",
name_list "Mike Leake", "Wandy Rodriguez", "Andrew McCutchen", "Brett Jackson")
# We'll define the data we'd like to feed into `fitted()`, here
nd <- my_data |>
  filter(Player %in% name_list) |>
# These last two lines aren't typically necessary, but they allow us to
# arrange the rows in the same order we find the names in Figures 9.15 and 9.16
mutate(Player = factor(Player, levels = name_list)) |>
arrange(Player)
fitted_players <- fitted(
  fit9.2,
  newdata = nd,
scale = "linear",
summary = FALSE) |>
as_tibble() |>
# Rename the values as returned by `as_tibble()`
set_names(name_list) |>
# Convert the values from the logit scale to the probability scale
mutate_all(.funs = plogis) |>
# In this last section, we make our difference distributions
mutate(`Kyle Blanks - Bruce Chen` = `Kyle Blanks` - `Bruce Chen`,
`ShinSoo Choo - Ichiro Suzuki` = `ShinSoo Choo` - `Ichiro Suzuki`,
`Mike Leake - Wandy Rodriguez` = `Mike Leake` - `Wandy Rodriguez`,
`Andrew McCutchen - Brett Jackson` = `Andrew McCutchen` - `Brett Jackson`)
glimpse(fitted_players)
Rows: 9,000
Columns: 12
$ `Kyle Blanks` <dbl> 0.2688321, 0.2190504, 0.2852203, 0.…
$ `Bruce Chen` <dbl> 0.1209843, 0.1677716, 0.1000332, 0.…
$ `ShinSoo Choo` <dbl> 0.2494368, 0.2782929, 0.2647410, 0.…
$ `Ichiro Suzuki` <dbl> 0.2749976, 0.2744152, 0.2745830, 0.…
$ `Mike Leake` <dbl> 0.1632125, 0.1580212, 0.1394908, 0.…
$ `Wandy Rodriguez` <dbl> 0.1139079, 0.1110459, 0.1242971, 0.…
$ `Andrew McCutchen` <dbl> 0.3151969, 0.3198682, 0.2906760, 0.…
$ `Brett Jackson` <dbl> 0.2511353, 0.1938503, 0.2736172, 0.…
$ `Kyle Blanks - Bruce Chen` <dbl> 0.14784780, 0.05127880, 0.18518709,…
$ `ShinSoo Choo - Ichiro Suzuki` <dbl> -2.556087e-02, 3.877648e-03, -9.841…
$ `Mike Leake - Wandy Rodriguez` <dbl> 0.04930462, 0.04697526, 0.01519371,…
$ `Andrew McCutchen - Brett Jackson` <dbl> 0.06406165, 0.12601794, 0.01705884,…
Note our use of the scale = "linear" argument in the fitted() function. By default, fitted() returns predictions on the scale of the criterion. But we don't want a list of successes and failures; we want player-level parameters. When you specify scale = "linear", you request that fitted() return the values on the scale of the linear predictor, which is the parameter scale we're after.
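If you'd like to confirm the default behavior for yourself, here's a quick sketch (not from the text). Without scale = "linear", fitted() returns values on the scale of the criterion, which for this aggregated binomial model means expected Hits counts:

# Default scale = "response": expected Hits counts rather than logits
fitted(fit9.2, newdata = nd, summary = TRUE) |>
  head()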
Here’s the left portion of Figure 9.15.
p1 <- make_histogram(
  data = fitted_players,
aes(x = `Kyle Blanks`),
title = "Kyle Blanks (1st Base)",
xlim = c(0.05, 0.35))
p2 <- make_histogram(
  data = fitted_players,
aes(x = `Kyle Blanks - Bruce Chen`),
title = "Kyle Blanks (1st Base) -\nBruce Chen (Pitcher)",
xlim = c(-0.1, 0.25))
p3 <- make_point(
  data = fitted_players,
aes(x = `Kyle Blanks`, y = `Bruce Chen`),
limits = c(0.09, 0.35))
p4 <- make_histogram(
  data = fitted_players,
aes(x = `Bruce Chen`),
title = "Bruce Chen (Pitcher)",
xlim = c(0.05, 0.35))
p1 + p2 + p3 + p4
Figure 9.15, right:
p1 <- make_histogram(
  data = fitted_players,
aes(x = `ShinSoo Choo`),
title = "ShinSoo Choo (Right Field)",
xlim = c(0.22, 0.34))
p2 <- make_histogram(
  data = fitted_players,
aes(x = `ShinSoo Choo - Ichiro Suzuki`),
title = "ShinSoo Choo (Right Field) -\nIchiro Suzuki (Right Field)",
xlim = c(-0.07, 0.07))
p3 <- make_point(
  data = fitted_players,
aes(x = `ShinSoo Choo`, y = `Ichiro Suzuki`),
limits = c(0.23, 0.32))
p4 <- make_histogram(
  data = fitted_players,
aes(x = `Ichiro Suzuki`),
title = "Ichiro Suzuki (Right Field)",
xlim = c(0.22, 0.34))
(p1 + p2 + p3 + p4) &
  theme(title = element_text(size = 11))
Figure 9.16, left:
p1 <- make_histogram(
  data = fitted_players,
aes(x = `Mike Leake`),
title = "Mike Leake (Pitcher)",
xlim = c(0.05, 0.35))
p2 <- make_histogram(
  data = fitted_players,
aes(x = `Mike Leake - Wandy Rodriguez`),
title = "Mike Leake (Pitcher) -\nWandy Rodriguez (Pitcher)",
xlim = c(-0.05, 0.25))
p3 <- make_point(
  data = fitted_players,
aes(x = `Mike Leake`, y = `Wandy Rodriguez`),
limits = c(0.07, 0.25))
p4 <- make_histogram(
  data = fitted_players,
aes(x = `Wandy Rodriguez`),
title = "Wandy Rodriguez (Pitcher)",
xlim = c(0.05, 0.35))
(p1 + p2 + p3 + p4) &
  theme(title = element_text(size = 11))
Figure 9.16, right:
p1 <- make_histogram(
  data = fitted_players,
aes(x = `Andrew McCutchen`),
title = "Andrew McCutchen (Center Field)",
xlim = c(0.15, 0.35))
p2 <- make_histogram(
  data = fitted_players,
aes(x = `Andrew McCutchen - Brett Jackson`),
title = "Andrew McCutchen (Center Field) -\nBrett Jackson (Center Field)",
xlim = c(0, 0.20))
p3 <- make_point(
  data = fitted_players,
aes(x = `Andrew McCutchen`, y = `Brett Jackson`),
limits = c(0.15, 0.35))
p4 <- make_histogram(
  data = fitted_players,
aes(x = `Brett Jackson`),
title = "Brett Jackson (Center Field)",
xlim = c(0.15, 0.35))
(p1 + p2 + p3 + p4) &
  theme(title = element_text(size = 11))
If you wanted the posterior modes and HDIs for any of the players and their contrasts, you’d use mode_hdi()
after a little wrangling.
fitted_players |>
  pivot_longer(cols = everything()) |>
group_by(name) |>
mode_hdi(value) |>
mutate_if(is.double, .funs = round, digits = 3)
# A tibble: 12 × 7
name value .lower .upper .width .point .interval
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 Andrew McCutchen 0.307 0.274 0.338 0.95 mode hdi
2 Andrew McCutchen - Brett Jackson 0.072 0.021 0.123 0.95 mode hdi
3 Brett Jackson 0.234 0.194 0.274 0.95 mode hdi
4 Bruce Chen 0.129 0.101 0.164 0.95 mode hdi
5 Ichiro Suzuki 0.274 0.246 0.305 0.95 mode hdi
6 Kyle Blanks 0.25 0.205 0.305 0.95 mode hdi
7 Kyle Blanks - Bruce Chen 0.12 0.063 0.181 0.95 mode hdi
8 Mike Leake 0.146 0.117 0.185 0.95 mode hdi
9 Mike Leake - Wandy Rodriguez 0.027 -0.016 0.068 0.95 mode hdi
10 ShinSoo Choo 0.273 0.245 0.305 0.95 mode hdi
11 ShinSoo Choo - Ichiro Suzuki -0.001 -0.042 0.041 0.95 mode hdi
12 Wandy Rodriguez 0.122 0.095 0.151 0.95 mode hdi
To make our version of Figure 9.17, we'll have to switch gears from player-specific effects to those specific to positions, averaged over individual players. The fitted() approach will probably make this the easiest. To do this, we'll need to specify values for AtBats, which will need to be a positive integer. However, since we're asking for fitted values of the linear predictors by setting scale = "linear", any value meeting the positive-integer criterion will return the same results. We'll keep things simple and set AtBats = 1.
Another consideration is that if we'd like to use fitted() to average across one of the hierarchical grouping terms (i.e., (1 | PriPos:Player)), we'll need to employ the re_formula argument. With the line re_formula = ~ (1 | PriPos), we instruct fitted() to return the PriPos-specific effects after averaging across the levels of Player. The rest is quite similar to our method from above.
nd <- my_data |>
  distinct(PriPos,
AtBats = 1)
fitted_positions <- fitted(
  fit9.2,
  newdata = nd,
re_formula = ~ (1 | PriPos),
scale = "linear",
summary = FALSE) |>
as_tibble() |>
set_names(distinct(my_data, PriPos) |> pull()) |>
mutate_all(.funs = plogis)
glimpse(fitted_positions)
Rows: 9,000
Columns: 9
$ Pitcher <dbl> 0.1274818, 0.1367352, 0.1231512, 0.1306654, 0.1373809, …
$ `Left Field` <dbl> 0.2541778, 0.2540229, 0.2643559, 0.2602814, 0.2543268, …
$ `2nd Base` <dbl> 0.2501098, 0.2496925, 0.2489085, 0.2494403, 0.2528942, …
$ `1st Base` <dbl> 0.2546954, 0.2496810, 0.2528163, 0.2542391, 0.2546610, …
$ `3rd Base` <dbl> 0.2640624, 0.2625069, 0.2627164, 0.2569670, 0.2543385, …
$ Catcher <dbl> 0.2335230, 0.2397796, 0.2430212, 0.2424189, 0.2422081, …
$ Shortstop <dbl> 0.2446326, 0.2378248, 0.2522922, 0.2458944, 0.2417578, …
$ `Center Field` <dbl> 0.2604794, 0.2603677, 0.2668882, 0.2625836, 0.2546612, …
$ `Right Field` <dbl> 0.2587849, 0.2546103, 0.2620422, 0.2672120, 0.2598108, …
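As a quick sanity check on the claim above that the particular AtBats value doesn't matter on the linear scale, here's a sketch (not from the text) comparing the AtBats = 1 draws with AtBats = 100 draws; all.equal() should return TRUE.

# Any positive AtBats value yields the same linear-scale draws
nd_100 <- my_data |>
  distinct(PriPos,
           AtBats = 100)

all.equal(
  fitted(fit9.2, newdata = nd, re_formula = ~ (1 | PriPos),
         scale = "linear", summary = FALSE),
  fitted(fit9.2, newdata = nd_100, re_formula = ~ (1 | PriPos),
         scale = "linear", summary = FALSE))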
Now we make and save the nine position-specific subplots for Figure 9.17.
p1 <- fitted_positions |>
  pivot_longer(cols = everything(), values_to = "theta") |>
# Though technically not needed, this line reorders the panels to match the text
mutate(name = factor(name, levels = c(
"1st Base", "Catcher", "Pitcher", "2nd Base", "Center Field",
"Right Field", "3rd Base", "Left Field", "Shortstop"))) |>
ggplot(aes(x = theta)) +
geom_histogram(binwidth = 0.0025, color = "white",
fill = a_purple, linewidth = 0.1) +
stat_pointinterval(aes(y = 0),
point_interval = mode_hdi, .width = 0.95, size = 1) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(expression(theta)) +
coord_cartesian(xlim = c(0.1, 0.28)) +
theme_minimal_hgrid() +
panel_border() +
facet_wrap(~ name, nrow = 3, scales = "free_y")
In this code block, we’ll make the subplot for the overall batting average. Given the size of the model, it’s perhaps easiest to pull that information from the model with the fixef()
function.
p2 <- fixef(fit9.2, summary = FALSE) |>
  as_tibble() |>
transmute(theta = plogis(Intercept),
name = "Overall") |>
ggplot(aes(x = theta)) +
geom_histogram(binwidth = 0.005, color = "white",
fill = a_purple, linewidth = 0.2) +
stat_pointinterval(aes(y = 0),
point_interval = mode_hdi, .width = 0.95) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(expression(theta)) +
coord_cartesian(xlim = c(0.1, 0.28)) +
theme_minimal_hgrid() +
panel_border() +
facet_wrap(~ name)
Now combine the plots with a little patchwork magic.
p3 <- plot_spacer()

p1 + (p2 / p3 / p3) +
  plot_layout(widths = c(3, 1))
Do note that, unlike Kruschke's Figure 9.17, our subplots are all based on a common x-axis range, which makes the panels directly comparable.
Finally, we have only looked at a tiny fraction of the relations among the parameters. We could investigate many more comparisons among parameters if we were specifically interested. In traditional statistical testing based on p values (which will be discussed in Chapter 11), we would pay a penalty for even intending to make more comparisons. This is because a p value depends on the space of counter-factual possibilities created from the testing intentions. In a Bayesian analysis, however, decisions are based on the posterior distribution, which is based only on the data (and the prior), not on the testing intention. More discussion of multiple comparisons can be found in Section 11.4. (pp. 259–260)
Session info
sessionInfo()
R version 4.4.3 (2025-02-28)
Platform: aarch64-apple-darwin20
Running under: macOS Ventura 13.4
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Chicago
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidybayes_3.0.7 bayesplot_1.11.1 brms_2.22.0 Rcpp_1.0.14
[5] viridis_0.6.5 viridisLite_0.4.2 patchwork_1.3.0 ggforce_0.4.2
[9] ggridges_0.5.6 cowplot_1.1.3 lubridate_1.9.3 forcats_1.0.0
[13] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.4 readr_2.1.5
[17] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] tensorA_0.36.2.1 rstudioapi_0.16.0 jsonlite_1.8.9
[4] magrittr_2.0.3 TH.data_1.1-2 estimability_1.5.1
[7] farver_2.1.2 nloptr_2.0.3 rmarkdown_2.29
[10] vctrs_0.6.5 minqa_1.2.6 base64enc_0.1-3
[13] htmltools_0.5.8.1 distributional_0.5.0 curl_6.0.1
[16] StanHeaders_2.32.10 htmlwidgets_1.6.4 plyr_1.8.9
[19] sandwich_3.1-1 emmeans_1.10.3 zoo_1.8-12
[22] igraph_2.0.3 mime_0.12 lifecycle_1.0.4
[25] pkgconfig_2.0.3 colourpicker_1.3.0 Matrix_1.7-2
[28] R6_2.6.1 fastmap_1.1.1 shiny_1.8.1.1
[31] digest_0.6.37 colorspace_2.1-1 crosstalk_1.2.1
[34] labeling_0.4.3 timechange_0.3.0 polyclip_1.10-6
[37] abind_1.4-8 compiler_4.4.3 bit64_4.0.5
[40] withr_3.0.2 backports_1.5.0 inline_0.3.19
[43] shinystan_2.6.0 QuickJSR_1.1.3 pkgbuild_1.4.4
[46] MASS_7.3-64 gtools_3.9.5 loo_2.8.0
[49] tools_4.4.3 httpuv_1.6.15 threejs_0.3.3
[52] glue_1.8.0 nlme_3.1-167 promises_1.3.0
[55] grid_4.4.3 checkmate_2.3.2 reshape2_1.4.4
[58] generics_0.1.3 gtable_0.3.6 tzdb_0.4.0
[61] hms_1.1.3 utf8_1.2.4 pillar_1.10.2
[64] ggdist_3.3.2 markdown_1.13 vroom_1.6.5
[67] posterior_1.6.0 later_1.3.2 splines_4.4.3
[70] tweenr_2.0.3 lattice_0.22-6 survival_3.8-3
[73] bit_4.0.5 tidyselect_1.2.1 miniUI_0.1.1.1
[76] knitr_1.49 arrayhelpers_1.1-0 gridExtra_2.3
[79] V8_4.4.2 stats4_4.4.3 xfun_0.49
[82] rstanarm_2.32.1 bridgesampling_1.1-2 matrixStats_1.4.1
[85] DT_0.33 rstan_2.32.6 stringi_1.8.4
[88] yaml_2.3.8 boot_1.3-31 evaluate_1.0.1
[91] codetools_0.2-20 cli_3.6.4 RcppParallel_5.1.7
[94] shinythemes_1.2.0 xtable_1.8-4 munsell_0.5.1
[97] coda_0.19-4.1 svUnit_1.0.6 parallel_4.4.3
[100] rstantools_2.4.0 dygraphs_1.1.1.6 Brobdingnag_1.2-9
[103] lme4_1.1-35.3 mvtnorm_1.2-5 scales_1.3.0
[106] xts_0.14.1 crayon_1.5.3 rlang_1.1.5
[109] multcomp_1.4-26 shinyjs_2.1.0