# 7  探索实践

## 7.1 分析老忠实间歇泉喷发规律

# faithful 添加二维核密度估计 density 列
library(KernSmooth)
den <- bkde2D(x = faithful, bandwidth = c(0.7, 7), gridsize = c(51L, 51L))
faithful2d <- expand.grid(eruptions = den$x1, waiting = den$x2) |>
transform(density = as.vector(den$fhat)) plot(faithful, pch = 20, panel.first = grid(), cex = 1, ann = FALSE, xlim = c(0.5, 6.5), ylim = c(35, 100) ) title(xlab = "喷发时间", ylab = "等待时间", family = "Noto Serif CJK SC") plot(faithful, pch = 20, panel.first = grid(), cex = 1, ann = FALSE, xlim = c(0.5, 6.5), ylim = c(35, 100), col = densCols(faithful, bandwidth = c(0.7, 7), nbin = c(51L, 51L), colramp = hcl.colors ) ) title(xlab = "喷发时间", ylab = "等待时间", family = "Noto Serif CJK SC") plot(faithful, pch = 20, panel.first = grid(), cex = 1, ann = FALSE, xlim = c(0.5, 6.5), ylim = c(35, 100), col = densCols(faithful, bandwidth = c(0.7, 7), nbin = c(51L, 51L), colramp = hcl.colors ) ) contour(den$x1, den$x2, den$fhat, nlevels = 10, add = TRUE, family = "sans")
title(xlab = "喷发时间", ylab = "等待时间", family = "Noto Serif CJK SC")

# 散点添加颜色
mkBreaks <- function(u) u - diff(range(u)) / (length(u) - 1) / 2
# faithful 划入网格内
xbin <- cut(faithful[, 1], mkBreaks(den$x1), labels = FALSE) ybin <- cut(faithful[, 2], mkBreaks(den$x2), labels = FALSE)
# 网格对应的核密度估计值即为 faithful 对应的核密度估计值
faithful$dens <- den$fhat[cbind(xbin, ybin)]
# 若是 faithful 数据点没有划分，则置为 0
faithful$dens[is.na(faithful$dens)] <- 0

library(ggplot2)
library(ggnewscale)
ggplot() +
geom_point(
data = faithful, aes(x = eruptions, y = waiting, color = dens),
shape = 20, size = 2, show.legend = FALSE
) +
scale_colour_viridis_c(option = "D") +
new_scale_color() +
geom_contour(data = faithful2d, aes(
x = eruptions, y = waiting,
z = density, colour = after_stat(level)
), bins = 14, linewidth = 0.45, show.legend = FALSE) +
scale_colour_viridis_c(option = "C", direction = -1, begin = 0.2, end = 0.8) +
# colorspace::scale_color_continuous_sequential(palette = "Grays") +
scale_x_continuous(breaks = 1:6) +
scale_y_continuous(breaks = 10 * 4:10) +
coord_cartesian(xlim = c(0.5, 6.5), ylim = c(35, 100)) +
labs(x = "喷发时间", y = "等待时间", colour = "密度") +
theme_bw(base_size = 13) +
theme(
legend.title = element_text(family = "Noto Serif CJK SC"),
axis.title = element_text(family = "Noto Serif CJK SC"),
axis.title.x = element_text(
margin = margin(b = 0, l = 0, t = 20, r = 0)
),
axis.title.y = element_text(
margin = margin(b = 0, l = 0, t = 0, r = 20)
),
panel.border = element_rect(color = "black"),
panel.grid = element_blank(),
panel.grid.major = element_line(
color = "lightgray",
linetype = 3, linewidth = 0.5
),
axis.ticks.length = unit(0.25, "cm"),
axis.text.x = element_text(
family = "sans", color = "black",
vjust = -1.5, size = rel(1.25)
),
axis.text.y = element_text(
family = "sans", color = "black",
angle = 90, vjust = 1.5, hjust = 0.5,
size = rel(1.25)
)
)

## 7.2 中国地区级男女性别比分布

china_household_sex <- readRDS(file = "data/china-household-sex-2020.rds")
ggplot(data = china_household_sex, aes(x = 户口登记状况, y = 男性 / 女性)) +
geom_jitter(aes(color = 区域), width = 0.3) +
theme_classic()

ggplot(data = china_household_sex, aes(x = 户口登记状况, y = 男性 / 女性)) +
geom_jitter(aes(color = 区域), width = 0.3) +
scale_y_continuous(trans = "log10") +
theme_classic()
• 「住本乡、镇、街道，户口在本乡、镇、街道」土著和已获得当地户口的。性别比分布有明显的层次差异，性别比均值从大到小依次是城市、乡村、镇。城市里，男性略多于女性，镇里，男性明显少于女性，乡村里，男性略低于女性。
• 「住本乡、镇、街道，户口待定」黑户或其它。性别比分布有明显的层次差异。同上。
• 「住本乡、镇、街道，户口在外乡、镇、街道，离开户口登记地半年以上」流出人口，流出乡、镇、街道。城市、镇、乡村的性别比数据充分混合了，性别比分布没有明显的层次差异。
• 「居住在港澳台或国外，户口在本乡、镇、街道」流出人口，流出国。同上。

## 7.3 美国历年各年龄死亡率变化

usa_mortality <- readRDS(file = "data/usa-mortality-2020.rds")
library(patchwork)
p1 <- ggplot(data = usa_mortality, aes(x = Age, y = Male, group = Year)) +
geom_vline(xintercept = "100", colour = "gray", lty = 2) +
geom_line(aes(color = Year), linewidth = 0.25) +
scale_x_discrete(
breaks = as.character(20 * 0:5),
labels = as.character(20 * 0:5)
) +
theme_classic()
p2 <- p1 +
labs(x = "年龄", y = "死亡率", color = "年份")
p3 <- p1 +
scale_y_log10(labels = scales::label_log()) +
scale_colour_gradientn(colors = RColorBrewer::brewer.pal(name = "Spectral", n = 11)) +
labs(x = "年龄", y = "死亡率（对数尺度）", color = "年份")
p2 / p3

ggplot(data = usa_mortality, aes(x = Year, y = Age, fill = Male)) +
colors = RColorBrewer::brewer.pal(name = "Spectral", n = 11),
trans = "log10", labels = scales::percent_format()
) +
geom_tile(linewidth = 0.4) +
scale_y_discrete(
breaks = as.character(10 * 0:10),
labels = as.character(10 * 0:10),
expand = c(0, 0)
) +
scale_x_continuous(
breaks = 1940 + 10 * 0:8,
labels = 1940 + 10 * 0:8,
expand = c(0, 0)
) +
theme_classic() +
labs(x = "年份", y = "年龄", fill = "死亡率")

## 7.4 美国弗吉尼亚州城乡死亡率

VADeaths 数据来自 Base R 内置的 datasets 包，记录 1940 年美国弗吉尼亚州死亡率，如下表。

dat <- transform(expand.grid(
site = c("乡村", "城镇"), sex = c("男", "女"),
age = ordered(c("50-54", "55-59", "60-64", "65-69", "70-74"))
), deaths = as.vector(t(VADeaths)) / 1000)

library(ggplot2)
# (\u2030) 表示千分号
ggplot(data = dat, aes(x = sex, y = deaths, fill = age)) +
geom_col(position = "dodge2") +
scale_y_continuous(labels = scales::label_percent(scale = 1000, suffix = "\u2030")) +
scale_fill_brewer(palette = "Spectral") +
facet_wrap(~site, ncol = 1) +
theme_bw(base_size = 13) +
labs(x = "性别", y = "死亡率", fill = "年龄")

ggplot(data = dat, aes(x = age, y = deaths, fill = site)) +
geom_col(position = "dodge2") +
scale_y_continuous(labels = scales::label_percent(scale = 1000, suffix = "\u2030")) +
scale_fill_brewer(palette = "Spectral") +
facet_wrap(~sex, ncol = 1) +
theme_bw(base_size = 13) +
labs(x = "年龄", y = "死亡率", fill = "城乡")

## 7.5 如何用图表示累积概率分布

$F(x) = \int_{-\infty}^{x} f(t) \mathrm{dt}$

diamonds 数据来自 ggplot2 包，记录了约 54000 颗钻石的价格、重量、切工、颜色、纯净度、尺寸等属性信息。图 7.7 展示这批不同切工的钻石随价格的分布，在这个示例中，如何表达累积分布？概率分布的密度曲线是根据直方图估计得来的，根据不同价格区间内钻石的数量占总钻石的比例估计概率。固定窗宽，即在同一价格区间内累积不同切工的钻石数量，得到累积概率，最后获得累积概率密度曲线，更多理论细节见数据可视化陷阱

library(ggplot2)
library(patchwork)
p1 <- ggplot(diamonds, aes(x = price, y = after_stat(density), fill = cut)) +
geom_density(position = "stack", colour = "white") +
scale_fill_brewer(palette = "Spectral") +
scale_y_continuous(
labels = expression(0, 5~"·"~10^-4, 10 ~ "·" ~ 10^-4, 15 ~ "·" ~ 10^-4),
breaks = c(0, 5, 10, 15) * 10^(-4)
) +
theme_bw(base_family = "Noto Serif CJK SC") +
labs(x = "价格", y = "概率密度", fill = "切工", tag = "坏") +
theme(
axis.text.x = element_text(family = "sans", color = "black"),
axis.text.y = element_text(
family = "sans", color = "black",
angle = 90, vjust = 1.5, hjust = 0.5
),
legend.text = element_text(family = "sans"),
plot.tag = element_text(family = "Noto Serif CJK SC", color = "red"),
plot.tag.position = "topright"
)

p2 <- ggplot(diamonds, aes(x = price, y = after_stat(density * n), fill = cut)) +
geom_density(position = "stack", colour = "white") +
scale_fill_brewer(palette = "Spectral") +
theme_bw(base_family = "Noto Serif CJK SC") +
labs(x = "价格", y = "概率质量", fill = "切工", tag = "好") +
theme(
axis.text.x = element_text(family = "sans", color = "black"),
axis.text.y = element_text(
family = "sans", color = "black",
angle = 90, vjust = 1.5, hjust = 0.5
),
legend.text = element_text(family = "sans"),
plot.tag = element_text(family = "Noto Serif CJK SC", color = "black"),
plot.tag.position = "topright"
)

p1 / p2

## 7.6 解释二项总体参数的置信带

0 和 1 是世界的原点，蕴含大道真意，从 0-1 分布也叫伯努利分布，独立同 0-1分布之和可以衍生出二项分布。在一定条件下，可以用泊松分布近似二项分布。根据中心极限定理，独立同二项分布的极限和与正态分布可以发生关系。在二项分布、正态分布的基础上，可以衍生出超几何分布、贝塔分布等等。本书多个地方以二项分布为例介绍基本统计概念和模型。

$P(X = x) = \binom{n}{x}p^x(1-p)^{n-x} \tag{7.1}$

$\sum_{r = x}^{n}\binom{n}{x}p^x(1-p)^{n-x} = \frac{\alpha}{2} \tag{7.2}$

qbinom(0.025, size = 10, prob = 0.1, lower.tail = F)
#> [1] 3

$P(X > 3) = \sum_{x > 3}^{10}\binom{10}{x}0.1^x*(1-0.1)^{10-x} \tag{7.3}$

pbinom(q = 3, size = 10, prob = 0.1, lower.tail = F)
#> [1] 0.0127952

1. $$1-\alpha$$ 的把握确定区间包含真值。
2. 区间包含真值的概率是 $$1-\alpha$$

1934 年 C. J. Clopper 和 E. S. Pearson 在给定置信水平 $$1- \alpha = 0.95$$ 和样本量 $$n = 10$$ 的情况下，给出二项分布 $$B(n, p)$$ 参数 $$p$$ 的区间估计（即所谓的 Clopper-Pearson 精确区间估计）和置信带 ，如 图 7.8 所示，横坐标为观测到的成功次数，纵坐标为参数 $$p$$ 的置信限。具体来说，固定样本量为 10，假定观测到的成功次数为 2，在置信水平为 0.95 的情况下，Base R 内置的二项精确检验函数 binom.test()，可以获得参数 $$p$$ 的精确区间估计为 $$(p_1, p_2) = (0.025, 0.556)$$，即：

# 精确二项检验 p = 0.2
binom.test(x = 2, n = 10, p = 0.2)
#>
#>  Exact binomial test
#>
#> data:  2 and 10
#> number of successes = 2, number of trials = 10, p-value = 1
#> alternative hypothesis: true probability of success is not equal to 0.2
#> 95 percent confidence interval:
#>  0.02521073 0.55609546
#> sample estimates:
#> probability of success
#>                    0.2

# 精确二项检验 p = 0.4
binom.test(x = 2, n = 10, p = 0.4)
#>
#>  Exact binomial test
#>
#> data:  2 and 10
#> number of successes = 2, number of trials = 10, p-value =
#> 0.3335
#> alternative hypothesis: true probability of success is not equal to 0.4
#> 95 percent confidence interval:
#>  0.02521073 0.55609546
#> sample estimates:
#> probability of success
#>                    0.2

library(rootSolve) # uniroot.all
options(digits = 4)
# r 为上分位点
p_fun <- function(p, r = 9) qbinom(0.025, size = 10, prob = p, lower.tail = F) - r # 上分位点
l_fun <- function(p, r = 9) qbinom(0.025, size = 10, prob = p, lower.tail = T) - r # 下分位点

# 计算每个分位点对应的最小的概率 p
p <- sapply(0:10, function(x) min(uniroot.all(p_fun, lower = 0, upper = 1, r = x)))

# 计算每个分位点对应的最大的概率 l
l <- sapply(0:10, function(x) max(uniroot.all(l_fun, lower = 0, upper = 1, r = x)))

plot(
x = seq(from = 0, to = 10, length.out = 11),
y = seq(from = 0, to = 1, length.out = 11),
type = "n", ann = FALSE, family = "sans", panel.first = grid()
)
title(xlab = "成功次数", ylab = "置信限", family = "Noto Serif CJK SC")
lines(x = 0:10, y = p, type = "s") # 朝下的阶梯线
lines(x = 0:10, y = p, type = "l") # 折线
# points(x = 0:10, y = p, pch = 16, cex = .8) # 散点

# abline(a = 0, b = 0.1, col = "gray", lwd = 2, lty = 2) # 添加对称线
text(x = 5, y = 0.5, label = "置信带", cex = 1.5, srt = 45, family = "Noto Serif CJK SC")
# points(x = 5, y = 0.5, col = "black", pch = 16) # 中心对称点
# points(x = 5, y = 0.5, col = "black", pch = 3) # 中心对称点

lines(x = 0:10, y = l, type = "S") # 朝上的阶梯线
lines(x = 0:10, y = l, type = "l") # 折线
# points(x = 0:10, y = l, pch = 16, cex = .8) # 散点

points(x = c(2, 2), y = c(0.03, 0.55), pch = 8, col = "black")
text(x = 2, y = 0.55, labels = expression(p[2]), pos = 1)
text(x = 2, y = 0.03, labels = expression(p[1]), pos = 3)

## 7.7 解释置信区间及其覆盖概率

$(\hat{p} - Z_{1-\alpha/2} \sqrt{\hat{p}*(1-\hat{p})/n}, \hat{p} + Z_{1-\alpha/2} \sqrt{\hat{p}*(1-\hat{p})/n}) \tag{7.4}$

# 计算覆盖概率
# Wald 覆盖
coverage_wald <- function(p = 0.1, n = 10, nsim = 1000) {
phats <- rbinom(nsim, prob = p, size = n) / n
ll <- phats - qnorm(1 - 0.05 / 2) * sqrt(phats * (1 - phats) / n)
ul <- phats + qnorm(1 - 0.05 / 2) * sqrt(phats * (1 - phats) / n)
mean(ll < p & ul > p)
}
# Agresti-Coull 覆盖
coverage_agresti <- function(p = 0.1, n = 10, nsim = 1000) {
phats <- (rbinom(nsim, prob = p, size = n) + 2) / (n + 4)
ll <- phats - qnorm(1 - 0.05 / 2) * sqrt(phats * (1 - phats) / n)
ul <- phats + qnorm(1 - 0.05 / 2) * sqrt(phats * (1 - phats) / n)
mean(ll < p & ul > p)
}
# Clopper and Pearson (1934)
# 与 binom.test() 计算结果一致
coverage_clopper <- function(p = 0.1, n = 10, nsim = 1000) {
nd <- rbinom(nsim, prob = p, size = n)
ll <- qbeta(p = 0.05 / 2, shape1 = nd, shape2 = n - nd + 1)
ul <- qbeta(p = 1 - 0.05 / 2, shape1 = nd + 1, shape2 = n - nd)
mean(ll < p & ul > p)
}
# Wilson (1927)
# 与 prop.test(correct = FALSE) 计算结果一致
coverage_wilson <- function(p = 0.1, n = 10, nsim = 1000) {
phats <- rbinom(nsim, prob = p, size = n) / n
lambda <- qnorm(1 - 0.05 / 2)
ll <- phats + lambda^2 / (2 * n) - lambda * sqrt(phats * (1 - phats) / n + lambda^2 / (4 * n^2))
ul <- phats + lambda^2 / (2 * n) + lambda * sqrt(phats * (1 - phats) / n + lambda^2 / (4 * n^2))
mean(ll / (1 + lambda^2 / n) < p & ul / (1 + lambda^2 / n) > p)
}

sim_dat <- transform(expand.grid(
p = seq(0.01, 0.99, by = 0.01),
n = c(10, 100),
nsim = 1000,
methods = c("Wald", "Agresti-Coull", "Wilson", "Clopper-Pearson")
), prob = ifelse(methods == "Wald",
Vectorize(coverage_wald)(p = p, n = n, nsim = nsim),
ifelse(methods == "Agresti-Coull",
Vectorize(coverage_agresti)(p = p, n = n, nsim = nsim),
ifelse(methods == "Wilson",
Vectorize(coverage_wilson)(p = p, n = n, nsim = nsim),
Vectorize(coverage_clopper)(p = p, n = n, nsim = nsim)
)
)
), nsample = ifelse(n == 10, "n=10", "n=100"))

ggplot(data = sim_dat, aes(x = p, y = prob, color = nsample)) +
geom_hline(yintercept = 0.95, linetype = 2,
linewidth = 1, color = "gray60") +
geom_point() +
geom_path() +
# annotate(geom = "text", x = 0, y = 0.95, label = "0.950",
#          fontface = "bold", hjust = 2, size = 3.5) +
# scale_color_grey() +
scale_color_brewer(palette = "Set1") +
facet_wrap(facets = ~methods, ncol = 1, scales = "free_y") +
labs(x = "成功概率", y = "覆盖概率", color = "样本量") +
theme_bw(base_size = 13, base_family = "sans") +
theme(title = element_text(family = "Noto Serif CJK SC")) +
coord_cartesian(clip = 'off')

## 7.8 习题

1. 1888 年，瑞士开始进入一个人口转变的阶段，从发展中国家的高出生率开始下滑。分析生育率和经济指标的关系。数据集 swiss 记录了 1888 年瑞士 47 个说法语的省份的生育率和社会经济指标数据。Fertility（生育率，采用常见的标准生育率统计口径）、Agriculture（男性从事农业生产的比例）、Examination（应征者在军队考试中获得最高等级的比例）、Education（应征者有小学以上教育水平的比例）、Catholic（信仰天主教的比例）、Infant.Mortality（婴儿死亡率，仅考虑出生一年内死亡），各个指标都统一标准化为百分比的形式。其中，Examination 和 Education 是 1887 年、1888 年和 1889 年的平均值。瑞士 182 个地区 1888 年及其它年份的数据可从网站获得。

1. 数据来自德国马克斯普朗克人口研究所、美国加州大学伯克利分校、法国人口研究所共同建立的人类死亡率数据库 (https://www.mortality.org/)。↩︎