# 第 49 章 科研数据可视化

## 49.1 统计分布图

# Build the plotting data from the Lincoln weather data shipped with ggridges:
# add an abbreviated month factor and keep only the columns used below.
lincoln_df <- ggridges::lincoln_weather %>%
  mutate(
    # Recode the full month names to three-letter abbreviations.
    month_short = fct_recode(
      Month,
      Jan = "January",
      Feb = "February",
      Mar = "March",
      Apr = "April",
      May = "May",
      Jun = "June",
      Jul = "July",
      Aug = "August",
      Sep = "September",
      Oct = "October",
      Nov = "November",
      Dec = "December"
    )
  ) %>%
  # Reverse the level order of the abbreviated month factor.
  mutate(month_short = fct_rev(month_short)) %>%
  # The column name contains spaces and brackets, so it is not a syntactic
  # R name and has to be quoted with backticks.
  select(Month, month_short, `Mean Temperature [F]`)

# Preview the first five rows of the prepared data.
head(lincoln_df, 5)
## # A tibble: 5 x 3
##   Month   month_short `Mean Temperature [F]`
##   <fct>   <fct>                        <int>
## 1 January Jan                             24
## 2 January Jan                             23
## 3 January Jan                             23
## 4 January Jan                             17
## 5 January Jan                             29

### 49.1.1 points-errorbars

# Point-and-error-bar summary: per month, the point is the mean and the bar
# spans mean - 2 * sd to mean + 2 * sd of the daily mean temperature.
lincoln_errbar <- lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
  stat_summary(
    # `fun`, `fun.max` and `fun.min` replace the deprecated `fun.y`,
    # `fun.ymax` and `fun.ymin` arguments of stat_summary().
    fun = mean,
    fun.max = function(x) {
      mean(x) + 2 * sd(x)
    },
    fun.min = function(x) {
      mean(x) - 2 * sd(x)
    },
    geom = "pointrange",
    fatten = 5
  ) +
  xlab("month") +
  ylab("mean temperature (°F)") +
  theme_classic(base_size = 14) +
  theme(
    axis.text = element_text(color = "black", size = 12),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# Print the plot.
lincoln_errbar

1. 图中只用了一个点和两个误差棒，丢失了很多分布信息。

2. 读者不能很直观地读出这个点的含义（是均值还是中位数？）

3. 误差棒代表的含义不明确（标准差？标准误？还是其他？）

（标准误：衡量样本均值估计的不确定性；标准差：衡量数据偏离均值的分散程度）

4. 现实的数据往往是偏态的，但这个图的误差棒几乎是对称的，会让人产生怀疑。

### 49.1.2 箱线图

• 盒子中间的横线是中位数(50th percentile)，底部的横线代表第一四分位数(25th percentile)，顶部的横线代表第三四分位数(75th percentile)
• 盒子的范围覆盖了50%的数据，每个小盒子是25%的数据，盒子高度越短， 说明数据越集中，盒子高度越长，数据越分散。
• 上面的这条竖线的长度 = 从盒子上边缘开始，延伸到1.5倍盒子高度的范围中最远的点
• 下面的这条竖线的长度 = 从盒子下边缘开始，延伸到1.5倍盒子高度的范围中最远的点
• 在线条之外的点就是离群值（outliers）

# Box plot of the daily mean temperature for each month.
lincoln_box <- lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
  geom_boxplot(fill = "grey90") +
  xlab("month") +
  ylab("mean temperature (°F)") +
  theme_classic(base_size = 14) +
  theme(
    axis.text = element_text(color = "black", size = 12),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# Print the plot.
lincoln_box

### 49.1.3 小提琴图

• 小提琴图相当于密度分布图旋转90度，然后再做个对称的镜像
• 最宽或者最厚的地方，对应着数据密度最大的地方
• 箱线图能用的地方小提琴图都能用，而且小提琴图可以很好地展示双峰数据（bimodal data）的情况（箱线图做不到）

# Violin plot of the daily mean temperature for each month.
lincoln_violin <- lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
  geom_violin(fill = "grey90") +
  xlab("month") +
  ylab("mean temperature (°F)") +
  theme_classic(base_size = 14) +
  theme(
    axis.text = element_text(color = "black", size = 12),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# Print the plot.
lincoln_violin

### 49.1.4 sina 图

# Strip chart: every observation drawn as a point, without jitter.
lincoln_points <- lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
  geom_point(size = 0.75) +
  xlab("month") +
  ylab("mean temperature (°F)") +
  theme_classic(base_size = 14) +
  theme(
    axis.text = element_text(color = "black", size = 12),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# Print the plot.
lincoln_points

# Jittered strip chart: points are spread horizontally only (height = 0),
# with a fixed seed so the figure is reproducible.
lincoln_jitter <- lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
  geom_point(
    position = position_jitter(width = .15, height = 0, seed = 320),
    size = 0.75
  ) +
  xlab("month") +
  ylab("mean temperature (°F)") +
  theme_classic(base_size = 14) +
  theme(
    axis.text = element_text(
      color = "black",
      size = 12
    ),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# Print the plot.
lincoln_jitter

# Sina-style plot: a borderless violin with the individual observations
# drawn on top of it.
lincoln_sina <- lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
  geom_violin(color = "transparent", fill = "gray90") +
  # Jittered points stand in for the disabled alternative:
  # dviz.supp::stat_sina(size = 0.85) +
  # height = 0 keeps the jitter horizontal so the plotted temperatures are
  # not perturbed, matching the jittered strip chart above.
  geom_jitter(width = 0.25, height = 0, size = 0.85) +
  xlab("month") +
  ylab("mean temperature (°F)") +
  theme_classic(base_size = 14) +
  theme(
    axis.text = element_text(
      color = "black",
      size = 12
    ),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# Print the plot.
lincoln_sina

### 49.1.5 山峦图

# Kernel bandwidth passed to the ridgeline density estimate.
bandwidth <- 3.4

# Ridgeline plot: one density curve of the daily mean temperature per month.
lincoln_df %>%
  # Backticks are required: the column name is not a syntactic R name.
  ggplot(aes(x = `Mean Temperature [F]`, y = Month)) +
  geom_density_ridges(
    scale = 3, rel_min_height = 0.01,
    bandwidth = bandwidth,
    fill = colorspace::lighten("#56B4E9", .3),
    color = "white"
  ) +
  scale_x_continuous(
    name = "mean temperature (°F)",
    expand = c(0, 0), breaks = c(0, 25, 50, 75)
  ) +
  scale_y_discrete(name = NULL, expand = c(0, .2, 0, 2.6)) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text = element_text(color = "black", size = 12),
    axis.text.y = element_text(vjust = 0),
    plot.margin = margin(3, 7, 3, 1.5)
  )

### 49.1.6 有颜色山峦图

• 温度值越高，x轴坐标越靠右；
• 温度值越高，颜色越亮；

# Kernel bandwidth passed to the ridgeline density estimate.
bandwidth <- 3.4

# Ridgeline plot whose fill follows the x value (temperature). The x axis
# labels are suppressed here (labels = NULL) and the fill legend is hidden.
lincoln_base <- lincoln_weather %>%
  # Backticks are required: the column name is not a syntactic R name.
  # after_stat(x) maps the fill to the computed x value; it replaces the
  # deprecated `..x..` notation.
  ggplot(aes(x = `Mean Temperature [F]`, y = Month, fill = after_stat(x))) +
  # The source listing had lost this layer's opening call, which left the
  # arguments below dangling and the parentheses unbalanced.
  geom_density_ridges_gradient(
    scale = 3, rel_min_height = 0.01, bandwidth = bandwidth,
    color = "black", size = 0.25
  ) +
  scale_x_continuous(
    name = "mean temperature (°F)",
    expand = c(0, 0), breaks = c(0, 25, 50, 75), labels = NULL
  ) +
  scale_y_discrete(name = NULL, expand = c(0, .2, 0, 2.6)) +
  colorspace::scale_fill_continuous_sequential(
    palette = "Heat",
    l1 = 20, l2 = 100, c2 = 0,
    rev = FALSE
  ) +
  guides(fill = "none") +
  theme_minimal(base_size = 14) +
  theme(
    axis.text = element_text(color = "black", size = 12),
    axis.text.y = element_text(vjust = 0),
    plot.margin = margin(3, 7, 3, 1.5)
  )

# x axis labels
temps <- data.frame(temp = c(0, 25, 50, 75))

# calculate corrected color ranges
# the density stat (stat_joy in the original note) uses the +/- 3*bandwidth
# calculation internally
# Each assignment is its own statement, and the non-syntactic column name is
# backtick-quoted after `$`.
tmin <- min(lincoln_weather$`Mean Temperature [F]`) - 3 * bandwidth
tmax <- max(lincoln_weather$`Mean Temperature [F]`) + 3 * bandwidth

# Custom x axis for lincoln_base: a gradient strip spanning tmin..tmax with
# the tick labels from `temps` drawn underneath it.
xax <- axis_canvas(lincoln_base, axis = "x", ylim = c(0, 2)) +
  # The source listing had lost this layer's opening call, which left the
  # arguments below dangling and the parentheses unbalanced.
  geom_ridgeline_gradient(
    data = data.frame(temp = seq(tmin, tmax, length.out = 100)),
    aes(x = temp, y = 1.1, height = .9, fill = temp),
    color = "transparent"
  ) +
  geom_text(
    data = temps, aes(x = temp, label = temp),
    color = "black",
    y = 0.9, hjust = 0.5, vjust = 1, size = 14 / .pt
  ) +
  # Same fill scale settings as the main plot.
  colorspace::scale_fill_continuous_sequential(
    palette = "Heat",
    l1 = 20, l2 = 100, c2 = 0,
    rev = FALSE
  )

# Attach the custom axis below the ridgeline plot, then draw the result.
lincoln_final <- cowplot::insert_xaxis_grob(
  lincoln_base,
  xax,
  position = "bottom",
  height = unit(0.1, "null")
)

ggdraw(lincoln_final)