# 2  基础图形

• 章节 2.1 探索、展示数据中隐含的趋势信息，具体有折线图、瀑布图、曲线图、曲面图、热力图、日历图、棋盘图和时间线图。
• 章节 2.2 以图形展示数据对比，达到更加突出、显著的效果，让差异给人留下印象，具体有柱形图、条形图、点线图（也叫克利夫兰点图）、雷达图和词云图。
• 章节 2.3 探索、展示数据中隐含的比例信息，以突出重点，具体有简单饼图、环形饼图、扇形饼图、帕累托图、马赛克图和矩阵树图。

## 2.1 描述趋势

GNU R 是一个自由的统计计算和统计绘图环境，最初由新西兰奥克兰大学统计系的 Ross Ihaka 和 Robert Gentleman 共同开发。1997 年之后，成立了一个 R Core Team（R 语言核心团队），他们在版本控制系统 Apache Subversion上一起协作开发至今。25 年—四分之一个世纪过去了，下面分析他们留下的一份开发日志，了解一段不轻易为人所知的故事。

svn log --xml --verbose -r 6:83528 \
https://svn.r-project.org/R/trunk > data-raw/svn_trunk_log_2022.xml

svn_trunk_log <- readRDS(file = "data/svn-trunk-log-2022.rds")
#>   revision author               stamp
#> 1        6  ihaka 1997-09-18 04:41:25
#> 2        7  ihaka 1997-09-18 04:42:42
#>                                  msg
#> 1 New predict.lm from Peter Dalgaard
#> 2             Updated release number

### 2.1.1 折线图

svn_trunk_log <- within(svn_trunk_log, {
# 提取日期、月份、年份、星期、第几周、第几天等时间成分
year <- as.integer(format(stamp, "%Y"))
date <- format(stamp, format = "%Y-%m-%d", tz = "UTC")
month <- format(stamp, format = "%m", tz = "UTC")
hour <- format(stamp, format = "%H", tz = "UTC")
week <- format(stamp, format = "%U", tz = "UTC")
wday <- format(stamp, format = "%a", tz = "UTC")
nday <- format(stamp, format = "%j", tz = "UTC")
})
# 代码维护者 ID 和姓名对应
ctb_map <- c(
"bates" = "Douglas Bates", "deepayan" = "Deepayan Sarkar",
"duncan" = "Duncan Temple Lang", "falcon" = "Seth Falcon",
"guido" = "Guido Masarotto", "hornik" = "Kurt Hornik",
"iacus" = "Stefano M. Iacus", "ihaka" = "Ross Ihaka",
"jmc" = "John Chambers", "kalibera" = "Tomas Kalibera",
"lawrence" = "Michael Lawrence", "leisch" = "Friedrich Leisch",
"ligges" = "Uwe Ligges", "luke" = "Luke Tierney",
"lyndon" = "Others", "maechler" = "Martin Maechler",
"mike" = "Others", "morgan" = "Martin Morgan",
"murdoch" = "Duncan Murdoch", "murrell" = "Paul Murrell",
"pd" = "Peter Dalgaard", "plummer" = "Martyn Plummer",
"rgentlem" = "Robert Gentleman", "ripley" = "Brian Ripley",
"smeyer" = "Sebastian Meyer", "system" = "Others",
"tlumley" = "Thomas Lumley", "urbaneks" = "Simon Urbanek"
)
svn_trunk_log$author <- ctb_map[svn_trunk_log$author]

trunk_year <- aggregate(data = svn_trunk_log, revision ~ year, FUN = length)

library(ggplot2)
ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_line() +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

aggregate(data = svn_trunk_log, revision ~ year + hour, length) |>
ggplot(aes(x = hour, y = revision, group = year)) +
geom_line() +
geom_line(data = function(x) subset(x, year < 2006),
aes(color = as.character(year))) +
theme_classic() +
labs(x = "时段", y = "提交量", color = "年份")

aggregate(data = svn_trunk_log, revision ~ year + month, length) |>
transform(date = as.Date(paste(year, month, "01", sep = "-"))) |>
ggplot(aes(x = date, y = revision)) +
geom_point(aes(color = factor(year)), show.legend = F, size = 0.75) +
geom_line(aes(color = factor(year)), show.legend = F) +
scale_x_date(date_minor_breaks = "1 year") +
theme_classic() +
theme(panel.grid.minor.x = element_line()) +
labs(x = "时间（月粒度）", y = "提交量")

### 2.1.2 瀑布图

trunk_year <- trunk_year[order(trunk_year$year), ] trunk_year_tmp <- data.frame( xmin = trunk_year$year[-length(trunk_year$year)], ymin = trunk_year$revision[-length(trunk_year$revision)], xmax = trunk_year$year[-1],
ymax = trunk_year$revision[-1], fill = trunk_year$revision[-1] - trunk_year$revision[-length(trunk_year$revision)] > 0
)

ggplot() +
geom_rect(
data = trunk_year_tmp,
aes(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax, fill = fill
),
show.legend = FALSE
) +
geom_point(
data = trunk_year, aes(x = year, y = revision), size = 0.75
) +
scale_fill_brewer(palette = "Set2") +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

ggTimeSeries (https://github.com/thecomeonman/ggTimeSeries) 提供统计图层 stat_waterfall() 实现类似的瀑布图，如 图 2.5 所示。

library(ggTimeSeries)
ggplot(data = trunk_year, aes(x = year, y = revision)) +
stat_waterfall() +
scale_fill_brewer(palette = "Set2") +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

### 2.1.3 曲线图

ggplot2 包提供函数 geom_smooth() 拟合散点图中隐含的趋势，通过查看函数 geom_smooth() 的帮助文档，可以了解其内部调用的统计方法。默认情况下，采用局部多项式回归拟合方法，内部调用了函数 loess() 来拟合趋势，如 图 2.6 所示。

ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_smooth(data = subset(trunk_year, year != 1997)) +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")
#> geom_smooth() using method = 'loess' and formula = 'y ~ x'

trunk_year_loess <- loess(revision ~ year,
data = subset(trunk_year, year != 1997),
span = 0.75, degree = 2, method = "loess",
family = "symmetric",
control = loess.control(surface = "direct", iterations = 4)
)

ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_smooth(method = "loess", formula = "y~x",
method.args = list(
span = 0.75, degree = 2, family = "symmetric",
control = loess.control(surface = "direct", iterations = 4)
), data = subset(trunk_year, year != 1997)) +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

method = "loess" 意味着调用了一种非参数的回归方法，即局部估计散点平滑 （locally estimated scatterplot smoothing），另一个与之类似的回归方法是局部加权散点平滑 （locally weighted scatterplot smoothing），简称 lowess 。1991 年 Jerome Friedman 提出多元适应性回归样条（Multivariate Adaptive Regression Splines），R 语言社区对应功能的扩展包是 earth

trunk_year_nls <- nls(revision ~ a * (year - 1996)^2 + b,
data = subset(trunk_year, year != 1997),
start = list(a = -0.1, b = 1000)
)

ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_smooth(
method = "nls",
formula = "y ~ a * (x - 1996)^2 + b",
method.args = list(
start = list(a = -0.1, b = 1000)
), se = FALSE,
data = subset(trunk_year, year != 1997),
) +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

summary(trunk_year_loess)
#> Call:
#> loess(formula = revision ~ year, data = subset(trunk_year, year !=
#>     1997), span = 0.75, degree = 2, family = "symmetric", method = "loess",
#>     control = loess.control(surface = "direct", iterations = 4))
#>
#> Number of Observations: 25
#> Equivalent Number of Parameters: 4.53
#> Residual Scale Estimate: 308.4
#> Trace of smoother matrix: 4.97  (exact)
#>
#> Control settings:
#>   span     :  0.75
#>   degree   :  2
#>   family   :  symmetric      iterations = 4
#>   surface  :  direct
#>   normalize:  TRUE
#>  parametric:  FALSE
#> drop.square:  FALSE
summary(trunk_year_nls)
#>
#> Formula: revision ~ a * (year - 1996)^2 + b
#>
#> Parameters:
#>    Estimate Std. Error t value Pr(>|t|)
#> a   -2.9625     0.4555  -6.504 1.23e-06 ***
#> b 3070.0890   147.1920  20.858  < 2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 471.8 on 23 degrees of freedom
#>
#> Number of iterations to convergence: 1
#> Achieved convergence tolerance: 2.808e-08

# 非线性回归的残差平方和的标准差
sqrt(sum(residuals(trunk_year_nls)^2)/24)
#> [1] 461.8963

ggplot2 包支持的平滑方法有很多，如借助函数 splinefun() 构造样条插值获得平滑曲线，调用 mgcv 包的函数 gam() ，调用 ggalt 包的函数 geom_xspline()

xxspline <- function(formula, data, ...) {
dat <- model.frame(formula, data)
res <- splinefun(dat[[2]], dat[[1]])
class(res) <- "xxspline"
res
}

predict.xxspline <- function(object, newdata, ...) {
object(newdata[[1]])
}

ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_smooth(
formula = "y~x",
method = xxspline, se = FALSE,
data = subset(trunk_year, year != 1997)
) +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_smooth(
formula = y ~ s(x, k = 12),
method = "gam", se = FALSE,
data = subset(trunk_year, year != 1997)
) +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

ggplot(data = trunk_year, aes(x = year, y = revision)) +
geom_point() +
geom_smooth(
method = "lm",
formula = "y ~ poly((x - 1996), 3)",
se = FALSE,
data = subset(trunk_year, year != 1997),
) +
theme_classic() +
theme(panel.grid.major.y = element_line(colour = "gray90")) +
labs(x = "年份", y = "提交量")

### 2.1.4 流线图

library(ggstream)
trunk_year_author <- aggregate(data = svn_trunk_log, revision ~ year + author, FUN = length)
ggplot(trunk_year_author, aes(x = year, y = revision, fill = author)) +
geom_stream() +
theme_classic() +
theme(legend.position = "bottom") +
labs(x = "年份", y = "提交量", fill = "贡献者")

### 2.1.5 曲面图

ggplot2 包暂不支持绘制三维曲面图，而 lattice 包支持，但也是非常有限的支持。lattice 包ggplot2 包都是基于图形语法的，层层叠加就必然会出现覆盖，只有在绘制函数型数据的图像时是合适的，因为覆盖少，即使覆盖也不妨碍趋势的表达。根据不同的使用场景有两个更好的选择，基于 OpenGL 的真三维图形可以用 rayrenderrayshader 包绘制，而基于 JavaScripts 的交互式三维图形可以用 rglplotly 包绘制。

trunk_year_week <- aggregate(data = svn_trunk_log, revision ~ year + week, FUN = length)
library(lattice)
wireframe(
data = trunk_year_week, revision ~ year * as.integer(week),
shade = TRUE, drape = FALSE,
xlab = "年份",
ylab = "第几周",
zlab = list("提交量", rot = 90),
scales = list(
arrows = FALSE, col = "black"
),
# 减少三维图形的边空
lattice.options = list(
layout.widths = list(
left.padding = list(x = -.6, units = "inches"),
right.padding = list(x = -1.0, units = "inches")
),
layout.heights = list(
bottom.padding = list(x = -.8, units = "inches"),
top.padding = list(x = -1.0, units = "inches")
)
),
par.settings = list(axis.line = list(col = "transparent")),
screen = list(z = -60, x = -70, y = 0)
)

# 按年、月分组统计代码提交量
trunk_year_month <- aggregate(
data = svn_trunk_log,
revision ~ year + month, FUN = length
)
# 数据转化为矩阵类型
trunk_year_month_m <- matrix(
#>  $date : chr "1997-04-23" "1997-07-22" "1997-09-10" "1997-12-04" ... 做一点数据处理，将 date 字段转为日期类型，并从日期中提取年、月信息。 rversion$date <- as.Date(rversion$date, format = "%Y-%m-%d", tz = "UTC") rversion$year <- format(rversion$date, "%Y") rversion$month <- format(rversion$date, "%m") 统计过去 25 年里每月的发版次数，如图 图 2.16 aggregate(data = rversion, version ~ year + month, length) |> ggplot(aes(x = month, y = year)) + geom_label(aes(label = version, fill = version), show.legend = F, color = "white") + scale_fill_viridis_c(option = "D", begin = 0.2, end = 0.8) + theme_classic() + theme(panel.grid.major.y = element_line(colour = "gray95")) + labs(x = "月份", y = "年份") ### 2.1.9 时间线图 时间线图非常适合回顾过去，展望未来，讲故事 时间线图展示信息的层次和密度一般由时间跨度决定。时间跨度大时，展示重点节点信息，时间跨度小时，重点和次重点信息都可以放。从更加宏观的视角，厘清发展脉络，比如近两年的 R 软件发版情况。 本节用到一个数据集 rversion，记录了历次 R 软件发版时间及版本号，见 表格 2.2 rversion_tl <- within(rversion, { # 版本号为 x.0.0 为重大版本 big # 版本号为 x.1.0 x.12.0 x.20.0 为主要版本 major # 版本号为 x.0.1 为次要版本 minor status <- ifelse(grepl(pattern = "*\\.0\\.0", x = version), "big", version) status <- ifelse(grepl(pattern = "*\\.[1-9]{1,2}\\.0$", x = status), "major", status)
status <- ifelse(!status %in% c("big", "major"), "minor", status)
})
positions <- c(0.5, -0.5, 1.0, -1.0, 1.5, -1.5)
directions <- c(1, -1)
# 位置
rversion_pos <- data.frame(
# 只要不是同一天发布的版本，方向相对
date = unique(rversion_tl$date), position = rep_len(positions, length.out = length(unique(rversion_tl$date))),
direction = rep_len(directions, length.out = length(unique(rversion_tl$date))) ) # 原始数据上添加方向和位置信息 rversion_df <- merge(x = rversion_tl, y = rversion_pos, by = "date", all = TRUE) # 最重要的状态放在最后绘制到图上 rversion_df <- rversion_df[with(rversion_df, order(date, status)), ] 选取一小段时间内的发版情况，比如最近的三年 — 2020 - 2022 年 # 选取 2020 - 2022 年的数据 sub_rversion_df<- rversion_df[rversion_df$year %in% 2020:2022, ]
# 月份注释
month_dat <- data.frame(
date = seq(from = as.Date('2020-01-01'), to = as.Date('2022-12-31'), by = "3 month")
)
month_dat <- within(month_dat, {
month = format(date, "%b")
})
# 年份注释
year_dat <- data.frame(
date = seq(from = as.Date('2020-01-01'), to = as.Date('2022-12-31'), by = "1 year")
)
year_dat <- within(year_dat, {
year = format(date, "%Y")
})

ggplot(data = sub_rversion_df) +
geom_segment(aes(x = date, y = 0, xend = date, yend = position)) +
geom_hline(yintercept = 0, color = "black", linewidth = 1) +
geom_label(
aes(x = date, y = position, label = version, color = status),
show.legend = FALSE
) +
geom_point(aes(x = date, y = 0, color = status),
size = 3, show.legend = FALSE
) +
geom_text(
data = month_dat, aes(x = date, y = 0, label = month), vjust = 1.5
) +
geom_text(
data = year_dat, aes(x = date, y = 0, label = year), vjust = -0.5
) +
theme_void()

sub_rversion_df2 <- rversion_df[rversion_df$status %in% c("big", "major"), ] ggplot(data = sub_rversion_df2) + geom_segment(aes(x = 0, y = date, xend = position, yend = date, color = status), show.legend = F ) + geom_vline(xintercept = 0, color = "black", linewidth = 1) + geom_label( aes(x = position, y = date, label = version, color = status), show.legend = FALSE ) + geom_point(aes(x = 0, y = date, color = status), size = 3, show.legend = FALSE) + geom_text( aes(x = 0, y = as.Date(format(date, "%Y-01-01")), label = year), hjust = -0.1 ) + theme_void() 在 R 语言诞生的前 5 年里，每年发布 3 个主要版本，这 5 年是 R 软件活跃开发的时期。而 2003-2012 年的这 10 年，基本上每年发布 2 个主要版本。2013-2022 年的这 10 年，基本上每年发布 1 个主要版本。 timevis 包基于 JavaScript 库 Visvis-timeline 模块，可以 创建交互式的时间线图，支持与 Shiny 应用集成。 ## 2.2 描述对比 数据来自中国国家统计局发布的2021年统计年鉴， 对比的是什么？城市、镇和乡村的性别分布，是否失衡？在哪个年龄段表现很失衡？ ### 2.2.1 柱形图 分年龄段比较城市、镇和乡村的性别比数据 ggplot(data = china_age_sex, aes(x = 年龄, y = 性别比（女=100）, fill = 区域)) + geom_hline(yintercept = 100, color = "gray", lty = 2, linewidth = 1) + geom_col(position = "dodge2", width = 0.75) + theme_bw() 考虑到数据本身的含义，一般来说，性别比不可能从 0 开始，除非现实中出现了《西游记》里的女儿国。因此，将纵轴的范围，稍加限制，从 性别比为 70 开始，目的是突出城市、镇和乡村的差异。 ggplot(data = china_age_sex, aes(x = 年龄, y = 性别比（女=100）, fill = 区域)) + geom_hline(yintercept = 100, color = "gray", lty = 2, linewidth = 1) + geom_col(position = "dodge2", width = 0.75) + coord_cartesian(ylim = c(70, 130)) + theme_bw() ### 2.2.2 条形图 将柱形图横过来即可得到条形图，横过来的好处主要体现在分类很多的时候，留足空间给年龄分组的分类标签，从左到右，从上往下也十分符合大众的阅读习惯 ggplot(data = china_age_sex, aes(x = 性别比（女=100）, y = 年龄, fill = 区域)) + geom_vline(xintercept = 100, color = "gray", lty = 2, linewidth = 1) + geom_col(position = "dodge2", width = 0.75) + coord_cartesian(xlim = c(70, 130)) + theme_bw() ### 2.2.3 点线图 克利夫兰点图 dotchart() 在条形图的基础上，省略了条形图的宽度，可以容纳更多的数据点。 ggplot(data = china_age_sex, aes(x = 性别比（女=100）, y = 年龄, color = 区域)) + geom_vline(xintercept = 100, color = "lightgray", lty = 2, linewidth = 1) + geom_point() + theme_bw() ### 2.2.4 词云图 ggwordcloud 包提供词云图层 geom_text_wordcloud() 根据代码提交的说明制作词云图。 library(ggwordcloud) aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> ggplot(aes(label = author, size = revision)) + geom_text_wordcloud(seed = 2022, grid_size = 10, max_grid_size = 24) + scale_size_area(max_size = 20) 词云图也可以是条形图或柱形图的一种替代，词云图不用担心数目多少，而条形图不适合太多的分类情形。 aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> subset(subset = revision >= 100) |> ggplot(aes(x = revision, y = reorder(author, revision))) + geom_col() + theme_classic() + coord_cartesian(expand = FALSE) + labs(x = "提交量", y = "维护者") ## 2.3 描述占比 ### 2.3.1 简单饼图 提交量小于 2000 次的贡献者合并为一类 Others，按贡献者分组统计提交量及其占比，如 图 2.25 所示。 aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> transform(author2 = ifelse(revision < 2000, "Others", author)) |> aggregate(revision ~ author2, FUN = sum) |> transform(label = paste0(round(revision / sum(revision), digits = 4) * 100, "%")) |> ggplot(aes(x = 1, fill = reorder(author2, revision), y = revision)) + geom_col(position = "fill", show.legend = FALSE, color = "white") + scale_y_continuous(labels = scales::label_percent()) + coord_polar(theta = "y") + geom_text(aes(x = 1.2, label = author2), position = position_fill(vjust = 0.5), color = "black" ) + geom_text(aes(x = 1.65, label = label), position = position_fill(vjust = 0.5), color = "black" ) + theme_void() + labs(x = NULL, y = NULL) 当把提交量小于 1000 次的贡献者合并为 Others，则分类较多，占比小的也有一席之地，饼图上显得十分拥挤。 aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> transform(author2 = ifelse(revision < 1000, "Others", author)) |> aggregate(revision ~ author2, FUN = sum) |> transform(label = paste0(round(revision / sum(revision), digits = 4) * 100, "%")) |> ggplot(aes(x = 1, fill = reorder(author2, revision) , y = revision)) + geom_col(position = "fill", show.legend = FALSE, color = "white") + scale_y_continuous(labels = scales::label_percent()) + coord_polar(theta = "y") + geom_text(aes(x = 1.2, label = author2), position = position_fill(vjust = 0.5), color = "black" ) + geom_text(aes(x = 1.6, label = label), position = position_fill(vjust = 0.5), color = "black" ) + theme_void() + labs(x = NULL, y = NULL) 一种缓解拥挤的办法是通过 ggrepel 包在扇形区域旁边添加注释 library(ggrepel) dat1 <- aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> transform(author2 = ifelse(revision < 1000, "Others", author)) |> aggregate(revision ~ author2, FUN = sum) dat2 <- within(dat1, { value <- 100 * revision / sum(revision) csum <- rev(cumsum(rev(value))) pos <- value / 1.5 + c(csum[-1], NA) pos <- ifelse(is.na(pos), value / 2, pos) label <- paste(author2, paste0(round(value, 2), "%"), sep = "\n") }) ggplot(data = dat2, aes(x = 1, fill = author2, y = value)) + geom_col(show.legend = FALSE, color = "white") + coord_polar(theta = "y") + geom_label_repel(aes(y = pos, label = label), size = 4.5, nudge_x = 0.75, show.legend = FALSE ) + theme_void() + labs(x = NULL, y = NULL) 但是数量很多的情况下，也是无能为力的，当然，是否需要显示那么多，是否可以合并占比小的部分，也是值得考虑的问题。 ### 2.3.2 环形饼图 中间空了一块 aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> transform(author2 = ifelse(revision < 2000, "Others", author)) |> aggregate(revision ~ author2, FUN = sum) |> transform(label = paste0(round(revision / sum(revision), digits = 4) * 100, "%")) |> ggplot(aes(x = 1, fill = author2, y = revision)) + geom_col(position = "fill", show.legend = FALSE, color = "white") + scale_y_continuous(labels = scales::label_percent()) + coord_polar(theta = "y") + geom_text(aes(x = 1.2, label = author2), position = position_fill(vjust = 0.5), color = "black" ) + geom_text(aes(x = 1.7, label = label), position = position_fill(vjust = 0.5), color = "black" ) + theme_void() + labs(x = NULL, y = NULL) + xlim(c(0.2, 1.7)) ### 2.3.3 扇形饼图 扇形饼图又叫风玫瑰图或南丁格尔图 aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> transform(author2 = ifelse(revision < 2000, "Others", author)) |> aggregate(revision ~ author2, FUN = sum) |> ggplot(aes(x = reorder(author2, revision), y = revision)) + geom_col(aes(fill = author2), show.legend = FALSE) + coord_polar() + theme_minimal() + theme(axis.text.y = element_blank()) + labs(x = NULL, y = NULL) ### 2.3.4 帕累托图 除了饼图，还常用堆积柱形图描述各个部分的数量，柱形图的优势在于简洁，准确，兼顾对比和趋势。下 图 2.30 描述各年开发者们的贡献量及其变化趋势，饼图无法表达数量的变化趋势。 aggregate(data = svn_trunk_log, revision ~ year + author, FUN = length) |> ggplot(aes(x = year, y = revision, fill = author)) + geom_col() + theme_classic() + coord_cartesian(expand = FALSE) + theme(legend.position = "bottom") + labs(x = "年份", y = "提交量", fill = "开发者") 百分比堆积柱形图在数量堆积柱形图的基础上，将纵坐标的数量转化为百分比，下 图 2.31 展示各年开发者代码提交比例的变化趋势。 aggregate(data = svn_trunk_log, revision ~ year + author, FUN = length) |> ggplot(aes(x = year, y = revision, fill = author)) + geom_col(position = "fill") + scale_y_continuous(labels = scales::label_percent()) + theme_classic() + coord_cartesian(expand = FALSE) + theme(legend.position = "bottom") + labs(x = "年份", y = "提交量", fill = "开发者") 帕累托图描述各个部分的占比，特别是突出关键要素的占比。收入常服从帕累托分布，这是一个幂率分布，比如 80% 的财富集中在 20% 的人的手中。下 图 2.32 展示过去 25 年各位开发者的代码累计提交量，提交量小于 1000 的已经合并为一类。不难看出，Ripley 的提交量远高于其他开发者。 dat <- aggregate(data = svn_trunk_log, revision ~ author, FUN = length) |> transform(author = ifelse(revision < 1000, "Others", author)) |> aggregate(revision ~ author, FUN = sum) dat <- dat[order(-dat$revision), ]

ggplot(data = dat, aes(x = reorder(author, revision, decreasing = T), y = revision)) +
geom_col(width = 0.75) +
geom_line(aes(y = cumsum(revision), group = 1)) +
geom_point(aes(y = cumsum(revision))) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
labs(x = "维护者", y = "累计提交量")

### 2.3.5 马赛克图

library(ggmosaic)
geom_mosaic(aes(x = product(Dept, Gender), weight = Freq, fill = Admit)) +
theme_minimal()

Base R 提供函数 plot()mosaicplot()table 表格类型的数据可视化，提供一套公式绘图语法，可以绘制类似的马赛克图。

mosaicplot(~ Gender + Dept + Admit,
data = UCBAdmissions, color = TRUE,
main = "", xlab = "性别", ylab = "院系"
)

### 2.3.6 矩阵树图

ggplot(G20, aes(area = gdp_mil_usd, fill = region, label = country, subgroup = region)) +
geom_treemap() +
geom_treemap_text(grow = T, reflow = T, colour = "black") +
facet_wrap(~hemisphere) +
scale_fill_brewer(palette = "Set1") +
theme(legend.position = "bottom") +
labs(title = "G20 主要经济体", fill = "区域")

### 2.3.7 量表图

1. Strongly disagree （强烈反对），
2. Disagree（反对），
3. Neither agree nor disagree（中立），
4. Agree（同意），
5. Strongly agree（强烈同意）。

Jason M. Bryer 开发了一个 R 包 likert，特别适合调查研究数据可视化，将研究对象的态度以直观有效的方式展示出来，内置多个数据集，其中 表格 2.7 是一个数学焦虑量表调查的结果，调查数据来自统计课上的 20 个学生。

# 数据来自 likert 包
# 宽转长格式
MathAnxiety_df <- reshape(data = MathAnxiety,
varying = c("Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"),
times = c("Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"),
timevar = "Attitude", v.names = "Numbers",  idvar = "Item",
new.row.names = 1:(5 * 14), direction = "long"
)

MathAnxiety_df$Attitude <- factor(MathAnxiety_df$Attitude, levels = c(
"Strongly Agree", "Agree", "Neutral", "Disagree", "Strongly Disagree"
), labels =  c(
"强烈同意", "同意", "中立", "反对", "强烈反对"
), ordered = TRUE)

ggplot(data = MathAnxiety_df, aes(x = Numbers, y = Item)) +
geom_col(aes(fill = Attitude), position = "fill") +
scale_x_continuous(labels = scales::label_percent()) +
scale_y_discrete(labels = scales::label_wrap(25)) +
scale_fill_brewer(palette = "BrBG", direction = -1) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
coord_cartesian(expand = FALSE) +
labs(x = "占比", y = "问题", fill = "态度")

likert 包的函数 likert() 适合对聚合的调查数据绘图。

library(likert)
lmath <- likert(summary = MathAnxiety)
plot(lmath)

ggstats 包的函数 gglikert() 适合对明细的调查数据绘图。下面模拟一次调查收集到的数据，共计 150 人回答 6 个问题，每个问题都有 5 个候选项构成。

library(ggstats)
likert_levels <- c("强烈反对", "反对", "中立", "同意", "强烈同意")
set.seed(2023)
library(data.table)
df <- data.table(
q1 = sample(likert_levels, 150, replace = TRUE),
q2 = sample(likert_levels, 150, replace = TRUE, prob = 5:1),
q3 = sample(likert_levels, 150, replace = TRUE, prob = 1:5),
q4 = sample(likert_levels, 150, replace = TRUE, prob = 1:5),
q5 = sample(c(likert_levels, NA), 150, replace = TRUE),
q6 = sample(likert_levels, 150, replace = TRUE, prob = c(1, 0, 1, 1, 0))
)
fkt <- paste0("q", 1:6)
df[, (fkt) := lapply(.SD, factor, levels = likert_levels), .SDcols = fkt]

gglikert(df)

## 2.4 习题

1. 根据 Github 代码提交量数据制作日历图。

github_ctb <- jsonlite::read_json(path = "data/contributions.json")
github_df <- data.frame(
date = unlist(lapply(github_ctb$contributions, "[[", "date")), count = unlist(lapply(github_ctb$contributions, "[[", "count")),
color = unlist(lapply(github_ctb$contributions, "[[", "color")), intensity = unlist(lapply(github_ctb$contributions, "[[", "intensity"))
)
week.abb <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
github_df <- within(github_df, {
date <- as.Date(date)
year <- format(date, format = "%Y", tz = "UTC")
month <- format(date, format = "%m", tz = "UTC")
week <- format(date, format = "%U", tz = "UTC")
wday <- format(date, format = "%a", tz = "UTC")
nday <- format(date, format = "%j", tz = "UTC")
week <- as.integer(week)
wday <- factor(wday, labels = week.abb)
})
ggplot(
data = subset(github_df, subset = year %in% 2020:2022),
aes(x = week, y = wday, fill = count)
) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_distiller(palette = "Greens", direction = 1) +
scale_x_continuous(
expand = c(0, 0), breaks = seq(1, 52, length = 12), labels = month.abb
) +
facet_wrap(~year, ncol = 1) +
theme_minimal() +
labs(x = "月份", y = "星期", fill = "提交量")