第 11 章 数据可视化
library(ggplot2) # ggplot2 图形
library(patchwork) # 图形布局
library(magrittr) # 管道操作
library(ggrepel) # 文本注释
library(extrafont) # 加载外部字体 TTF
library(hrbrthemes) # 主题
library(maps) # 地图数据
library(mapdata) # 地图数据
library(xkcd) # 漫画字体
library(RgoogleMaps) # 静态地图
library(data.table) # 数据操作
library(KernSmooth) # 核平滑
library(ggnormalviolin) # 提琴图
library(ggbeeswarm) # 蜂群图
library(gert) # Git 数据操作
library(ggridges) # 岭线图
library(ggpubr) # 组合图
library(treemap) # 树状图
library(treemapify) # 树状图
library(ggquiver) # 向量场图
library(ggstream) # 水流图
library(timelineS) # 时间线
library(ggdendro) # 聚类图
library(ggfortify) # 统计分析结果可视化:主成分图
library(gganimate) # 动态图
David Robinson 给出为何使用 ggplot2 33 当然也有 Jeff Leek 指出在某些重要场合不适合 ggplot2 34 并且给出强有力的 证据,其实不管怎么样,适合自己的才是好的。也不枉费 Garrick Aden-Buie 花费 160 页幻灯片逐步分解介绍 优雅的ggplot2,Malcolm Barrett 也介绍了 ggplot2 基础用法,还有 Selva Prabhakaran 精心总结给出了 50 个 ggplot2 数据可视化的 例子 以及 Victor Perrier 为小白用 ggplot2 操碎了心地开发 RStudio 插件 esquisse 包,Claus O. Wilke 教你一步步创建出版级的图形 https://github.com/clauswilke/practical_ggplot2。
ggplot2 是十分方便的统计作图工具,相比 Base R,为了一张出版级的图形,不需要去调整每个参数,实现快速出图。集成了很多其它统计计算的 R 包,支持丰富的统计分析和计算功能,如回归、平滑等,实现了作图和模型的无缝连接。比如图11.1,使用 loess 局部多项式平滑得到数据的趋势,不仅仅是散点图,代码量也非常少。
ggplot(mpg, aes(displ, hwy)) +
geom_point(aes(color = class)) +
geom_smooth(se = TRUE, method = "loess") +
labs(
title = "Fuel efficiency generally decreases with engine size",
subtitle = "Two seaters (sports cars) are an exception because of their light weight",
caption = "Data from fueleconomy.gov"
)

图 11.1: 简洁美观
故事源于一幅图片,我不记得第一次见到这幅图是什么时候了,只因多次在多个场合中见过,所以留下了深刻的印象,后来才知道它出自于一篇博文 — Using R packages and education to scale Data Science at Airbnb,作者 Ricardo Bion 还在其 Github 上传了相关代码35。
如 Berton Gunter 所说,数据可视化只是一种手段,根据数据实际情况作展示才是重要的,并不是要追求酷炫。
3-D bar plots are an abomination. Just because Excel can do them doesn’t mean you should. (Dismount pulpit).
— Berton Gunter 36
grid 是 lattice 和 ggplot2 的基础,gganimate 是 ggplot2 一个扩展,它将静态图形视为帧,调用第三方工具合成 GIF 动图或 MP4 视频等,要想深入了解 ggplot2,可以去看 Hadley Wickham, Danielle Navarro, and Thomas Lin Pedersen 合著的《ggplot2: elegant graphics for data analysis》第三版 https://ggplot2-book.org/。
11.1 元素
以数据集 airquality 为例介绍 GGplot2 图层、主题、配色、坐标、尺度、注释和组合等
11.1.1 图层
ls("package:ggplot2", pattern = "^geom_")
## [1] "geom_abline" "geom_area" "geom_bar"
## [4] "geom_bin_2d" "geom_bin2d" "geom_blank"
## [7] "geom_boxplot" "geom_col" "geom_contour"
## [10] "geom_contour_filled" "geom_count" "geom_crossbar"
## [13] "geom_curve" "geom_density" "geom_density_2d"
## [16] "geom_density_2d_filled" "geom_density2d" "geom_density2d_filled"
## [19] "geom_dotplot" "geom_errorbar" "geom_errorbarh"
## [22] "geom_freqpoly" "geom_function" "geom_hex"
## [25] "geom_histogram" "geom_hline" "geom_jitter"
## [28] "geom_label" "geom_line" "geom_linerange"
## [31] "geom_map" "geom_path" "geom_point"
## [34] "geom_pointrange" "geom_polygon" "geom_qq"
## [37] "geom_qq_line" "geom_quantile" "geom_raster"
## [40] "geom_rect" "geom_ribbon" "geom_rug"
## [43] "geom_segment" "geom_sf" "geom_sf_label"
## [46] "geom_sf_text" "geom_smooth" "geom_spoke"
## [49] "geom_step" "geom_text" "geom_tile"
## [52] "geom_violin" "geom_vline"
生成一个散点图
ggplot(airquality, aes(x = Temp, y = Ozone)) + geom_point()
## Warning: Removed 37 rows containing missing values (geom_point).

11.1.2 标签
图形的标签分为横纵轴标签、刻度标签、主标题、副标题等
data.frame(
dates = seq.Date(
from = as.Date("1945-01-01"),
to = as.Date("1974-12-31"),
by = "quarter"
),
presidents = as.vector(presidents)
) |>
ggplot(aes(x = dates, y = presidents)) +
geom_line(color = "slategray", na.rm = TRUE) +
geom_point(size = 1.5, color = "darkslategray", na.rm = TRUE) +
scale_x_date(date_breaks = "4 year", date_labels = "%Y") +
labs(
title = "1945年至1974年美国总统每季度支持率",
x = "年份", y = "支持率 (%)",
caption = "数据源: R 包 datasets"
) +
theme_minimal(base_size = 10.54, base_family = "Noto Serif CJK SC")

图 11.2: 自1945年第一季度至1974年第四季度美国总统的支持
11.1.3 注释
图中注释的作用在于高亮指出关键点,提请读者注意。文本注释可由 ggrepel 包提供的标签图层 geom_label_repel()
添加,标签数据可独立于之前的数据层,标签所在的位置可以通过参数 direction
和 nudge_y
精调,图 11.3 模拟了一组数据。
set.seed(2020)
library(ggrepel)
dat <- data.frame(
x = seq(100),
y = cumsum(rnorm(100))
)
anno_data <- dat |>
subset(x %% 25 == 10) |>
transform(text = "text")
ggplot(data = dat, aes(x, y)) +
geom_line() +
geom_label_repel(aes(label = text),
data = anno_data,
direction = "y",
nudge_y = c(-5, 5, 5, 5)
) +
theme_minimal()

图 11.3: 文本注释
ggrepel 包的图层 geom_text_repel()
支持所有数据点的注释,并且自动调整文本的位置,防止重叠,增加辨识度,如图 11.4。当然,数据点如果过于密集也不适合全部注释,高亮其中的关键点即可。
mtcars |>
transform(cyl = as.factor(cyl)) |>
ggplot(aes(wt, mpg, label = rownames(mtcars), color = cyl)) +
geom_point() +
geom_text_repel(max.overlaps = 12) +
theme_minimal()

图 11.4: 少量点的情况下可以全部注释,且可以解决注释重叠的问题
Claus Wilke 开发的 ggtext 包支持更加丰富的注释样式,详见网站 https://wilkelab.org/ggtext/
ls("package:ggplot2", pattern = "^annotation_")
## [1] "annotation_custom" "annotation_logticks" "annotation_map"
## [4] "annotation_raster"
ggplot(airquality, aes(x = Temp, y = Ozone)) +
geom_point(na.rm = TRUE)

ggplot(airquality, aes(x = Temp, y = Ozone)) +
geom_point(na.rm = TRUE) +
labs(title = substitute(paste(d *
bolditalic(x)[italic(t)] == alpha * (theta - bolditalic(x)[italic(t)]) *
d * italic(t) + lambda * d * italic(B)[italic(t)]), list(lambda = 4)))

geomtextpath 曲线上的文字随曲线弯曲变化
ggsvg 曲线上散点以图片、彩色图标表示
11.1.4 刻度
ls("package:ggplot2", pattern = "^scale_(x|y)_")
## [1] "scale_x_binned" "scale_x_continuous" "scale_x_date"
## [4] "scale_x_datetime" "scale_x_discrete" "scale_x_log10"
## [7] "scale_x_reverse" "scale_x_sqrt" "scale_x_time"
## [10] "scale_y_binned" "scale_y_continuous" "scale_y_date"
## [13] "scale_y_datetime" "scale_y_discrete" "scale_y_log10"
## [16] "scale_y_reverse" "scale_y_sqrt" "scale_y_time"
range(airquality$Temp, na.rm = TRUE)
## [1] 56 97
range(airquality$Ozone, na.rm = TRUE)
## [1] 1 168
ggplot(airquality, aes(x = Temp, y = Ozone)) +
geom_point(na.rm = TRUE) +
scale_x_continuous(breaks = seq(50, 100, 5)) +
scale_y_continuous(breaks = seq(0, 200, 20))

11.1.5 图例
二维的图例 biscale 和 multiscales 和 ggnewscale
11.1.6 坐标系
极坐标,直角坐标
ls("package:ggplot2", pattern = "^coord_")
## [1] "coord_cartesian" "coord_equal" "coord_fixed" "coord_flip"
## [5] "coord_map" "coord_munch" "coord_polar" "coord_quickmap"
## [9] "coord_sf" "coord_trans"
11.1.8 配色
ls("package:ggplot2", pattern = "^scale_(color|fill)_")
## [1] "scale_color_binned" "scale_color_brewer" "scale_color_continuous"
## [4] "scale_color_date" "scale_color_datetime" "scale_color_discrete"
## [7] "scale_color_distiller" "scale_color_fermenter" "scale_color_gradient"
## [10] "scale_color_gradient2" "scale_color_gradientn" "scale_color_grey"
## [13] "scale_color_hue" "scale_color_identity" "scale_color_manual"
## [16] "scale_color_ordinal" "scale_color_steps" "scale_color_steps2"
## [19] "scale_color_stepsn" "scale_color_viridis_b" "scale_color_viridis_c"
## [22] "scale_color_viridis_d" "scale_fill_binned" "scale_fill_brewer"
## [25] "scale_fill_continuous" "scale_fill_date" "scale_fill_datetime"
## [28] "scale_fill_discrete" "scale_fill_distiller" "scale_fill_fermenter"
## [31] "scale_fill_gradient" "scale_fill_gradient2" "scale_fill_gradientn"
## [34] "scale_fill_grey" "scale_fill_hue" "scale_fill_identity"
## [37] "scale_fill_manual" "scale_fill_ordinal" "scale_fill_steps"
## [40] "scale_fill_steps2" "scale_fill_stepsn" "scale_fill_viridis_b"
## [43] "scale_fill_viridis_c" "scale_fill_viridis_d"
ggplot(airquality, aes(x = Temp, y = Ozone, color = as.factor(Month))) +
geom_point(na.rm = TRUE)

ggplot(airquality, aes(x = Temp, y = Ozone, color = as.ordered(Month))) +
geom_point(na.rm = TRUE)

11.1.9 主题
ggcharts 和 bbplot prettyB 美化 Base R 图形 ggprism
ls("package:ggplot2", pattern = "^theme_")
## [1] "theme_bw" "theme_classic" "theme_dark" "theme_get"
## [5] "theme_gray" "theme_grey" "theme_light" "theme_linedraw"
## [9] "theme_minimal" "theme_replace" "theme_set" "theme_test"
## [13] "theme_update" "theme_void"
这里只展示 theme_bw()
theme_void()
theme_minimal()
和 theme_void()
等四个常见主题,更多主题参考 ggsci、ggthemes 、ggtech、hrbrthemes、clcharts 和 ggthemr 包
ggplot(airquality, aes(x = Temp, y = Ozone), na.rm = TRUE) +
geom_point() +
theme_bw()
## Warning: Removed 37 rows containing missing values (geom_point).
ggplot(airquality, aes(x = Temp, y = Ozone), na.rm = TRUE) +
geom_point() +
theme_void()
## Warning: Removed 37 rows containing missing values (geom_point).
ggplot(airquality, aes(x = Temp, y = Ozone), na.rm = TRUE) +
geom_point() +
theme_minimal()
## Warning: Removed 37 rows containing missing values (geom_point).
ggplot(airquality, aes(x = Temp, y = Ozone), na.rm = TRUE) +
geom_point() +
theme_classic()
## Warning: Removed 37 rows containing missing values (geom_point).




图 11.5: ggplot2 内置的主题
11.1.10 布局
ggplot(airquality) +
geom_point(aes(x = Temp, y = Ozone), na.rm = TRUE) +
facet_wrap(~ as.ordered(Month))

ggplot(airquality) +
geom_point(aes(x = Temp, y = Ozone), na.rm = TRUE) +
facet_wrap(~ as.ordered(Month), nrow = 1)

cowplot 是以作者 Claus O. Wilke 命名的,用来组合 ggplot 对象画图,类似的组合图形的功能包还有 baptiste auguié 开发的 gridExtra 和 egg, Thomas Lin Pedersen 开发的 patchwork
Dean Attali 开发的 ggExtra 可以在图的边界添加密度估计曲线,直方图等
11.2 字体
firatheme 包提供基于 fira sans 字体的 ggplot2 主题,类似的字体主题包还有 trekfont 、 fontHind, fontquiver 包与 fontBitstreamVera(Bitstream Vera 字体)、 fontLiberation(Liberation 字体)包和 fontDejaVu (DejaVu 字体)包一道提供了一些可允许使用的字体文件,这样,我们可以不依赖系统制作可重复的图形。Thomas Lin Pedersen 开发的 systemfonts 可直接使用系统自带的字体。
11.2.1 系统字体
以 CentOS 系统为例,软件仓库中包含 Noto , DejaVu 、liberation 等字体。可以安装自己喜欢的字体类型,比如:
sudo dnf install -y \
\
google-noto-mono-fonts \
google-noto-sans-fonts \
google-noto-serif-fonts \
dejavu-sans-mono-fonts \
dejavu-sans-fonts
dejavu-serif-fonts# 或者
sudo dnf install -y dejavu-fonts liberation-fonts
liberation 系列的四款字体可以用来替换 Windows 系统上对应的四款字体,对应关系见表 11.1
CentOS 系统 | Windows 系统 | |
---|---|---|
衬线体/宋体 | liberation-serif-fonts | Times New Roman |
无衬线体/黑体 | liberation-sans-fonts | Arial |
Arial 的细瘦版 | liberation-narrow-fonts | Arial Narrow |
等宽体/微软雅黑 | liberation-mono-fonts | Courier New |
Lionel Henry 将 Liberation 系列字体打包到 R 包 fontLiberation,非常便携,不需要操心跨平台的字体安装了。那如何使用呢?
# install.packages("fontLiberation")
system.file(package = "fontLiberation", "fonts", "liberation-fonts")
## [1] ""
此外,我们还可以从网上获取各种个样的字体,特别地,Boryslav Larin 收录的 awesome-fonts 列表是一个不错的开始,比如图标字体 Font-Awesome,
sudo dnf install -y fontawesome-fonts
再安装宏包 fontawesome 后,即可在 LaTeX 文档中使用,下面这个示例推荐用 XeLaTeX 引擎编译。
\documentclass[border=10pt]{standalone}
\usepackage{fontawesome}
\begin{document}
\faGithub
Hello, \end{document}
而在 R 绘制的图形中,通过指定 par()
、 plot()
、 title()
等函数的 family
参数值,比如 family = "Liberation Sans"
来调用系统无衬线 Liberation 字体,效果见图 11.6。
library(extrafont)
plot(data = pressure, pressure ~ temperature,
xlab = "Temperature (deg C)", ylab = "Pressure (mm of Hg)",
col.lab = "red", col.axis = "blue",
font.lab = 3, font.axis = 2, family = "Liberation Sans")
title(main = "Vapor Pressure of Mercury as a Function of Temperature",
family = "Liberation Serif", font.main = 3)
title(sub = "Data Source: Weast, R. C",
family = "Liberation Mono", font.sub = 1)

图 11.6: 调用系统字体绘图
为了符合出版的要求,需要在 11.6 中嵌入字体,
# embed fonts to pdf
embed_fonts <- function(fig_path) {
if(knitr::is_latex_output()){
embedFonts(
file = fig_path, outfile = fig_path,
fontpaths = "~/Library/Fonts"
)
}
return(fig_path)
}
设置代码块选项 fig.process=embed_fonts
,这样生成 PDF 格式图形的时候,会调用此函数处理 PDF 图形。在 ggplot2 绘图中的调用方式是类似的,便不再赘述了。值得注意的是,extrafont 和 showtext 有些不一样,前者只能处理系统字体,后者还能获取网络字体和使用 OTF 字体,下面从 Google 开源的字体库获取 Noto 系列的四款字体,如图 11.7。
sysfonts::font_add_google(name = "Noto Sans", family = "Noto Sans")
sysfonts::font_add_google(name = "Noto Serif", family = "Noto Serif")
sysfonts::font_add_google(name = "Noto Serif SC", family = "Noto Serif SC")
sysfonts::font_add_google(name = "Noto Sans SC", family = "Noto Sans SC")
在本书中,不要全局加载 showtext 包或调用 showtext::showtext_auto()
,会和 extrafont 冲突,使得绘图时默认就只能使用 showtext 提供的字体。extrafont 包提供的函数 font_import()
仅支持系统安装的 TrueType/Type1 字体
p1 <- ggplot(pressure, aes(x = temperature, y = pressure)) +
geom_point() +
ggtitle(label = "默认字体设置")
p2 <- p1 + theme(
axis.title = element_text(family = "Noto Sans"),
axis.text = element_text(family = "Noto Serif")
) +
theme(
title = element_text(family = "Noto Serif SC")
) +
ggtitle(label = "英文字体设置")
p3 <- p1 + labs(x = "温度", y = "压力") +
theme(
axis.title = element_text(family = "Noto Serif SC"),
axis.text = element_text(family = "Noto Serif")
) +
ggtitle(label = "中文字体设置")
p4 <- p1 + labs(
x = "温度", y = "压力", title = "散点图",
subtitle = "Vapor Pressure of Mercury as a Function of Temperature",
caption = paste("Data on the relation
between temperature in degrees Celsius and",
"vapor pressure of mercury in millimeters (of mercury).",
sep = "\n"
)
) +
theme(
axis.title = element_text(family = "Noto Serif SC"),
axis.text.x = element_text(family = "Noto Serif"),
axis.text.y = element_text(family = "Noto Sans"),
title = element_text(family = "Noto Serif SC"),
plot.subtitle = element_text(family = "Noto Sans", size = rel(0.7)),
plot.caption = element_text(family = "Noto Sans", size = rel(0.6))
) +
ggtitle(label = "任意字体设置")
(p1 + p2) / (p3 + p4)

图 11.7: 在 ggplot2 绘图系统中设置中英文字体
另外值得一提的是 hrbrthemes 包,除了定制了很多 ggplot2 主题,它还打包了很多的字体主题。比如默认主题 theme_ipsum()
使用 Arial Narrow 字体,如果没有该字体就自动寻找系统中的替代品,如图 11.8 实际使用的是 Nimbus Sans Narrow 字体,因为在 GitHub Action 中,我实际使用的测试环境是 Ubuntu 20.04,该系统自带 Nimbus Sans Narrow 字体,Arial Narrow 毕竟是 Windows 上的闭源字体。
# brew install font-roboto
# 导入字体
# hrbrthemes::import_roboto_condensed()
sysfonts::font_add_google(name = "Roboto Condensed", family = "Roboto Condensed")
library(hrbrthemes)
ggplot(mtcars, aes(mpg, wt)) +
geom_point() +
labs(
x = "Fuel efficiency (mpg)", y = "Weight (tons)",
title = "Seminal ggplot2 scatterplot example",
subtitle = "A plot that is only useful for demonstration purposes",
caption = "Brought to you by the letter 'g'"
) +
theme_ipsum(base_family = "Roboto Condensed")

图 11.8: 调用 hrbrthemes 包设置字体主题
如果系统没有安装 Arial Narrow 字体,可以导入 hrbrthemes 包自带的一些字体,比如 hrbrthemes::import_roboto_condensed()
,然后调用字体主题 theme_ipsum_rc()
。如果不想使用这个包自带的字体,可以用系统中安装的字体去修改主题 theme_ipsum()
和 theme_ipsum_rc()
中的字体设置。如图 11.9 使用了 theme_ipsum()
中的 Arial Narrow 字体。
ggplot(mtcars, aes(mpg, wt)) +
geom_point() +
labs(
x = "Fuel efficiency (mpg)", y = "Weight (tons)",
title = "Seminal ggplot2 scatterplot example",
subtitle = "A plot that is only useful for demonstration purposes",
caption = "Brought to you by the letter 'g'"
) +
theme_ipsum(base_family = "Noto Sans")

图 11.9: 默认字体 Arial Narrow
hrbrthemes 包提供了一个全局字体加载选项 hrbrthemes.loadfonts
,如果设置为 TRUE,即 options(hrbrthemes.loadfonts = TRUE)
会先调用函数 extrafont::loadfonts()
预加载系统字体,就不用一次次手动加载字体了。后续在第 11.2.3 节还会提及 extrafont 包的其它功能。
11.2.2 思源字体
邱怡轩开发的 showtext 包支持丰富的外部字体,支持 Base R 和 ggplot2 图形,图 11.10 嵌入了 5 号思源宋体,图例和坐标轴文本使用 serif 字体,更多详细的使用文档见 [15]。
# 安装 showtext 包
install.packages('showtext')
# 思源宋体
showtextdb::font_install(showtextdb::source_han_serif())
# 思源黑体
showtextdb::font_install(showtextdb::source_han_sans())
# ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
# geom_point(aes(colour = Species)) +
# scale_colour_brewer(palette = "Set1") +
# labs(
# title = "鸢尾花数据的散点图",
# x = "萼片长度", y = "萼片宽度", colour = "鸢尾花类别",
# caption = "鸢尾花数据集最早见于 Edgar Anderson (1935) "
# ) +
# theme(
# title = element_text(family = "source-han-sans-cn"),
# axis.title = element_text(family = "source-han-serif-cn"),
# legend.title = element_text(family = "source-han-serif-cn")
# )
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
geom_point(aes(colour = Species)) +
scale_colour_brewer(palette = "Set1") +
labs(
title = "鸢尾花数据的散点图",
x = "萼片长度", y = "萼片宽度", colour = "鸢尾花类别",
caption = "鸢尾花数据集最早见于 Edgar Anderson (1935) "
) +
theme(
title = element_text(family = "Noto Sans SC"),
axis.title = element_text(family = "Noto Serif SC"),
legend.title = element_text(family = "Noto Serif SC")
)

图 11.10: showtext 包处理图里的中文
11.2.3 数学字体
Winston Chang 将 Paul Murrell 的 Computer Modern 字体文件打包成 fontcm 包 [16],fontcm 包可以在 Base R 图形中嵌入数学字体 37,图形中嵌入重音字符 38。 下面先下载、安装、加载字体,
library(extrafont)
if (!"fontcm" %in% .packages(T)) {
install.packages("fontcm")
}
查看可被 pdf()
图形设备使用的字体列表
# 可用的字体
fonts()
fontcm 包提供数学字体,grDevices::embedFonts()
函数调用 Ghostscript 软件将数学字体嵌入 ggplot2 图形中,达到正确显示数学公式的目的,此方法适用于 pdf 设备保存的图形,对 cairo_pdf()
保存的 PDF 格式图形无效。
library(fontcm)
library(ggplot2)
library(extrafont)
library(patchwork)
p <- ggplot(
data = data.frame(x = c(1, 5), y = c(1, 5)),
aes(x = x, y = y)
) +
geom_point() +
labs(
x = "Made with CM fonts", y = "Made with CM fonts",
title = "Made with CM fonts"
)
# 公式
eq <- "italic(sum(frac(1, n*'!'), n==0, infinity) ==
lim(bgroup('(', 1 + frac(1, n), ')')^n, n %->% infinity))"
# 默认字体
p1 <- p + annotate("text",
x = 3, y = 3,
parse = TRUE, label = eq # , family = "CM Roman"
)
# 使用 CM Roman 字体
p2 <- p + annotate("text",
x = 3, y = 3,
parse = TRUE, label = eq, family = "CM Roman"
) +
theme(
text = element_text(size = 10, family = "CM Roman"),
axis.title.x = element_text(face = "italic"),
axis.title.y = element_text(face = "bold")
)
p1 + p2
为实现图 ?? 的最终效果,需要启用一个有超级牛力的 fig.process 选项,主要是传递一个函数给它,对用 R 语言生成的图形再操作。
# embed math fonts to pdf
embed_math_fonts <- function(fig_path) {
if(knitr::is_latex_output()){
embedFonts(
file = fig_path, outfile = fig_path,
fontpaths = system.file("fonts", package = "fontcm")
)
}
return(fig_path)
}
代码块选项中设置 fig.process=embed_math_fonts
可在绘图后,立即插入字体,此操作仅限于以 pdf 格式保存的图形设备,也适用于 Base R 绘制的图形,见图 ??。
par(mar = c(4.1, 4.1, 1.5, 0.5), family = "CM Roman")
x <- seq(-4, 4, len = 101)
y <- cbind(sin(x), cos(x))
matplot(x, y,
type = "l", xaxt = "n",
main = expression(paste(
plain(sin) * phi, " and ",
plain(cos) * phi
)),
ylab = expression("sin" * phi, "cos" * phi),
xlab = expression(paste("Phase Angle ", phi)),
col.main = "blue"
)
axis(1,
at = c(-pi, -pi / 2, 0, pi / 2, pi),
labels = expression(-pi, -pi / 2, 0, pi / 2, pi)
)
11.2.4 TikZ 设备
与 11.2.3 小节不同,Ralf Stubner 维护的 tikzDevice 包提供了另一种嵌入数学字体的方式,其提供的 tikzDevice::tikz()
绘图设备将图形对象转化为 TikZ 代码,调用 LaTeX 引擎编译成 PDF 文档。安装后,先测试一下 LaTeX 编译环境是否正常。
tikzDevice::tikzTest()
##
## Active compiler:
## /home/runner/.TinyTeX/bin/x86_64-linux/xelatex
## XeTeX 3.141592653-2.6-0.999994 (TeX Live 2022)
## kpathsea version 6.3.4
## [1] 7.90259
确认没有问题后,下面图 11.11 的坐标轴标签,标题,图例等位置都支持数学公式,使用 tikzDevice 打造出版级的效果图。更多功能的介绍见 https://www.daqana.org/tikzDevice/。
x <- rnorm(10)
y <- x + rnorm(5, sd = 0.25)
model <- lm(y ~ x)
rsq <- summary(model)$r.squared
rsq <- signif(rsq, 4)
plot(x, y,
main = "Hello \\LaTeX!", xlab = "$x$", ylab = "$y$",
sub = "$\\mathcal{N}(x;\\mu,\\Sigma)$"
)
abline(model, col = "red")
mtext(paste0("Linear model: $R^{2}=", rsq, "$"), line = 0.5)
legend("bottomright",
legend = paste0(
"$y = ",
round(coef(model)[2], 3),
"x +",
round(coef(model)[1], 3),
"$"
),
bty = "n"
)

图 11.11: 线性回归模型
推荐的全局 LaTeX 环境配置如下:
options(
tinytex.engine = "xelatex",
tikzDefaultEngine = "xetex",
tikzDocumentDeclaration = "\\documentclass[tikz]{standalone}\n",
tikzXelatexPackages = c(
"\\usepackage[fontset=adobe]{ctex}",
"\\usepackage[default,semibold]{sourcesanspro}",
"\\usepackage{amsfonts,mathrsfs,amssymb}\n"
)
)
设置默认的 LaTeX 编译引擎为 XeLaTeX,相比于 PDFLaTeX,它对中文的兼容性更好,支持多平台下的中文环境,中文字体这里采用了 Adobe 的字体,默认加载了 mathrsfs 宏包支持 \mathcal
、\mathscr
等命令,此外, LaTeX 发行版采用谢益辉自定义的 TinyTeX。绘制独立的 PDF 图形的过程如下:
11.2.5 漫画字体
下载 XKCD 字体,并刷新系统字体缓存
mkdir -p ~/.fonts
curl -fLo ~/.fonts/xkcd.ttf http://simonsoftware.se/other/xkcd.ttf
fc-cache -fsv
将 XKCD 字体导入到 R 环境,以便后续被 ggplot2 图形设备调用。
-e 'library(extrafont);font_import(pattern="[X/x]kcd.ttf", prompt = FALSE)' R
下图是一个使用 xkcd 字体的简单例子,更多高级特性请看 xkcd 包文档 [17]
library(xkcd)
ggplot(aes(mpg, wt), data = mtcars) +
geom_point() +
theme_xkcd()
11.2.6 表情字体
余光创开发的 emojifont 包和 Hadley 开发的 emo 包,下面使用 Noto Emoji 字体,支持的表情图见 https://www.google.com/get/noto/help/emoji/food-drink/,下面给出一个示例。先从 GitHub 安装 emo 包,目前它还未正式发布到 CRAN 上。
remotes::install_github("hadley/emo")
除了安装 emo 包,系统需要先安装好 emoji 字体,图形才会正确地渲染出来,想调用更多 emoji 图标请参考 Emoji 速查手册,给出 emoji 对应的名字。
# CentOS
sudo dnf install -y google-noto-emoji-color-fonts \
google-noto-emoji-fonts# MacOS
brew cask install font-noto-color-emoji font-noto-emoji
data.frame(
category = c("pineapple", "apple", "watermelon", "mango", "pear"),
value = c(5, 4, 3, 6, 2)
) |>
transform(category = sapply(category, emo::ji)) |>
ggplot(aes(x = category, y = value)) +
scale_y_continuous(limits = c(2, 7)) +
geom_text(aes(label = category), size = 12, vjust = -0.5) +
theme_minimal()

图 11.12: 表情字体
Noto Color Emoji 字体在 MacOS 上有问题,为了跨平台的便携性,提供 emojifont 包的例子,要引入更多的依赖。
library(ggplot2)
library(emojifont)
names <- c("smile", "school", "office", "blush", "smirk", "heart_eyes")
n <- length(names):1
e <- sapply(names, emojifont::emoji)
dat <- data.frame(emoji_name = names, n = n, emoji = e, stringsAsFactors = F)
ggplot(data = dat, aes(emoji_name, n)) +
geom_bar(stat = "identity") +
scale_x_discrete(breaks = dat$emoji_name, labels = dat$emoji) +
theme(axis.text.y = element_text(size = 20, family = "EmojiOne")) +
coord_flip()
11.3 配色
配色真的是一门学问,有的人功力非常深厚,仅用黑白灰就可以创造出一个世界,如中国的水墨画,科波拉执导的《教父》,沃卓斯基姐妹执导的《黑客帝国》等。黑西装、白衬衫和黑领带是《黑客帝国》的经典元素,《教父》开场的黑西装、黑领结和白衬衫,尤其胸前的红玫瑰更是点睛之笔。导演将黑白灰和光影混合形成了层次丰富立体的画面,打造了一场视觉盛宴,无论是呈现在纸上还是银幕上都可以给人留下深刻的印象。正所谓食色性也,花花世界,岂能都是法印眼中的白骨!再说《红楼梦》里,芍药丛中,桃花树下,滴翠亭边,栊翠庵里,处处都是湘云、黛玉、宝钗、妙玉留下的四季诗歌。
为什么需要这么多颜色模式呢?主要取决于颜色输出的通道,比如印刷机,照相机,自然界,网页,人眼等,显示器因屏幕和分辨率的不同呈现的色彩数量是不一样的。读者大概都听说过 RGB、CMYK、AdobeRGB、sRGB、P3 广色域等名词,我想这主要归功于各大电子设备厂商的宣传。普清、高清、超高清、全高清、2K、4K、5K、视网膜屏,而 HSV、HCL 估计听说的人就少很多了。本节的目的是简单阐述背后的色彩原理,颜色模式及其之间的转化,在应对天花乱坠的销售时少交一些智商税,同时,告诉读者如何在 R 环境中使用色彩。早些时候我在统计之都论坛上发帖 – R语言绘图用调色板大全 https://d.cosx.org/d/419378,如果读者希望拿来即用,不妨去看看。
filled.contour(volcano, nlevels = 10, color.palette = terrain.colors)
filled.contour(volcano, nlevels = 10, color.palette = heat.colors)
filled.contour(volcano, nlevels = 10, color.palette = topo.colors)
filled.contour(volcano, nlevels = 10, color.palette = cm.colors)




图 11.13: R 3.6.0 以前的调色板
filled.contour(volcano,
nlevels = 10,
color.palette = function(n, ...) hcl.colors(n, "Grays", rev = TRUE, ...)
)
filled.contour(volcano,
nlevels = 10,
color.palette = function(n, ...) hcl.colors(n, "YlOrRd", rev = TRUE, ...)
)
filled.contour(volcano,
nlevels = 10,
color.palette = function(n, ...) hcl.colors(n, "purples", rev = TRUE, ...)
)
filled.contour(volcano,
nlevels = 10,
color.palette = function(n, ...) hcl.colors(n, "viridis", rev = FALSE, ...)
)




图 11.14: R 3.6.0 以后的调色板
hcl.colors()
函数是在 R 3.6.0 引入的,之前的 R 软件版本中没有,同时内置了 110 个调色板,详见 hcl.pals()
。
11.3.1 调色板
R 预置的灰色有224种,挑出其中的调色板
## [1] "darkgray" "darkgrey" "darkslategray" "darkslategray1"
## [5] "darkslategray2" "darkslategray3" "darkslategray4" "darkslategrey"
## [9] "dimgray" "dimgrey" "lightgray" "lightgrey"
## [13] "lightslategray" "lightslategrey" "slategray" "slategray1"
## [17] "slategray2" "slategray3" "slategray4" "slategrey"
gray_colors <- paste0(rep(c("slategray", "darkslategray"), each = 4), seq(4))
barplot(1:8, col = gray_colors, border = NA)

图 11.15: 灰度调色板
gray 与 grey 是一样的,类似 color 和 colour 的关系,可能是美式和英式英语的差别,且看
## [1] TRUE
gray100
代表白色,gray0
代表黑色,提取灰色调色板,去掉首尾部分是必要的
barplot(1:8,
col = gray.colors(8, start = .3, end = .9),
main = "gray.colors function", border = NA
)

图 11.16: 提取 10 种灰色做调色板
首先选择一组合适的颜色,比如从桃色到梨色,选择6种颜色,以此为基础,可以借助 grDevices::colorRampPalette()
函数扩充至想要的数目,用 graphics::rect()
函数预览这组颜色配制的调色板
# Colors from https://github.com/johannesbjork/LaCroixColoR
colors_vec <- c("#FF3200", "#E9A17C", "#E9E4A6",
"#1BB6AF", "#0076BB", "#172869")
# 代码来自 ?colorspace::rainbow_hcl
pal <- function(n = 20, colors = colors, border = "light gray", ...) {
colorname <- (grDevices::colorRampPalette(colors))(n)
plot(0, 0,
type = "n", xlim = c(0, 1), ylim = c(0, 1),
axes = FALSE, ...
)
rect(0:(n - 1) / n, 0, 1:n / n, 1, col = colorname, border = border)
}
par(mar = rep(0, 4))
pal(n = 20, colors = colors_vec, xlab = "Colors from Peach to Pear", ylab = "")

图 11.17: 桃色至梨色的渐变
colorRampPalette()
自制调色板
create_palette <- function(n = 1000, colors = c("blue", "orangeRed")) {
color_palette <- colorRampPalette(colors)(n)
barplot(rep(1, times = n), col = color_palette,
border = color_palette, axes = FALSE)
}
par(mfrow = c(3, 1), mar = c(0.1, 0.1, 0.5, 0.1), xaxs = "i", yaxs = "i")
create_palette(n = 1000, colors = c("blue", "orangeRed"))
create_palette(n = 1000, colors = c("darkgreen", "yellow", "orangered"))
create_palette(n = 1000, colors = c("blue", "white", "orangered"))

图 11.18: colorRampPalette 自制调色板
par(mar = c(0, 4, 0, 0))
RColorBrewer::display.brewer.all()

图 11.19: RColorBrewer 调色板
# 代码来自 ?palettes
demo.pal <- function(n, border = if (n < 32) "light gray" else NA,
main = paste("color palettes: alpha = 1, n=", n),
ch.col = c(
"rainbow(n, start=.7, end=.1)", "heat.colors(n)",
"terrain.colors(n)", "topo.colors(n)",
"cm.colors(n)", "gray.colors(n, start = 0.3, end = 0.9)"
)) {
nt <- length(ch.col)
i <- 1:n
j <- n / nt
d <- j / 6
dy <- 2 * d
plot(i, i + d, type = "n", axes = FALSE, ylab = "", xlab = "", main = main)
for (k in 1:nt) {
rect(i - .5, (k - 1) * j + dy, i + .4, k * j,
col = eval(parse(text = ch.col[k])), border = border
)
text(2 * j, k * j + dy / 4, ch.col[k])
}
}
n <- if (.Device == "postscript") 64 else 16
# Since for screen, larger n may give color allocation problem
par(mar = c(0, 0, 2, 0))
demo.pal(n)

图 11.20: grDevices 调色板
par(mfrow = c(33, 1), mar = c(0, 0, .8, 0))
for (i in seq(32)) {
pal(
n = length((1 + 20 * (i - 1)):(20 * i)),
colors()[(1 + 20 * (i - 1)):(20 * i)],
main = paste(1 + 20 * (i - 1), "to", 20 * i)
)
}
pal(n = 17, colors()[641:657], main = "641 to 657")

图 11.21: grDevices 调色板
library(colorspace)
## a few useful diverging HCL palettes
par(mar = c(0,0,2,0), mfrow = c(16, 2))
pal(n = 16, diverge_hcl(16), main = "diverging HCL palettes")
pal(n = 16, diverge_hcl(16, h = c(246, 40), c = 96, l = c(65, 90)))
pal(n = 16, diverge_hcl(16, h = c(130, 43), c = 100, l = c(70, 90)))
pal(n = 16, diverge_hcl(16, h = c(180, 70), c = 70, l = c(90, 95)))
pal(n = 16, diverge_hcl(16, h = c(180, 330), c = 59, l = c(75, 95)))
pal(n = 16, diverge_hcl(16, h = c(128, 330), c = 98, l = c(65, 90)))
pal(n = 16, diverge_hcl(16, h = c(255, 330), l = c(40, 90)))
pal(n = 16, diverge_hcl(16, c = 100, l = c(50, 90), power = 1))
## sequential palettes
pal(n = 16, sequential_hcl(16), main= "sequential palettes")
pal(n = 16, heat_hcl(16, h = c(0, -100),
l = c(75, 40), c = c(40, 80), power = 1))
pal(n = 16, terrain_hcl(16, c = c(65, 0), l = c(45, 95), power = c(1/3, 1.5)))
pal(n = 16, heat_hcl(16, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
## compare base and colorspace palettes
## (in color and desaturated)
## diverging red-blue colors
pal(n = 16, diverge_hsv(16), main = "diverging red-blue colors")
pal(n = 16, diverge_hcl(16, c = 100, l = c(50, 90)))
pal(n = 16, desaturate(diverge_hsv(16)))
pal(n = 16, desaturate(diverge_hcl(16, c = 100, l = c(50, 90))))
## diverging cyan-magenta colors
pal(n = 16, cm.colors(16), main = "diverging cyan-magenta colors")
pal(n = 16, diverge_hcl(16, h = c(180, 330), c = 59, l = c(75, 95)))
pal(n = 16, desaturate(cm.colors(16)))
pal(n = 16, desaturate(diverge_hcl(16, h = c(180, 330), c = 59, l = c(75, 95))))
## heat colors
pal(n = 16, heat.colors(16), main = "heat colors")
pal(n = 16, heat_hcl(16))
pal(n = 16, desaturate(heat.colors(16)))
pal(n = 16, desaturate(heat_hcl(16)))
## terrain colors
pal(n = 16, terrain.colors(16), main = "terrain colors")
pal(n = 16, terrain_hcl(16))
pal(n = 16, desaturate(terrain.colors(16)))
pal(n = 16, desaturate(terrain_hcl(16)))
pal(n = 16, rainbow_hcl(16, start = 30, end = 300), main = "dynamic")
pal(n = 16, rainbow_hcl(16, start = 60, end = 240), main = "harmonic")
pal(n = 16, rainbow_hcl(16, start = 270, end = 150), main = "cold")
pal(n = 16, rainbow_hcl(16, start = 90, end = -30), main = "warm")

图 11.22: colorspace 调色板
除之前提到的 grDevices 包, colorspace (https://hclwizard.org/) 包 [18]–[20],RColorBrewer 包 [21] https://colorbrewer2.org/,viridis 包、colourvalues、wesanderson、dichromat 包、pals 包,palr 包,colorRamps 包、ColorPalette 包、colortools 包就不一一详细介绍了。
colormap 包基于 node.js 的 colormap 模块提供 44 个预定义的调色板 paletteer 包收集了很多 R 包提供的调色板,同时也引入了很多依赖。根据电影 Harry Potter 制作的调色板 harrypotter,根据网站 CARTO 设计的 rcartocolor 包,colorblindr 模拟色盲环境下的配色方案。
yarrr 包主要是为书籍 《YaRrr! The Pirate’s Guide to R》 https://github.com/ndphillips/ThePiratesGuideToR 提供配套资源,兼顾收集了一组调色板。
RColorBrewer 调色板数量必须至少 3 个,这是上游 colorbrewer 的 问题,具体体现在调用
RColorBrewer::brewer.pal(n = 2, name = "Set2")
时会有警告。 plotly 调用
[1] "#66C2A5" "#FC8D62" "#8DA0CB"
Warning message:
In RColorBrewer::brewer.pal(n = 2, name = "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
par(mar = c(1, 2, 1, 0), mfrow = c(3, 2))
set.seed(1234)
x <- sample(seq(8), 8, replace = FALSE)
barplot(x, col = palette(), border = "white")
barplot(x, col = heat.colors(8), border = "white")
barplot(x, col = gray.colors(8), border = "white")
barplot(x, col = "lightblue", border = "white")
barplot(x, col = colorspace::sequential_hcl(8), border = "white")
barplot(x, col = colorspace::diverge_hcl(8,
h = c(130, 43),
c = 100, l = c(70, 90)
), border = "white")

图 11.23: 源起
与图 11.85 对比,图11.24 的层次更加丰富,识别性更高
expand.grid(months = month.abb, years = 1949:1960) |>
transform(num = as.vector(AirPassengers)) |>
ggplot(aes(x = years, y = months, fill = num)) +
scale_fill_distiller(palette = "Spectral") +
geom_tile(color = "white", size = 0.4) +
scale_x_continuous(
expand = c(0.01, 0.01),
breaks = seq(1949, 1960, by = 1),
labels = 1949:1960
) +
theme_minimal(
base_size = 10.54,
base_family = "Noto Serif SC"
) +
labs(x = "年", y = "月", fill = "人数")

图 11.24: Spectral 调色板
再举例子,图 11.25 是正负例对比,其中好在哪里呢?这张图要表达美国黄石国家公园的老忠实泉间歇喷发的时间规律,那么好的标准就是层次分明,以突出不同颜色之间的时间差异。这个差异,还要看起来不那么费眼睛,一目了然最好。
erupt <- ggplot(faithfuld, aes(waiting, eruptions, fill = density)) +
geom_raster() +
scale_x_continuous(NULL, expand = c(0, 0)) +
scale_y_continuous(NULL, expand = c(0, 0)) +
theme(legend.position = "none")
p1 <- erupt + scale_fill_gradientn(colours = gray.colors(7))
p2 <- erupt + scale_fill_distiller(palette = "Spectral")
p3 <- erupt + scale_fill_gradientn(colours = terrain.colors(7))
p4 <- erupt + scale_fill_continuous(type = 'viridis')
(p1 + p2) / (p3 + p4)

图 11.25: 美国黄石国家公园的老忠实泉
RColorBrewer 包 提供了有序 (Sequential) 、定性 (Qualitative) 和发散 (Diverging) 三类调色板,一般来讲,分别适用于连续或有序分类变量、无序分类变量、两类分层对比变量的绘图。再加上强大的 ggplot2 包内置的对颜色处理的函数,如 scale_alpha_*
、 scale_colour_*
和 scale_fill_*
等,详见:
ls("package:ggplot2", pattern = "scale_col(ou|o)r_")
## [1] "scale_color_binned" "scale_color_brewer"
## [3] "scale_color_continuous" "scale_color_date"
## [5] "scale_color_datetime" "scale_color_discrete"
## [7] "scale_color_distiller" "scale_color_fermenter"
## [9] "scale_color_gradient" "scale_color_gradient2"
## [11] "scale_color_gradientn" "scale_color_grey"
## [13] "scale_color_hue" "scale_color_identity"
## [15] "scale_color_manual" "scale_color_ordinal"
## [17] "scale_color_steps" "scale_color_steps2"
## [19] "scale_color_stepsn" "scale_color_viridis_b"
## [21] "scale_color_viridis_c" "scale_color_viridis_d"
## [23] "scale_colour_binned" "scale_colour_brewer"
## [25] "scale_colour_continuous" "scale_colour_date"
## [27] "scale_colour_datetime" "scale_colour_discrete"
## [29] "scale_colour_distiller" "scale_colour_fermenter"
## [31] "scale_colour_gradient" "scale_colour_gradient2"
## [33] "scale_colour_gradientn" "scale_colour_grey"
## [35] "scale_colour_hue" "scale_colour_identity"
## [37] "scale_colour_manual" "scale_colour_ordinal"
## [39] "scale_colour_steps" "scale_colour_steps2"
## [41] "scale_colour_stepsn" "scale_colour_viridis_b"
## [43] "scale_colour_viridis_c" "scale_colour_viridis_d"
ls("package:ggplot2", pattern = "scale_fill_")
## [1] "scale_fill_binned" "scale_fill_brewer" "scale_fill_continuous"
## [4] "scale_fill_date" "scale_fill_datetime" "scale_fill_discrete"
## [7] "scale_fill_distiller" "scale_fill_fermenter" "scale_fill_gradient"
## [10] "scale_fill_gradient2" "scale_fill_gradientn" "scale_fill_grey"
## [13] "scale_fill_hue" "scale_fill_identity" "scale_fill_manual"
## [16] "scale_fill_ordinal" "scale_fill_steps" "scale_fill_steps2"
## [19] "scale_fill_stepsn" "scale_fill_viridis_b" "scale_fill_viridis_c"
## [22] "scale_fill_viridis_d"
colourlovers 包借助 XML, jsonlite 和 httr 包可以在线获取网站 COLOURlovers 的调色板
library(colourlovers)
palette1 <- clpalette('113451')
palette2 <- clpalette('92095')
palette3 <- clpalette('629637')
palette4 <- clpalette('694737')
使用调色板
layout(matrix(1:4, nrow = 2))
par(mar = c(2, 2, 2, 2))
barplot(VADeaths, col = swatch(palette1)[[1]], border = NA)
barplot(VADeaths, col = swatch(palette2)[[1]], border = NA)
barplot(VADeaths, col = swatch(palette3)[[1]], border = NA)
barplot(VADeaths, col = swatch(palette4)[[1]], border = NA)
调色板的描述信息
palette1
获取调色板中的颜色向量
swatch(palette1)[[1]]
11.3.2 颜色模式
不同的颜色模式,从 RGB 到 HCL 的基本操作 https://stat545.com/block018_colors.html
# https://github.com/hadley/ggplot2-book
hcl <- expand.grid(x = seq(-1, 1, length = 100), y = seq(-1, 1, length = 100)) |>
subset(subset = x^2 + y^2 < 1) |>
transform(
r = sqrt(x^2 + y^2)
) |>
transform(
h = 180 / pi * atan2(y, x),
c = 100 * r,
l = 65
) |>
transform(
colour = hcl(h, c, l)
)
# sin(h) = y / (c / 100)
# y = sin(h) * c / 100
cols <- scales::hue_pal()(5)
selected <- colorspace::RGB(t(col2rgb(cols)) / 255) %>%
as("polarLUV") %>%
colorspace::coords() %>%
as.data.frame() %>%
transform(
x = cos(H / 180 * pi) * C / 100,
y = sin(H / 180 * pi) * C / 100,
colour = cols
)
ggplot(hcl, aes(x, y)) +
geom_raster(aes(fill = colour)) +
scale_fill_identity() +
scale_colour_identity() +
coord_equal() +
scale_x_continuous("", breaks = NULL) +
scale_y_continuous("", breaks = NULL) +
geom_point(data = selected, size = 10, color = "white") +
geom_point(data = selected, size = 5, aes(colour = colour))

图 11.26: HCL调色
R 内置了 502 种不同颜色的名称,下面随机地选取 20 种颜色
## [1] "royalblue4" "plum1" "papayawhip" "darkslategray"
## [5] "darkturquoise" "gray79" "darkred" "maroon4"
## [9] "darkolivegreen4" "springgreen2" "orchid4" "lemonchiffon2"
## [13] "paleturquoise4" "gray49" "cyan" "antiquewhite1"
## [17] "yellow2" "gray13" "cadetblue2" "gray77"
R 包 grDevices 提供 hcl 调色板39 调制两个色板
# Colors from https://github.com/johannesbjork/LaCroixColoR
color_pal <- c("#FF3200", "#E9A17C", "#E9E4A6", "#1BB6AF", "#0076BB", "#172869")
n <- 16
more_colors <- (grDevices::colorRampPalette(color_pal))(n)
scales::show_col(colours = more_colors)

图 11.27: 桃色至梨色的渐变
# colors in colortools from http://www.gastonsanchez.com/
fish_pal <- c(
"#69D2E7", "#6993E7", "#7E69E7", "#BD69E7",
"#E769D2", "#E76993", "#E77E69", "#E7BD69",
"#D2E769", "#93E769", "#69E77E", "#69E7BD"
)
more_colors <- (grDevices::colorRampPalette(fish_pal))(n)
scales::show_col(colours = more_colors)

图 11.28: Hue-Saturation-Value (HSV) 颜色模型
rgb(red = 86, green = 180, blue = 233, maxColorValue = 255) # "#56B4E9"
## [1] "#56B4E9"
rgb(red = 0, green = 158, blue = 115, maxColorValue = 255) # "#009E73"
## [1] "#009E73"
rgb(red = 240, green = 228, blue = 66, maxColorValue = 255) # "#F0E442"
## [1] "#F0E442"
rgb(red = 0, green = 114, blue = 178, maxColorValue = 255) # "#0072B2"
## [1] "#0072B2"
举例子,直方图配色与不配色
# library(pander)
# evalsOptions('graph.unify', TRUE)
# panderOptions('graph.colors') 获取调色板
# https://www.fontke.com/tool/rgbschemes/ 在线配色
cols <- c(
"#56B4E9", "#009E73", "#F0E442", "#0072B2",
"#D55E00", "#CC79A7", "#999999", "#E69F00"
)
hist(mtcars$hp, col = "#56B4E9", border = "white", grid = grid())

图 11.29: 直方图
ggplot(mtcars) +
geom_histogram(aes(x = hp, fill = as.factor(..count..)),
color = "white", bins = 6
) +
scale_fill_manual(values = rep("#56B4E9", 10)) +
ggtitle("Histogram with ggplot2") +
theme_minimal() +
theme(legend.position = "none")

图 11.30: 直方图
11.3.2.1 RGB
红(red)、绿(green)、蓝(blue)是三原色
rgb(red, green, blue, alpha, names = NULL, maxColorValue = 1)
函数参数说明:
-
red, blue, green, alpha
取值范围\([0,M]\),\(M\) 是 maxColorValue -
names
字符向量,给这组颜色值取名 -
maxColorValue
红,绿,蓝三色范围的最大值
The colour specification refers to the standard sRGB colorspace (IEC standard 61966).
rgb 产生一种颜色,如 rgb(255, 0, 0, maxColorValue = 255)
的颜色是 "#FF0000"
,这是一串16进制数,每两个一组,那么一组有 \(16^2 = 256\) 种组合,整个一串有 \(256^3 = 16777216\) 种组合,这就是RGB表达的所有颜色。
11.3.2.3 HSV
Create a vector of colors from vectors specifying hue, saturation and value. 色相饱和度值
hsv(h = 1, s = 1, v = 1, alpha)
This function creates a vector of colors corresponding to the given values in HSV space. rgb and rgb2hsv for RGB to HSV conversion;
hsv函数通过设置色调、饱和度和亮度获得颜色,三个值都是0-1的相对量
RGB HSV HSL 都是不连续的颜色空间,缺点
11.3.2.4 HCL
基于感知的颜色空间替代RGB颜色空间
通过指定色相(hue),色度(chroma)和亮度(luminance/lightness),创建一组(种)颜色
hcl(h = 0, c = 35, l = 85, alpha, fixup = TRUE)
函数参数说明:
h 颜色的色调,取值范围为[0,360],0、120、240分别对应红色、绿色、蓝色
c 颜色的色度,其上界取决于色调和亮度
l 颜色的亮度,取值范围[0,100],给定色调和色度,只有一部分子集可用
alpha 透明度,取值范围[0,1],0 和1分别表示透明和不透明
This function corresponds to polar coordinates in the CIE-LUV color space
选色为什么这么难
色相与阴影相比是无关紧要的,色相对于标记和分类很有用,但表示(精细的)空间数据或形状的效果较差。颜色是改善图形的好工具,但糟糕的配色方案 (color schemes) 可能会导致比灰度调色板更差的效果。[18]
黑、白、灰,看似有三种颜色,其实只有一种颜色,黑和白只是灰色的两极,那么如何设置灰色梯度,使得人眼比较好区分它们呢?这样获得的调色板适用于什么样的绘图环境呢?
11.3.2.5 CMYK
印刷三原色:青 (cyan)、品红 (magenta)、黄 (yellow)
- 颜色模式转化
col2rgb()
、rgb2hsv()
和 rgb()
函数 hex2RGB()
函数 colorspace col2hcl()
函数 scales col2HSV()
colortools col2hex()
col2rgb("lightblue") # color to RGB
## [,1]
## red 173
## green 216
## blue 230
scales::col2hcl("lightblue") # color to HCL
## [1] "#ADD8E6"
# palr::col2hex("lightblue") # color to HEX
# colortools::col2HSV("lightblue") # color to HSV
rgb(173, 216, 230, maxColorValue = 255) # RGB to HEX
## [1] "#ADD8E6"
colorspace::hex2RGB("#ADD8E6") # HEX to RGB
## R G B
## [1,] 0.6784314 0.8470588 0.9019608
rgb(.678, .847, .902, maxColorValue = 1) # RGB to HEX
## [1] "#ADD8E6"
rgb2hsv(173, 216, 230, maxColorValue = 255) # RGB to HSV
## [,1]
## h 0.5409357
## s 0.2478261
## v 0.9019608
11.3.3 LaTeX 配色
LaTeX 宏包 xcolor 中定义颜色的常用方式有两种,其一,\textcolor{green!40!yellow}
表示 40% 的绿色和 60% 的黄色混合色彩,其二,\textcolor[HTML]{34A853}
HEX 表示的色彩直接在 LaTeX 文档中使用的方式,类似地 \textcolor[RGB]{52,168,83}
也表示 Google 图标中的绿色。
\documentclass[tikz,border=10pt]{standalone}
\begin{document}
\begin{tikzpicture}
\draw (0,0) rectangle (2,1) node [midway] {\textcolor[RGB]{52,168,83}{Hello} \textcolor[HTML]{34A853}{\TeX}};
\end{tikzpicture}
\end{document}
对应于 R 中的调用方式为:
rgb(52, 168, 83, maxColorValue = 255)
## [1] "#34A853"
11.3.4 ggplot2 配色
boxplot(weight ~ group,
data = PlantGrowth, col = "lightgray",
notch = FALSE, varwidth = TRUE
)
# 类似 boxplot
ggplot(data = PlantGrowth, aes(x = group, y = weight)) +
geom_boxplot(notch = FALSE, varwidth = TRUE, fill = "lightgray")
# 默认调色板
ggplot(data = PlantGrowth, aes(x = group, y = weight, fill = group)) +
geom_boxplot(notch = FALSE, varwidth = TRUE)
# Google 调色板
ggplot(data = PlantGrowth, aes(x = group, y = weight, fill = group)) +
geom_boxplot(notch = FALSE, varwidth = TRUE) +
scale_fill_manual(values = c("#4285f4", "#34A853", "#FBBC05", "#EA4335"))




图 11.31: 几种不同的箱线图
11.4 图库
11.4.1 饼图
我对饼图是又爱又恨,爱的是它表示百分比的时候,往往让读者联想到蛋糕,份额这类根深蒂固的情景,从而让数字通俗易懂、深入人心,是一种很好的表达方式,恨的也是这一点,我用柱状图表达不香吗?人眼对角度的区分度远不如柱状图呢,特别是当两个类所占的份额比较接近的时候,所以很多时候,除了用饼图表达份额,还会在旁边标上百分比,从数据可视化的角度来说,如图 11.32 所示,这是信息冗余!
BOD %>% transform(., ratio = demand / sum(demand)) %>%
ggplot(., aes(x = "", y = demand, fill = reorder(Time, demand))) +
geom_bar(stat = "identity", show.legend = FALSE, color = "white") +
coord_polar(theta = "y") +
geom_text(aes(x = 1.6, label = paste0(round(ratio, digits = 4) * 100, "%")),
position = position_stack(vjust = 0.5), color = "black"
) +
geom_text(aes(x = 1.2, label = Time),
position = position_stack(vjust = 0.5), color = "black"
) +
theme_void(base_size = 14)

图 11.32: 饼图
plot_ly(type = "pie", ... )
和添加图层 add_pie()
的效果是一样的
dat = aggregate(carat ~ cut, data = diamonds, FUN = length)
plotly::plot_ly() %>%
plotly::add_pie(
data = dat, labels = ~cut, values = ~carat,
name = "简单饼图1", domain = list(row = 0, column = 0)
) %>%
plotly::add_pie(
data = dat, labels = ~cut, values = ~carat, hole = 0.6,
textposition = "inside", textinfo = "label+percent",
name = "简单饼图2", domain = list(row = 0, column = 1)
) %>%
plotly::layout(
title = "多图布局", showlegend = F,
grid = list(rows = 1, columns = 2),
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
) %>%
plotly::config(displayModeBar = FALSE)
图 11.33: 饼图
设置参数 hole 可以绘制环形饼图,比如 hole = 0.6
11.4.2 地图
USArrests 数据集描述了1973年美国50个州每10万居民中因袭击、抢劫和强奸而逮捕的人,以及城市人口占比。这里的地图是指按照行政区划为边界的示意图,比如图 11.34
library(maps)
crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests)
# 等价于 crimes %>% tidyr::pivot_longer(Murder:Rape)
vars <- lapply(names(crimes)[-1], function(j) {
data.frame(state = crimes$state, variable = j, value = crimes[[j]])
})
crimes_long <- do.call("rbind", vars)
states_map <- map_data("state")
ggplot(crimes, aes(map_id = state)) +
geom_map(aes(fill = Murder), map = states_map) +
expand_limits(x = states_map$long, y = states_map$lat) +
scale_fill_binned(type = "viridis") +
coord_map() +
theme_minimal()

图 11.34: 1975年美国各州犯罪事件
先来看看中国及其周边,见图11.35,这个地图的缺陷就是中国南海及九段线没有标记,台湾和中国大陆不是一种颜色标记,这里的地图数据来自 R 包 maps 和 mapdata,像这样的地图就不宜在国内正式刊物上出现。
library(maps)
library(mapdata)
east_asia <- map_data("worldHires",
region = c(
"Japan", "Taiwan", "China",
"North Korea", "South Korea"
)
)
ggplot(east_asia, aes(x = long, y = lat, group = group, fill = region)) +
geom_polygon(colour = "black") +
scale_fill_brewer(palette = "Set2") +
coord_map() +
theme_minimal()

图 11.35: 中国及其周边
绘制真正的地图需要考虑投影坐标系,观察角度、分辨率、政策法规等一系列因素,它是一种复杂的图形,如图 11.36 所示。
worldmap <- map_data("world")
# 默认 mercator 投影下的默认视角 c(90, 0, mean(range(x)))
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map(
xlim = c(-120, 40), ylim = c(30, 90)
)
# 换观察角度
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map(
xlim = c(-120, 40), ylim = c(30, 90),
orientation = c(90, 0, 0)
)
# 换投影坐标系
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map("ortho",
xlim = c(-120, 40), ylim = c(30, 90)
)
# 二者皆换
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map("ortho",
xlim = c(-120, 40), ylim = c(30, 90),
orientation = c(90, 0, 0)
)




图 11.36: 画地图的正确姿势
Google 地图
library(RgoogleMaps)
# 一组坐标的中心位置
lat <- c(40.702147, 40.718217, 40.711614)
lon <- c(-74.012318, -74.015794, -73.998284)
center <- c(mean(lat), mean(lon))
zoom <- min(MaxZoom(range(lat), range(lon)))
# 矩形对角线的两个顶点
bb <- qbbox(lat, lon)
# 获取地图数据
myMap <- GetMap(center, size = c(640, 640), zoom = zoom, type = "osm")
# 在地图上添加红、蓝、绿三个点
PlotOnStaticMap(myMap,
lat = lat, lon = lon, pch = 20, cex = 10,
col = c("red", "blue", "green")
)

图 11.37: Google 地图示例
11.4.3 热图
Zuguang Gu 开发的 ComplexHeatmap 包实现复杂数据的可视化,用以发现关联数据集之间的模式。特别地,比如基因数据、生存数据等,更多应用见开发者的书籍 ComplexHeatmap 完全手册 。 R 包发布在 Bioconductor 上 https://www.bioconductor.org/packages/ComplexHeatmap。使用之前我要确保已经安装 BiocManager 包,这个包负责管理 Bioconductor 上所有的包,需要先安装它,然后安装 ComplexHeatmap 包 [22]。
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")
11.4.4 散点图
下面以 diamonds 数据集为例展示 ggplot2 的绘图过程,首先加载 diamonds 数据集,查看数据集的内容
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
数值型变量 carat 作为 x 轴
ggplot(diamonds, aes(x = carat))
ggplot(diamonds, aes(x = carat, y = price))
ggplot(diamonds, aes(x = carat, color = cut))
ggplot(diamonds, aes(x = carat), color = "steelblue")




图 11.38: 绘图过程
图 11.38 的基础上添加数据图层
sub_diamonds <- diamonds[sample(1:nrow(diamonds), 1000), ]
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point()

图 11.39: 添加数据图层
给散点图11.39上色
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point(color = "steelblue")

图 11.40: 散点图配色
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point(color = "steelblue") +
scale_y_continuous(
labels = scales::unit_format(unit = "k", scale = 1e-3),
breaks = seq(0, 20000, 4000)
)

图 11.41: 格式化坐标轴刻度标签
让另一变量 cut 作为颜色分类指标
ggplot(sub_diamonds, aes(x = carat, y = price, color = cut)) +
geom_point()

图 11.42: 分类散点图
当然还有一种类似的表示就是分组,默认情况下,ggplot2将所有观测点视为一组,以分类变量 cut 来分组
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point()

图 11.43: 分组
在图11.43 上没有体现出来分组的意思,下面以 cut 分组线性回归为例
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point() +
geom_smooth(method = "lm")

图 11.44: 分组线性回归
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "lm")

图 11.45: 分组线性回归
我们当然可以选择更加合适的拟合方式,如局部多项式平滑 loess
但是该方法不太适用观测值比较多的情况,因为它会占用比较多的内存,建议使用广义可加模型作平滑拟合
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "loess")

图 10.36: 局部多项式平滑
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

图 11.46: 数据分组应用广义可加平滑
ggfortify 包支持更多的统计分析结果的可视化。
为了更好地区分开组别,我们在图11.46的基础上分面或者配色
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
facet_grid(~cut)

图 11.47: 分组分面
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut, color = cut)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

图 11.48: 分组配色
在分类散点图的另一种表示方法就是分面图,以 cut 变量作为分面的依据
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point() +
facet_grid(~cut)

图 11.49: 分面散点图
给图 11.49 上色
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point(color = "steelblue") +
facet_grid(~cut)

图 11.50: 给分面散点图上色
在图11.50的基础上,给不同的类上不同的颜色
ggplot(sub_diamonds, aes(x = carat, y = price, color = cut)) +
geom_point() +
facet_grid(~cut)

图 11.51: 给不同的类上不同的颜色
去掉图例,此时图例属于冗余信息了
ggplot(sub_diamonds, aes(x = carat, y = price, color = cut)) +
geom_point(show.legend = FALSE) +
facet_grid(~cut)

图 11.52: 去掉图例
四块土地,所施肥料不同,肥力大小顺序 4 < 2 < 3 < 1 小麦产量随肥力的变化
data(Wheat2, package = "nlme") # Wheat Yield Trials
library(colorspace)
ggplot(Wheat2, aes(longitude, latitude)) +
geom_point(aes(size = yield, colour = Block)) +
scale_color_discrete_sequential(palette = "Viridis") +
scale_x_continuous(breaks = seq(0, 30, 5)) +
scale_y_continuous(breaks = seq(0, 50, 10))

图 10.37: 多个图例
ggplot(mtcars, aes(x = hp, y = mpg, color = factor(am))) +
geom_point()

图 11.53: 分类散点图
图层、分组、分面和散点图介绍完了,接下来就是其它统计图形,如箱线图,小提琴图和条形图
dat <- as.data.frame(cbind(rep(1948 + seq(12), each = 12), rep(seq(12), 12), AirPassengers))
colnames(dat) <- c("year", "month", "passengers")
ggplot(data = dat, aes(x = as.factor(year), y = as.factor(month))) +
stat_sum(aes(size = passengers), colour = "lightblue") +
scale_size(range = c(1, 10), breaks = seq(100, 650, 50)) +
labs(x = "Year", y = "Month", colour = "Passengers") +
theme_minimal()

图 10.39: 1948年至1960年航班乘客人数变化
11.4.5 条形图
条形图特别适合分类变量的展示,我们这里展示钻石切割质量 cut 不同等级的数量,当然我们可以直接展示各类的数目,在图层 geom_bar
中指定 stat="identity"
## cut
## Fair Good Very Good Premium Ideal
## 1610 4906 12082 13791 21551
cut_df <- as.data.frame(table(diamonds$cut))
ggplot(cut_df, aes(x = Var1, y = Freq)) + geom_bar(stat = "identity")


图 11.54: 频数条形图
还有另外三种表示方法



我们还可以在图 11.54 的基础上再添加一个分类变量钻石的纯净度 clarity,形成堆积条形图

图 11.55: 堆积条形图
再添加一个分类变量钻石颜色 color 比较好的做法是分面
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar() +
facet_grid(~cut)

图 11.56: 分面堆积条形图
实际上,绘制图11.56包含了对分类变量的分组计数过程,如下
## color
## cut D E F G H I J
## Fair 163 224 312 314 303 175 119
## Good 662 933 909 871 702 522 307
## Very Good 1513 2400 2164 2299 1824 1204 678
## Premium 1603 2337 2331 2924 2360 1428 808
## Ideal 2834 3903 3826 4884 3115 2093 896
还有一种堆积的方法是按比例,而不是按数量,如图11.57
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar(position = "fill") +
facet_grid(~cut)

图 11.57: 比例堆积条形图
接下来就是复合条形图

图 11.58: 复合条形图
再添加一个分类变量,就是需要分面大法了,图 11.58 展示了三个分类变量,其实我们还可以再添加一个分类变量用作分面的列依据
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar(position = "dodge") +
facet_grid(rows = vars(cut))

图 11.59: 分面复合条形图
图 11.59 展示的数据如下
## , , cut = Fair
##
## clarity
## color I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF
## D 4 56 58 25 5 9 3 3
## E 9 78 65 42 14 13 3 0
## F 35 89 83 53 33 10 5 4
## G 53 80 69 45 45 17 3 2
## H 52 91 75 41 32 11 1 0
## I 34 45 30 32 25 8 1 0
## J 23 27 28 23 16 1 1 0
##
## , , cut = Good
##
## clarity
## color I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF
## D 8 223 237 104 43 25 13 9
## E 23 202 355 160 89 52 43 9
## F 19 201 273 184 132 50 35 15
## G 19 163 207 192 152 75 41 22
## H 14 158 235 138 77 45 31 4
## I 9 81 165 110 103 26 22 6
## J 4 53 88 90 52 13 1 6
##
## , , cut = Very Good
##
## clarity
## color I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF
## D 5 314 494 309 175 141 52 23
## E 22 445 626 503 293 298 170 43
## F 13 343 559 466 293 249 174 67
## G 16 327 474 479 432 302 190 79
## H 12 343 547 376 257 145 115 29
## I 8 200 358 274 205 71 69 19
## J 8 128 182 184 120 29 19 8
##
## , , cut = Premium
##
## clarity
## color I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF
## D 12 421 556 339 131 94 40 10
## E 30 519 614 629 292 121 105 27
## F 34 523 608 619 290 146 80 31
## G 46 492 566 721 566 275 171 87
## H 46 521 655 532 336 118 112 40
## I 24 312 367 315 221 82 84 23
## J 13 161 209 202 153 34 24 12
##
## , , cut = Ideal
##
## clarity
## color I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF
## D 13 356 738 920 351 284 144 28
## E 18 469 766 1136 593 507 335 79
## F 42 453 608 879 616 520 440 268
## G 16 486 660 910 953 774 594 491
## H 38 450 763 556 467 289 326 226
## I 17 274 504 438 408 178 179 95
## J 2 110 243 232 201 54 29 25
# 漫谈条形图 https://cosx.org/2017/10/discussion-about-bar-graph
set.seed(2020)
dat <- data.frame(
age = rep(1:30, 2),
gender = rep(c("man", "woman"), each = 30),
num = sample(x = 1:100, size = 60, replace = T)
)
# 重叠
p1 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "identity", alpha = 0.5)
# 堆积
p2 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "stack")
# 双柱
p3 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "dodge")
# 百分比
p4 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "%")
(p1 + p2) / (p3 + p4)

图 11.60: 条形图的四种常见形态
以数据集 diamonds 为例,按照纯净度 clarity 和切工 cut 分组统计钻石的数量,再按切工分组统计不同纯净度的钻石数量占比,如表 11.2 所示
library(data.table)
diamonds <- as.data.table(diamonds)
dat <- diamonds[, .(cnt = .N), by = .(cut, clarity)] %>%
.[, pct := cnt / sum(cnt), by = .(cut)] %>%
.[, pct_pp := paste0(cnt, " (", scales::percent(pct, accuracy = 0.01), ")") ]
# 分组计数 with(diamonds, table(clarity, cut))
dcast(dat, formula = clarity ~ cut, value.var = "pct_pp") %>%
knitr::kable(align = "crrrrr", caption = "数值和比例组合呈现")
clarity | Fair | Good | Very Good | Premium | Ideal |
---|---|---|---|---|---|
I1 | 210 (13.04%) | 96 (1.96%) | 84 (0.70%) | 205 (1.49%) | 146 (0.68%) |
SI2 | 466 (28.94%) | 1081 (22.03%) | 2100 (17.38%) | 2949 (21.38%) | 2598 (12.06%) |
SI1 | 408 (25.34%) | 1560 (31.80%) | 3240 (26.82%) | 3575 (25.92%) | 4282 (19.87%) |
VS2 | 261 (16.21%) | 978 (19.93%) | 2591 (21.45%) | 3357 (24.34%) | 5071 (23.53%) |
VS1 | 170 (10.56%) | 648 (13.21%) | 1775 (14.69%) | 1989 (14.42%) | 3589 (16.65%) |
VVS2 | 69 (4.29%) | 286 (5.83%) | 1235 (10.22%) | 870 (6.31%) | 2606 (12.09%) |
VVS1 | 17 (1.06%) | 186 (3.79%) | 789 (6.53%) | 616 (4.47%) | 2047 (9.50%) |
IF | 9 (0.56%) | 71 (1.45%) | 268 (2.22%) | 230 (1.67%) | 1212 (5.62%) |
分别以堆积条形图和百分比堆积条形图展示,添加注释到条形图上,见 11.61
p1 = ggplot(data = dat, aes(x = cut, y = cnt, fill = clarity)) +
geom_col(position = "dodge") +
geom_text(aes(label = cnt), position = position_dodge(1), vjust = -0.5) +
geom_text(aes(label = scales::percent(pct, accuracy = 0.1)),
position = position_dodge(1), vjust = 1, hjust = 0.5
) +
scale_fill_brewer(palette = "Spectral") +
labs(fill = "clarity", y = "", x = "cut") +
theme_minimal() +
theme(legend.position = "top")
p2 = ggplot(data = dat, aes(y = cut, x = cnt, fill = clarity)) +
geom_col(position = "fill") +
geom_text(aes(label = cnt), position = position_fill(1), vjust = -0.5) +
geom_text(aes(label = scales::percent(pct, accuracy = 0.1)),
position = position_fill(1), vjust = 1, hjust = 0.5
) +
scale_fill_brewer(palette = "Spectral") +
scale_x_continuous(labels = scales::percent) +
labs(fill = "clarity", y = "", x = "cut") +
theme_minimal() +
theme(legend.position = "top")
p1 / p2

图 11.61: 添加注释到条形图
借助 plotly 制作相应的动态百分比堆积条形图
ggplot(data = diamonds, aes(x = cut, fill = clarity)) +
geom_bar(position = "dodge2") +
scale_fill_brewer(palette = "Spectral")

图 11.62: 百分比堆积条形图
# 百分比堆积条形图
plotly::plot_ly(dat,
x = ~cut, color = ~clarity, y = ~pct,
colors = "Spectral", type = "bar",
text = ~ paste0(
cnt, "颗 <br>",
"占比:", scales::percent(pct, accuracy = 0.1), "<br>"
),
hoverinfo = "text"
) %>%
plotly::layout(
barmode = "stack",
yaxis = list(tickformat = ".0%")
) %>%
plotly::config(displayModeBar = FALSE)
图 11.62: 百分比堆积条形图
# `type = "histogram"` 以 cut 和 clarity 分组计数
plotly::plot_ly(diamonds,
x = ~cut, color = ~clarity,
colors = "Spectral", type = "histogram"
) %>%
plotly::config(displayModeBar = FALSE)
图 11.62: 百分比堆积条形图
# 堆积图
plotly::plot_ly(diamonds,
x = ~cut, color = ~clarity,
colors = "Spectral", type = "histogram"
) %>%
plotly::layout(
barmode = "stack",
yaxis = list(title = "cnt"),
legend = list(title = list(text = "clarity"))
) %>%
plotly::config(displayModeBar = FALSE)