第21章 描述性统计表
21.1 概述
在进行数据处理时,我们会将数据描述制成表格,即描述性统计表。我们可以用gtsummary
包完成这些表格的制作。下面我们将使用gtsummary
包中的trial
数据集来了解gtsummary的功能。
21.2 简易表格
我们先来看一下如何用gtsummary
快速制作一张统计描述表:先用head(trial)查看数据集的前6行,再用select()选取trt、age、marker、stage、ttdeath五个变量并直接调用tbl_summary(),即可得到下面的默认表格。
## # A tibble: 6 × 8
## trt age marker stage grade response death ttdeath
## <chr> <dbl> <dbl> <fct> <fct> <int> <int> <dbl>
## 1 Drug A 23 0.16 T1 II 0 0 24
## 2 Drug B 9 1.11 T2 I 1 0 24
## 3 Drug A 31 0.277 T1 II 0 0 24
## 4 Drug A NA 2.07 T3 III 1 1 17.6
## 5 Drug A 51 2.77 T4 III 1 1 16.4
## 6 Drug B 39 0.613 T4 I 0 1 15.6
Characteristic | N = 200¹ |
---|---|
Chemotherapy Treatment | |
Drug A | 98 (49%) |
Drug B | 102 (51%) |
Age | 47 (38, 57) |
Unknown | 11 |
Marker Level (ng/mL) | 0.64 (0.22, 1.39) |
Unknown | 10 |
T Stage | |
T1 | 53 (27%) |
T2 | 54 (27%) |
T3 | 43 (22%) |
T4 | 50 (25%) |
Months to Death/Censor | 22.4 (16.0, 24.0) |
¹ n (%); Median (IQR) |
通过上表我们可以发现,连续变量默认描述为中位数(四分位数),缺失值显示为“Unknown”,分类变量描述为例数(百分比)。
注意:
如果连续变量的不同取值少于10个,表格会自动将该变量作为分类变量进行描述。如果要保持连续变量的描述方式,需要在tbl_summary()
函数中声明参数type = list(变量名称 ~ "continuous")
。
我们也可以对不同变量个性化地设置描述方式。
# Summarise the selected trial columns with per-variable display formats
# and rounding.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    # Display templates; the age/marker entry follows the general
    # continuous rule so that it takes precedence for those two variables.
    statistic = list(
      all_continuous() ~ "{mean} ({sd})", # all continuous variables
      c("age", "marker") ~ "{median} [{p25}, {p75}]", # age and marker only
      all_categorical() ~ "{n} / {N} ({p}%)" # all categorical variables
    ),
    # Number of decimal places.
    digits = list(
      all_continuous() ~ 2, # all continuous variables
      "age" ~ 0 # age only
    )
  )
Characteristic | N = 200¹ |
---|---|
Chemotherapy Treatment | |
Drug A | 98 / 200 (49%) |
Drug B | 102 / 200 (51%) |
Age | 47 [38, 57] |
Unknown | 11 |
Marker Level (ng/mL) | 0.64 [0.22, 1.39] |
Unknown | 10 |
T Stage | |
T1 | 53 / 200 (27%) |
T2 | 54 / 200 (27%) |
T3 | 43 / 200 (22%) |
T4 | 50 / 200 (25%) |
Months to Death/Censor | 19.62 (5.28) |
¹ n / N (%); Median [IQR]; Mean (SD) |
21.3 添加组间比较
如果我们想按照不同trt
分组描述,可以声明tbl_summary(by = trt)
参数。
# Same summary as before, now split by treatment arm (trt) and extended
# with between-group tests, an overall column and an N column.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt, # one statistics column per treatment group
    # Display templates; the age/marker entry follows the general
    # continuous rule so that it takes precedence for those two variables.
    statistic = list(
      all_continuous() ~ "{mean} ({sd})", # all continuous variables
      c("age", "marker") ~ "{median} [{p25}, {p75}]", # age and marker only
      all_categorical() ~ "{n} / {N} ({p}%)" # all categorical variables
    ),
    # Number of decimal places.
    digits = list(
      all_continuous() ~ 2, # all continuous variables
      "age" ~ 0 # age only
    )
  ) %>%
  # Between-group comparison: marker and ttdeath are tested with a t-test;
  # p-values are formatted through style_pvalue(digits = 2).
  add_p(
    test = list(c("marker", "ttdeath") ~ "t.test"),
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  ) %>%
  add_overall() %>% # add the overall (all patients) column
  add_n() # add the sample-size (N) column
Characteristic | N | Overall, N = 200¹ | Drug A, N = 98¹ | Drug B, N = 102¹ | p-value² |
---|---|---|---|---|---|
Age | 189 | 47 [38, 57] | 46 [37, 59] | 48 [39, 56] | 0.72 |
Unknown | | 11 | 7 | 4 | |
Marker Level (ng/mL) | 190 | 0.64 [0.22, 1.39] | 0.84 [0.24, 1.57] | 0.52 [0.19, 1.20] | 0.12 |
Unknown | | 10 | 6 | 4 | |
T Stage | 200 | | | | 0.87 |
T1 | | 53 / 200 (27%) | 28 / 98 (29%) | 25 / 102 (25%) | |
T2 | | 54 / 200 (27%) | 25 / 98 (26%) | 29 / 102 (28%) | |
T3 | | 43 / 200 (22%) | 22 / 98 (22%) | 21 / 102 (21%) | |
T4 | | 50 / 200 (25%) | 23 / 98 (23%) | 27 / 102 (26%) | |
Months to Death/Censor | 200 | 19.62 (5.28) | 20.23 (4.99) | 19.04 (5.50) | 0.11 |
¹ Median [IQR]; n / N (%); Mean (SD) | |||||
² Wilcoxon rank sum test; Welch Two Sample t-test; Pearson’s Chi-squared test |
21.4 格式调整
# Build the grouped summary table and polish its presentation: custom
# missing-value label, renamed variable label, headers, footnotes, caption
# and bold variable labels.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt, # one statistics column per treatment group
    # Display templates; the age/marker entry follows the general
    # continuous rule so that it takes precedence for those two variables.
    statistic = list(
      all_continuous() ~ "{mean} ({sd})", # all continuous variables
      c("age", "marker") ~ "{median} [{p25}, {p75}]", # age and marker only
      all_categorical() ~ "{n} / {N} ({p}%)" # all categorical variables
    ),
    # Number of decimal places.
    digits = list(
      all_continuous() ~ 2, # all continuous variables
      "age" ~ 0 # age only
    ),
    # missing = "no", # uncomment to hide the missing-value rows
    missing_text = "Missing", # label the missing-value rows "Missing"
    # No trailing comma after the last argument: it would pass an empty
    # argument to tbl_summary().
    label = stage ~ "Tumor Stage" # rename the variable label
  ) %>%
  # Between-group comparison: marker and ttdeath are tested with a t-test;
  # p-values are formatted through style_pvalue(digits = 2).
  add_p(
    test = list(c("marker", "ttdeath") ~ "t.test"),
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  ) %>%
  add_overall() %>% # add the overall (all patients) column
  add_n() %>% # add the sample-size (N) column
  modify_header(
    label ~ "**Variable**" # title of the variable-name column
  ) %>%
  # Spanning header above the two group columns; stat_1 and stat_2 are the
  # internal column names and must be used as-is.
  modify_spanning_header(
    c("stat_1", "stat_2") ~ "**Treatment Group**"
  ) %>%
  # Replace the footnote attached to the statistics columns.
  modify_footnote(
    all_stat_cols() ~ "Median [Q1,Q3], Mean (SD) or Frequency (%)"
  ) %>%
  separate_p_footnotes() %>% # one footnote per type of test
  modify_caption("**Table 1. Patient Characteristics**") %>% # table caption
  bold_labels() # print variable labels in bold
Variable | N | Overall, N = 200¹ | Treatment Group | | p-value |
---|---|---|---|---|---|
| | | Drug A, N = 98¹ | Drug B, N = 102¹ | |
Age | 189 | 47 [38, 57] | 46 [37, 59] | 48 [39, 56] | 0.72² |
Missing | | 11 | 7 | 4 | |
Marker Level (ng/mL) | 190 | 0.64 [0.22, 1.39] | 0.84 [0.24, 1.57] | 0.52 [0.19, 1.20] | 0.12³ |
Missing | | 10 | 6 | 4 | |
Tumor Stage | 200 | | | | 0.87⁴ |
T1 | | 53 / 200 (27%) | 28 / 98 (29%) | 25 / 102 (25%) | |
T2 | | 54 / 200 (27%) | 25 / 98 (26%) | 29 / 102 (28%) | |
T3 | | 43 / 200 (22%) | 22 / 98 (22%) | 21 / 102 (21%) | |
T4 | | 50 / 200 (25%) | 23 / 98 (23%) | 27 / 102 (26%) | |
Months to Death/Censor | 200 | 19.62 (5.28) | 20.23 (4.99) | 19.04 (5.50) | 0.11³ |
¹ Median [Q1,Q3], Mean (SD) or Frequency (%) | |||||
² Wilcoxon rank sum test | |||||
³ Welch Two Sample t-test | |||||
⁴ Pearson’s Chi-squared test |
gtsummary
还预设了一些期刊风格,我们可以直接调用。
# Switch gtsummary to the preset JAMA journal theme.
theme_gtsummary_journal(journal = "jama")
# Themes can be layered: add the compact theme on top of the journal theme.
theme_gtsummary_compact()
# Build the grouped table with the themes set above.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt,
    missing_text = "Missing"
  )
Characteristic | Drug A, N = 98 | Drug B, N = 102 |
---|---|---|
Age, Median (IQR) | 46 (37 – 59) | 48 (39 – 56) |
Missing | 7 | 4 |
Marker Level (ng/mL), Median (IQR) | 0.84 (0.24 – 1.57) | 0.52 (0.19 – 1.20) |
Missing | 6 | 4 |
T Stage, n (%) | ||
T1 | 28 (29) | 25 (25) |
T2 | 25 (26) | 29 (28) |
T3 | 22 (22) | 21 (21) |
T4 | 23 (23) | 27 (26) |
Months to Death/Censor, Median (IQR) | 23.5 (17.4 – 24.0) | 21.2 (14.6 – 24.0) |
21.5 表格导出
gtsummary
表格可以导出为多种形式,此时需要将表格用as_gt()
函数转为gt格式,再使用gtsave()
函数导出。但是这种方法导出为word时容易乱码,此时可以先用as_flex_table()
函数将表格转为flextable格式,再使用flextable
包的save_as_docx()
函数导出。
# Build the fully formatted grouped summary table and keep it in
# `summary_table` so that it can be exported afterwards.
summary_table <- trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt, # one statistics column per treatment group
    # Display templates; the age/marker entry follows the general
    # continuous rule so that it takes precedence for those two variables.
    statistic = list(
      all_continuous() ~ "{mean} ({sd})", # all continuous variables
      c("age", "marker") ~ "{median} [{p25}, {p75}]", # age and marker only
      all_categorical() ~ "{n} / {N} ({p}%)" # all categorical variables
    ),
    # Number of decimal places.
    digits = list(
      all_continuous() ~ 2, # all continuous variables
      "age" ~ 0 # age only
    ),
    # missing = "no", # uncomment to hide the missing-value rows
    missing_text = "Missing", # label the missing-value rows "Missing"
    # No trailing comma after the last argument: it would pass an empty
    # argument to tbl_summary().
    label = stage ~ "Tumor Stage" # rename the variable label
  ) %>%
  # Between-group comparison: marker and ttdeath are tested with a t-test;
  # p-values are formatted through style_pvalue(digits = 2).
  add_p(
    test = list(c("marker", "ttdeath") ~ "t.test"),
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  ) %>%
  add_overall() %>% # add the overall (all patients) column
  add_n() %>% # add the sample-size (N) column
  modify_header(
    label ~ "**Variable**" # title of the variable-name column
  ) %>%
  # Spanning header above the two group columns; stat_1 and stat_2 are the
  # internal column names and must be used as-is.
  modify_spanning_header(
    c("stat_1", "stat_2") ~ "**Treatment Group**"
  ) %>%
  # Replace the footnote attached to the statistics columns.
  modify_footnote(
    all_stat_cols() ~ "Median [Q1,Q3], Mean (SD) or Frequency (%)"
  ) %>%
  separate_p_footnotes() %>% # one footnote per type of test
  modify_caption("**Table 1. Patient Characteristics**") %>% # table caption
  bold_labels() # print variable labels in bold
# # 导出为PDF
# summary_table %>%
# as_gt() %>% # 需要先转换为gt格式再进行存储
# gt::gtsave(filename = "summary.pdf")
# # 导出为word
# summary_table %>%
#   as_flex_table() %>% # 需要先转换为flextable格式再进行存储
#   flextable::save_as_docx(path = "summary.docx")