第21章 描述性统计表
21.1 概述
在进行数据处理时,我们会将数据描述制成表格,即描述性统计表。我们可以用gtsummary包完成这些表格的制作。下面我们将使用gtsummary包中的trial数据集来了解gtsummary的功能。
21.2 简易表格
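我们先来看一下如何用gtsummary快速制作一张统计描述表。先加载R包并查看trial数据集的前几行,再选取部分变量直接交给tbl_summary()函数:
library(dplyr)
library(gtsummary)
head(trial)
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary()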
## # A tibble: 6 × 8
## trt age marker stage grade response death ttdeath
## <chr> <dbl> <dbl> <fct> <fct> <int> <int> <dbl>
## 1 Drug A 23 0.16 T1 II 0 0 24
## 2 Drug B 9 1.11 T2 I 1 0 24
## 3 Drug A 31 0.277 T1 II 0 0 24
## 4 Drug A NA 2.07 T3 III 1 1 17.6
## 5 Drug A 51 2.77 T4 III 1 1 16.4
## 6 Drug B 39 0.613 T4 I 0 1 15.6
| Characteristic | N = 200¹ |
|---|---|
| Chemotherapy Treatment | |
| Drug A | 98 (49%) |
| Drug B | 102 (51%) |
| Age | 47 (38, 57) |
| Unknown | 11 |
| Marker Level (ng/mL) | 0.64 (0.22, 1.41) |
| Unknown | 10 |
| T Stage | |
| T1 | 53 (27%) |
| T2 | 54 (27%) |
| T3 | 43 (22%) |
| T4 | 50 (25%) |
| Months to Death/Censor | 22.4 (15.9, 24.0) |
| ¹ n (%); Median (Q1, Q3) | |
通过上表我们可以发现,连续变量默认描述为中位数(四分位数),缺失值显示为“Unknown”,分类变量描述为例数(百分比)。
注意:
如果连续变量的不同取值少于10个,表格会自动将该变量按分类变量进行描述。如果要保持连续变量的描述方式,需要在tbl_summary()函数中声明参数type = list(变量名称 ~ "continuous")。
我们也可以对不同变量个性化地设置描述方式。
# Summarise the selected columns, overriding the default statistics and
# rounding for individual variables.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    statistic = list(
      # Default for every continuous variable: mean (SD).
      all_continuous() ~ "{mean} ({sd})",
      # age and marker are reported as median [Q1, Q3] instead.
      c("age", "marker") ~ "{median} [{p25}, {p75}]",
      # Every categorical variable: n / N (percent).
      all_categorical() ~ "{n} / {N} ({p}%)"
    ),
    digits = list(
      # Continuous variables are shown with two decimal places ...
      all_continuous() ~ 2,
      # ... except age, which is shown without decimals.
      "age" ~ 0
    )
  )

| Characteristic | N = 200¹ |
|---|---|
| Chemotherapy Treatment | |
| Drug A | 98 / 200 (49%) |
| Drug B | 102 / 200 (51%) |
| Age | 47 [38, 57] |
| Unknown | 11 |
| Marker Level (ng/mL) | 0.64 [0.22, 1.41] |
| Unknown | 10 |
| T Stage | |
| T1 | 53 / 200 (27%) |
| T2 | 54 / 200 (27%) |
| T3 | 43 / 200 (22%) |
| T4 | 50 / 200 (25%) |
| Months to Death/Censor | 19.62 (5.28) |
| ¹ n / N (%); Median [Q1, Q3]; Mean (SD) | |
21.3 添加组间比较
如果我们想按照不同trt分组描述,可以声明tbl_summary(by = trt)参数。
# Summarise by treatment arm, then add group comparisons, an overall column
# and an N column.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt,
    statistic = list(
      # Default for every continuous variable: mean (SD).
      all_continuous() ~ "{mean} ({sd})",
      # age and marker are reported as median [Q1, Q3] instead.
      c("age", "marker") ~ "{median} [{p25}, {p75}]",
      # Every categorical variable: n / N (percent).
      all_categorical() ~ "{n} / {N} ({p}%)"
    ),
    digits = list(
      # Continuous variables are shown with two decimal places ...
      all_continuous() ~ 2,
      # ... except age, which is shown without decimals.
      "age" ~ 0
    )
  ) %>%
  # Between-group comparison; marker and ttdeath use a t-test, the other
  # variables keep the default tests.
  add_p(
    test = list(c("marker", "ttdeath") ~ "t.test"),
    # Number of digits shown for the p-values.
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  ) %>%
  # Add the overall (all patients) column.
  add_overall() %>%
  # Add the N column.
  add_n()

| Characteristic | N | Overall N = 200¹ | Drug A N = 98¹ | Drug B N = 102¹ | p-value² |
|---|---|---|---|---|---|
| Age | 189 | 47 [38, 57] | 46 [37, 60] | 48 [39, 56] | 0.72 |
| Unknown | | 11 | 7 | 4 | |
| Marker Level (ng/mL) | 190 | 0.64 [0.22, 1.41] | 0.84 [0.23, 1.60] | 0.52 [0.18, 1.21] | 0.12 |
| Unknown | | 10 | 6 | 4 | |
| T Stage | 200 | | | | 0.87 |
| T1 | | 53 / 200 (27%) | 28 / 98 (29%) | 25 / 102 (25%) | |
| T2 | | 54 / 200 (27%) | 25 / 98 (26%) | 29 / 102 (28%) | |
| T3 | | 43 / 200 (22%) | 22 / 98 (22%) | 21 / 102 (21%) | |
| T4 | | 50 / 200 (25%) | 23 / 98 (23%) | 27 / 102 (26%) | |
| Months to Death/Censor | 200 | 19.62 (5.28) | 20.23 (4.99) | 19.04 (5.50) | 0.11 |
| ¹ Median [Q1, Q3]; n / N (%); Mean (SD) | | | | | |
| ² Wilcoxon rank sum test; Welch Two Sample t-test; Pearson’s Chi-squared test | | | | | |
21.4 格式调整
# Same grouped summary as before, with labels, headers, footnotes and caption
# customised.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt,
    statistic = list(
      # Default for every continuous variable: mean (SD).
      all_continuous() ~ "{mean} ({sd})",
      # age and marker are reported as median [Q1, Q3] instead.
      c("age", "marker") ~ "{median} [{p25}, {p75}]",
      # Every categorical variable: n / N (percent).
      all_categorical() ~ "{n} / {N} ({p}%)"
    ),
    digits = list(
      # Continuous variables are shown with two decimal places ...
      all_continuous() ~ 2,
      # ... except age, which is shown without decimals.
      "age" ~ 0
    ),
    # missing = "no", # uncomment to hide the missing-value rows
    # Label the missing-value rows "Missing".
    missing_text = "Missing",
    # Relabel the stage variable.
    label = stage ~ "Tumor Stage"
  ) %>%
  # Between-group comparison; marker and ttdeath use a t-test, the other
  # variables keep the default tests.
  add_p(
    test = list(c("marker", "ttdeath") ~ "t.test"),
    # Number of digits shown for the p-values.
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  ) %>%
  # Add the overall (all patients) column.
  add_overall() %>%
  # Add the N column.
  add_n() %>%
  # Rename the header of the variable-label column.
  modify_header(
    label ~ "**Variable**"
  ) %>%
  # Spanning header over the two group columns; stat_1 and stat_2 are the
  # column names to use here and cannot be changed.
  modify_spanning_header(
    c("stat_1", "stat_2") ~ "**Treatment Group**"
  ) %>%
  # Replace the footnote attached to the statistic columns.
  modify_footnote(
    all_stat_cols() ~ "Median [Q1,Q3], Mean (SD) or Frequency (%)"
  ) %>%
  # Give each kind of test its own footnote.
  separate_p_footnotes() %>%
  # Table caption.
  modify_caption("**Table 1. Patient Characteristics**") %>%
  # Print the variable labels in bold.
  bold_labels()

| Variable | N | Overall N = 200¹ | Treatment Group | | p-value |
|---|---|---|---|---|---|
| | | | Drug A N = 98¹ | Drug B N = 102¹ | |
| Age | 189 | 47 [38, 57] | 46 [37, 60] | 48 [39, 56] | 0.72² |
| Missing | | 11 | 7 | 4 | |
| Marker Level (ng/mL) | 190 | 0.64 [0.22, 1.41] | 0.84 [0.23, 1.60] | 0.52 [0.18, 1.21] | 0.12³ |
| Missing | | 10 | 6 | 4 | |
| Tumor Stage | 200 | | | | 0.87⁴ |
| T1 | | 53 / 200 (27%) | 28 / 98 (29%) | 25 / 102 (25%) | |
| T2 | | 54 / 200 (27%) | 25 / 98 (26%) | 29 / 102 (28%) | |
| T3 | | 43 / 200 (22%) | 22 / 98 (22%) | 21 / 102 (21%) | |
| T4 | | 50 / 200 (25%) | 23 / 98 (23%) | 27 / 102 (26%) | |
| Months to Death/Censor | 200 | 19.62 (5.28) | 20.23 (4.99) | 19.04 (5.50) | 0.11³ |
| ¹ Median [Q1,Q3], Mean (SD) or Frequency (%) | | | | | |
| ² Wilcoxon rank sum test | | | | | |
| ³ Welch Two Sample t-test | | | | | |
| ⁴ Pearson’s Chi-squared test | | | | | |
gtsummary还预设了一些期刊风格,我们可以直接调用。
# Switch gtsummary to the preset JAMA journal theme. Themes are set for the
# whole session and stay active for later tables until
# reset_gtsummary_theme() is called.
theme_gtsummary_journal("jama")
# Themes can be stacked: add the compact theme on top of the JAMA theme.
theme_gtsummary_compact()
# Build the table under the active themes.
trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(by = trt, missing_text = "Missing")

| Characteristic | Drug A N = 98 | Drug B N = 102 |
|---|---|---|
| Age, Median (IQR) | 46 (37 – 60) | 48 (39 – 56) |
| Missing | 7 | 4 |
| Marker Level (ng/mL), Median (IQR) | 0.84 (0.23 – 1.60) | 0.52 (0.18 – 1.21) |
| Missing | 6 | 4 |
| T Stage, n (%) | ||
| T1 | 28 (29) | 25 (25) |
| T2 | 25 (26) | 29 (28) |
| T3 | 22 (22) | 21 (21) |
| T4 | 23 (23) | 27 (26) |
| Months to Death/Censor, Median (IQR) | 23.5 (17.4 – 24.0) | 21.2 (14.5 – 24.0) |
21.5 表格导出
gtsummary表格可以导出为多种形式,此时需要将表格用as_gt()函数转为gt格式,再使用gtsave()函数导出。但是这种方法导出为word时容易乱码,此时可以使用flextable包的save_as_docx()函数。
# Build the formatted comparison table and keep it in `summary_table` so it
# can be exported afterwards.
summary_table <- trial %>%
  select(trt, age, marker, stage, ttdeath) %>%
  tbl_summary(
    by = trt,
    statistic = list(
      # Default for every continuous variable: mean (SD).
      all_continuous() ~ "{mean} ({sd})",
      # age and marker are reported as median [Q1, Q3] instead.
      c("age", "marker") ~ "{median} [{p25}, {p75}]",
      # Every categorical variable: n / N (percent).
      all_categorical() ~ "{n} / {N} ({p}%)"
    ),
    digits = list(
      # Continuous variables are shown with two decimal places ...
      all_continuous() ~ 2,
      # ... except age, which is shown without decimals.
      "age" ~ 0
    ),
    # missing = "no", # uncomment to hide the missing-value rows
    # Label the missing-value rows "Missing".
    missing_text = "Missing",
    # Relabel the stage variable.
    label = stage ~ "Tumor Stage"
  ) %>%
  # Between-group comparison; marker and ttdeath use a t-test, the other
  # variables keep the default tests.
  add_p(
    test = list(c("marker", "ttdeath") ~ "t.test"),
    # Number of digits shown for the p-values.
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  ) %>%
  # Add the overall (all patients) column.
  add_overall() %>%
  # Add the N column.
  add_n() %>%
  # Rename the header of the variable-label column.
  modify_header(
    label ~ "**Variable**"
  ) %>%
  # Spanning header over the two group columns; stat_1 and stat_2 are the
  # column names to use here and cannot be changed.
  modify_spanning_header(
    c("stat_1", "stat_2") ~ "**Treatment Group**"
  ) %>%
  # Replace the footnote attached to the statistic columns.
  modify_footnote(
    all_stat_cols() ~ "Median [Q1,Q3], Mean (SD) or Frequency (%)"
  ) %>%
  # Give each kind of test its own footnote.
  separate_p_footnotes() %>%
  # Table caption.
  modify_caption("**Table 1. Patient Characteristics**") %>%
  # Print the variable labels in bold.
  bold_labels()
# # Export to PDF
# summary_table %>%
#   as_gt() %>% # convert to a gt table first, then save it
#   gt::gtsave(filename = "summary.pdf")
# # Export to Word
# # The piped flextable is the table to save, so only `path` is passed;
# # `summary_table` itself is a gtsummary object, not a flextable.
# summary_table %>%
#   as_flex_table() %>%
#   flextable::save_as_docx(path = "summary.docx")