第 52 章 双样本t检验

install.packages(c("bayesplot", "palmerpenguins", "rstatix", "broom", "ggstatsplot", "infer", "ggthemes"))

52.1 实验设计

• 组内比较， 同一组人，每个人要完成多次测量（重复测量），比如服药第一天的情况，服药第二天的情况，服药第三天的情况…，每组的人数是恒定的。

• 组间比较A组的被试吃1mg，B组被试吃2mg, C组吃3mg…，每组的人数不要求是恒定的。

52.2 提问

library(tidyverse)
theme_set(bayesplot::theme_default())

penguins <- palmerpenguins::penguins %>%
drop_na()

• 企鹅有男女两种性别(female, male)，不同性别的bill_length_mm的均值是否相同？

• 企鹅种类有三种(Adelie, Chinstrap, Gentoo)，比较在每个种类下男企鹅和女企鹅bill_length_mm的均值？

• 两两比较不同种类的bill_length_mm的均值？

52.2.1 不同性别的嘴峰长度的均值是否相同

penguins %>%
ggplot(aes(x = sex, y = bill_length_mm)) +
geom_boxplot() +
geom_jitter() +
theme(legend.position = "none")

penguins %>%
group_by(sex) %>%
summarize(avg_rating = mean(bill_length_mm, na.rm = TRUE)) %>%
mutate(diff_means = avg_rating - lag(avg_rating))
## # A tibble: 2 × 3
##   sex    avg_rating diff_means
##   <fct>       <dbl>      <dbl>
## 1 female       42.1      NA
## 2 male         45.9       3.76

52.2.1.1 using t.test()

t.test(
bill_length_mm ~ sex,
data = penguins,
var.equal = TRUE    # var.equal =  假定两个样本方差是否相等
)
##
##  Two Sample t-test
##
## data:  bill_length_mm by sex
## t = -6.667, df = 331, p-value = 1.094e-10
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -4.866557 -2.649027
## sample estimates:
## mean in group female   mean in group male
##             42.09697             45.85476
t.test(
bill_length_mm ~ sex,
data = penguins,
var.equal = TRUE
) %>%
broom::tidy()
## # A tibble: 1 × 10
##   estimate estimate1 estimate2 statistic  p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl>
## 1    -3.76      42.1      45.9     -6.67 1.09e-10       331    -4.87     -2.65
## # ℹ 2 more variables: method <chr>, alternative <chr>

52.2.1.2 using rstatix::t_test()

rstatix宏包提供了类似dplyr风格的语法

library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
##     filter
penguins %>%
rstatix::t_test(bill_length_mm ~ sex, var.equal = TRUE)
## # A tibble: 1 × 8
##   .y.            group1 group2    n1    n2 statistic    df        p
## * <chr>          <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>
## 1 bill_length_mm female male     165   168     -6.67   331 1.09e-10

52.2.1.3 using ggstatsplot::ggbetweenstats()

library(ggstatsplot)
## Warning: package 'ggstatsplot' was built under R version 4.2.3
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
penguins %>%
ggbetweenstats(
x = sex,
y = bill_length_mm,
pairwise.comparisons = TRUE,
pairwise.display = "all",
var.equal = TRUE
)

52.2.1.4 using infer: 基于模拟的检验

• 实际观察的差别
library(infer)
## Warning: package 'infer' was built under R version 4.2.2
##
## Attaching package: 'infer'
## The following objects are masked from 'package:rstatix':
##
##     chisq_test, prop_test, t_test
obs_diff <- penguins %>%
specify(formula = bill_length_mm ~ sex) %>%
calculate(
stat = "diff in means",
order = c("male", "female")
)
obs_diff
## Response: bill_length_mm (numeric)
## Explanatory: sex (factor)
## # A tibble: 1 × 1
##    stat
##   <dbl>
## 1  3.76
• 模拟
null_dist <- penguins %>%
specify(formula = bill_length_mm ~ sex) %>%
hypothesize(null = "independence") %>%
generate(reps = 5000, type = "permute") %>%
calculate(
stat = "diff in means",
order = c("male", "female")
)
head(null_dist)
## Response: bill_length_mm (numeric)
## Explanatory: sex (factor)
## Null Hypothesis: independence
## # A tibble: 6 × 2
##   replicate   stat
##       <int>  <dbl>
## 1         1  0.946
## 2         2 -0.321
## 3         3 -1.09
## 4         4  0.126
## 5         5  0.341
## 6         6 -1.32
1. specify() 指定解释变量和被解释变量 (y ~ x)

2. hypothesize() 指定零假设 (比如, independence= yx 彼此独立)

3. generate() 从基于零假设的平行世界中抽样:

• reps，指定抽样次数
• type，指定重抽样的类型。
4. calculate() 计算每次抽样的统计值 (stat = "diff in means")

• 可视化
null_dist %>%
visualize() +
shade_p_value(obs_stat = obs_diff, direction = "both")
• 计算p值
pvalue <- null_dist %>%
get_pvalue(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of reps chosen in the generate() step.
## See ?get_p_value() for more information.
pvalue
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

52.2.1.5 using lm()

model <- lm(bill_length_mm ~ sex, data = penguins)
broom::tidy(model)
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    42.1      0.400    105.   2.18e-256
## 2 sexmale         3.76     0.564      6.67 1.09e- 10
confint(model)
##                 2.5 %    97.5 %
## (Intercept) 41.309431 42.884509
## sexmale      2.649027  4.866557

52.2.2 每个种类下男企鹅和女企鹅bill_length_mm的均值

penguins %>%
ggplot(aes(x = species, y = bill_length_mm, color = sex)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(
position = position_jitterdodge()
) +
scale_x_discrete(
expand = expansion(mult = c(0.3, 0.3))
) +
theme(legend.position = "none")

52.2.2.1 using group_modify() + t.test()

penguins %>%
group_by(species) %>%
group_modify(
~ t.test(bill_length_mm ~ sex, data = .x, var.equal = TRUE) %>%
broom::tidy()
)
## # A tibble: 3 × 11
## # Groups:   species [3]
##   species   estimate estimate1 estimate2 statistic  p.value parameter conf.low
##   <fct>        <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>    <dbl>
## 1 Adelie       -3.13      37.3      40.4     -8.78 4.44e-15       144    -3.84
## 2 Chinstrap    -4.52      46.6      51.1     -7.57 1.53e-10        66    -5.71
## 3 Gentoo       -3.91      45.6      49.5     -8.82 1.30e-14       117    -4.79
## # ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>

52.2.2.2 using rstatix::t_test()

library(rstatix)

penguins %>%
group_by(species) %>%
rstatix::t_test(bill_length_mm ~ sex, var.equal = TRUE)
## # A tibble: 3 × 9
##   species   .y.            group1 group2    n1    n2 statistic    df        p
## * <fct>     <chr>          <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>
## 1 Adelie    bill_length_mm female male      73    73     -8.78   144 4.44e-15
## 2 Chinstrap bill_length_mm female male      34    34     -7.57    66 1.53e-10
## 3 Gentoo    bill_length_mm female male      58    61     -8.82   117 1.3 e-14

52.2.2.3 using ggstatsplot::grouped_ggbetweenstats()

library(ggstatsplot)

penguins %>%
grouped_ggbetweenstats(
x = sex,
y = bill_length_mm,
pairwise.comparisons = TRUE,
pairwise.display = "all",
var.equal = TRUE,
grouping.var = species  # group
)

52.2.3 两两比较不同种类的bill_length_mm的均值

• Adelie - Chinstrap
• Adelie - Gentoo
• Chinstrap - Gentoo
t.test(bill_length_mm ~ species, data = penguins) 
## Error in t.test.formula(bill_length_mm ~ species, data = penguins): grouping factor must have exactly 2 levels

species 有三组，也就说有三个层级，程序不接受。方法是：成对pairwise t-tests

52.2.3.1 using pairwise.t.test()

pairwise.t.test(x, y) # x is a vector of the data, y is the group factor
pairwise.t.test(
penguins$bill_length_mm, penguins$species,
alternative = "two.sided",
paired = FALSE,
) %>%
broom::tidy()
## # A tibble: 3 × 3
##   group1    group2     p.value
##   <chr>     <chr>        <dbl>
## 3 Gentoo    Chinstrap 5.37e- 3

penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
t.test(bill_length_mm ~ species, data = .) %>%
broom::tidy()
## # A tibble: 1 × 10
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl>
## 1     1.27      48.8      47.6      2.56  0.0117      131.    0.286      2.25
## # ℹ 2 more variables: method <chr>, alternative <chr>

52.2.3.2 using rstatix::pairwise_t_test()

library(rstatix)

penguins %>%
pairwise_t_test(
bill_length_mm ~ species,
alternative = "two.sided",
paired = FALSE
)
## # A tibble: 3 × 9
## * <chr>        <chr>  <chr>  <int> <int>    <dbl> <chr>       <dbl> <chr>
## 1 bill_length… Adelie Chins…   146    68 2.52e-70 ****     5.05e-70 ****
## 2 bill_length… Adelie Gentoo   146   119 1.05e-73 ****     3.16e-73 ****
## 3 bill_length… Chins… Gentoo    68   119 5.37e- 3 **       5.37e- 3 **

52.2.3.3 using ggstatsplot::ggbetweenstats()

penguins %>%
ggstatsplot::ggbetweenstats(
x = species,
y = bill_length_mm,
pairwise.comparisons = TRUE,
pairwise.display = "all",
)