# 第 30 章 统计检验与线性模型的等价性

experim是一个模拟的数据集，这个虚拟的研究目的是，考察两种不同类型的干预措施对帮助学生应对即将到来的统计学课程的焦虑的影响。实验方案如下：

• 首先，学生完成一定数量的量表(量表1)，包括对统计学课程的害怕(fost)，自信(confid)，抑郁(depress)指标

• 然后，学生被等分成两组，组1 进行技能提升训练；组2进行信心提升训练。训练结束后，完成相同的量表(量表2)

• 三个月后，他们再次完成相同的量表(量表3)

library(tidyverse)
library(knitr)
library(kableExtra)

mutate(group = if_else(group == "maths skills", 1, 2)) %>%
mutate(
across(c(sex, id, group), as.factor)
)
edata
## # A tibble: 30 x 18
##    id    sex     age group fost1 confid1 depress1 fost2
##    <fct> <fct> <dbl> <fct> <dbl>   <dbl>    <dbl> <dbl>
##  1 4     male     23 2        50      15       44    48
##  2 10    male     21 2        47      14       42    45
##  3 9     male     25 1        44      12       40    39
##  4 3     male     30 1        47      11       43    42
##  5 12    male     45 2        46      16       44    45
##  6 11    male     22 1        39      13       43    40
##  7 6     male     22 2        32      21       37    33
##  8 5     male     26 1        44      17       46    37
##  9 8     male     23 2        40      22       37    40
## 10 13    male     21 1        47      20       50    45
## # ... with 20 more rows, and 10 more variables:
## #   confid2 <dbl>, depress2 <dbl>, fost3 <dbl>,
## #   confid3 <dbl>, depress3 <dbl>, exam <dbl>,
## #   mah_1 <dbl>, DepT1gp2 <chr>, DepT2Gp2 <chr>,
## #   DepT3gp2 <chr>
glimpse(edata)
## Rows: 30
## Columns: 18
## $id <fct> 4, 10, 9, 3, 12, 11, 6, 5, 8, 13,... ##$ sex      <fct> male, male, male, male, male, mal...
## $age <dbl> 23, 21, 25, 30, 45, 22, 22, 26, 2... ##$ group    <fct> 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, ...
## $fost1 <dbl> 50, 47, 44, 47, 46, 39, 32, 44, 4... ##$ confid1  <dbl> 15, 14, 12, 11, 16, 13, 21, 17, 2...
## $depress1 <dbl> 44, 42, 40, 43, 44, 43, 37, 46, 3... ##$ fost2    <dbl> 48, 45, 39, 42, 45, 40, 33, 37, 4...
## $confid2 <dbl> 16, 15, 18, 16, 16, 20, 22, 20, 2... ##$ depress2 <dbl> 44, 42, 40, 43, 45, 42, 36, 47, 3...
## $fost3 <dbl> 45, 44, 36, 41, 43, 39, 32, 32, 4... ##$ confid3  <dbl> 14, 18, 19, 20, 20, 22, 23, 26, 2...
## $depress3 <dbl> 40, 40, 38, 43, 43, 38, 35, 42, 3... ##$ exam     <dbl> 52, 55, 58, 60, 58, 62, 59, 70, 6...
## $mah_1 <dbl> 0.5700, 1.6594, 3.5405, 2.4542, 0... ##$ DepT1gp2 <chr> "not depressed", "not depressed",...
## $DepT2Gp2 <chr> "not depressed", "not depressed",... ##$ DepT3gp2 <chr> "not depressed", "not depressed",...

## 30.1 t检验

# Run t-test
model_1_t <- t.test(edata$depress1, mu = 0) model_1_t ## ## One Sample t-test ## ## data: edata$depress1
## t = 51, df = 29, p-value <2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  40.82 44.25
## sample estimates:
## mean of x
##     42.53

edata %>%
ggplot(aes(x = depress1)) +
geom_density()

t.test(depress1 ~ 1, data = edata)
##
##  One Sample t-test
##
## data:  depress1
## t = 51, df = 29, p-value <2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  40.82 44.25
## sample estimates:
## mean of x
##     42.53
# Run equivalent linear model
model_1_lm <- lm(depress1 ~ 1, data = edata)
summary(model_1_lm)
##
## Call:
## lm(formula = depress1 ~ 1, data = edata)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -9.533 -3.533  0.467  2.467  7.467
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   42.533      0.838    50.7   <2e-16 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.59 on 29 degrees of freedom

library(broom)

# tidy() gets model outputs we usually use to report our results
model_1_t_tidy <- tidy(model_1_t) %>% mutate(model = "t.test(y)")
model_1_lm_tidy <- tidy(model_1_lm) %>% mutate(model = "lm(y ~ 1)")

results <- bind_rows(model_1_t_tidy, model_1_lm_tidy) %>%
select(model, estimate, statistic, p.value)
model estimate statistic p.value
t.test(y) 42.53 50.73 0
lm(y ~ 1) 42.53 50.73 0

# run paired t-test testing depression from g1 against g2
model_2_t <- t.test(edata$depress1, edata$depress3, paired = TRUE)
model_2_t_tidy <- tidy(model_2_t) %>% mutate(model = "t.test(x, y, paired = TRUE")
model_2_t
##
##  Paired t-test
##
## data:  edata$depress1 and edata$depress3
## t = 7.2, df = 29, p-value = 6e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.386 4.281
## sample estimates:
## mean of the differences
##                   3.333

# run linear model
model_2_lm <- lm(depress1 - depress3 ~ 1, data = edata)
model_2_lm_tidy <- tidy(model_2_lm) %>% mutate(model = "lm(y-x ~ 1)")
summary(model_2_lm)
##
## Call:
## lm(formula = depress1 - depress3 ~ 1, data = edata)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -4.333 -1.333 -0.333  1.667  5.667
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    3.333      0.463     7.2  6.4e-08 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.54 on 29 degrees of freedom
# we combine the two model outputs, rowwise
results <- bind_rows(model_2_t_tidy, model_2_lm_tidy) %>%
select(model, estimate, statistic, p.value)
model estimate statistic p.value
t.test(x, y, paired = TRUE 3.333 7.196 0
lm(y-x ~ 1) 3.333 7.196 0

haha，通过这个等价的线性模型，我们似乎可以窥探到配对样本t检验是怎么样工作的。 它depress1depress3一一对应相减后得到新的一列，用新的一列做单样本t检验

# run paired t-test testing depression from g1 against g2
model_2_t2 <- t.test(edata$depress1- edata$depress3)
model_2_t2_tidy <- tidy(model_2_t2) %>% mutate(model = "t.test(x - y")
model_2_t2
##
##  One Sample t-test
##
## data:  edata$depress1 - edata$depress3
## t = 7.2, df = 29, p-value = 6e-08
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  2.386 4.281
## sample estimates:
## mean of x
##     3.333

# calculate the difference between baseline and tp3 depression
edata <- edata %>%
mutate(
dep_slope = depress1 - depress3
)

model_2_lm2 <- lm(dep_slope ~ 1, data = edata)
model_2_lm2_tidy <- tidy(model_2_lm2) %>% mutate(model = "lm(delta ~ 1)")

# we combine the three model outputs, rowwise
results <- bind_rows(model_2_t_tidy, model_2_t2_tidy) %>%
bind_rows(model_2_lm_tidy) %>%
bind_rows(model_2_lm2_tidy) %>%
select(model, estimate, statistic, p.value)
model estimate statistic p.value
t.test(x, y, paired = TRUE 3.333 7.196 0
t.test(x - y 3.333 7.196 0
lm(y-x ~ 1) 3.333 7.196 0
lm(delta ~ 1) 3.333 7.196 0

## 30.2 关联

# Run correlation test
model_3_cor <- cor.test(edata$depress3, edata$depress1, method = "pearson")
model_3_cor_tidy <- tidy(model_3_cor) %>% mutate(model = "cor.test(x, y)")
model_3_cor
##
##  Pearson's product-moment correlation
##
## data:  edata$depress3 and edata$depress1
## t = 9.2, df = 28, p-value = 5e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7379 0.9354
## sample estimates:
##    cor
## 0.8675
# Run equivalent linear model
model_3_lm <- lm(depress3 ~ depress1, data = edata)
model_3_lm_tidy <- tidy(model_3_lm) %>% mutate(model = "lm(y ~ x)")
summary(model_3_lm)
##
## Call:
## lm(formula = depress3 ~ depress1, data = edata)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -5.842 -1.474  0.177  1.293  4.197
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -1.687      4.455   -0.38     0.71
## depress1       0.961      0.104    9.23  5.5e-10 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.58 on 28 degrees of freedom
## Multiple R-squared:  0.753,  Adjusted R-squared:  0.744
## F-statistic: 85.2 on 1 and 28 DF,  p-value: 5.49e-10
# we combine the two model outputs, rowwise
results <- bind_rows(model_3_cor_tidy, model_3_lm_tidy) %>%
select(model, term, estimate, statistic, p.value)
model term estimate statistic p.value
cor.test(x, y) NA 0.8675 9.2291 0.0000
lm(y ~ x) (Intercept) -1.6871 -0.3787 0.7078
depress1 0.9613 9.2291 0.0000

## 30.3 单因素方差分析

# Run one-way anova
model_4_anova <- aov(depress1 ~ group, data = edata)
model_4_anova_tidy <- tidy(model_4_anova) %>% mutate(model = "aov(y ~ factor(x))")
summary(model_4_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        1      2    2.13     0.1   0.76
## Residuals   28    609   21.76
# Run equivalent linear model
model_4_lm <- lm(depress1 ~ group, data = edata)
model_4_lm_tidy <- tidy(model_4_lm) %>% mutate(model = "lm(y ~ factor(x))")
summary(model_4_lm)
##
## Call:
## lm(formula = depress1 ~ group, data = edata)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -9.800 -3.667  0.467  2.733  7.733
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   42.800      1.204   35.53   <2e-16 ***
## group2        -0.533      1.703   -0.31     0.76
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.66 on 28 degrees of freedom
## Multiple R-squared:  0.00349,    Adjusted R-squared:  -0.0321
## F-statistic: 0.098 on 1 and 28 DF,  p-value: 0.757
# we combine the two model outputs, rowwise
results <- bind_rows(model_4_anova_tidy, model_4_lm_tidy) %>%
select(model, term, estimate, statistic, p.value)
model term estimate statistic p.value
aov(y ~ factor(x)) group NA 0.0980 0.7565
Residuals NA NA NA
lm(y ~ factor(x)) (Intercept) 42.8000 35.5337 0.0000
group2 -0.5333 -0.3131 0.7565

lm()中，是将group1视为基线, 依次比较其它因子水平（比如group2）与基线group1的之间的偏离，并评估每次这种偏离的显著性；

aov评估group变量整体是否有效应，即如果group中的任何一个因子水平都显著偏离基线水平，那么说明group变量是显著的。

# take the square root of the anova stat
sqrt(model_4_anova_tidy$statistic[1]) ## [1] 0.3131 # same as stat from lm model_4_lm_tidy$statistic[2]
## [1] -0.3131
# or, square the lm stat
model_4_lm_tidy$statistic[2]^2 ## [1] 0.09803 # same as anova stat model_4_anova_tidy$statistic[1]
## [1] 0.09803

## 30.4 单因素协变量分析

# Run one-way anova
model_5_ancova <- aov(dep_slope ~ group + confid1, data = edata)
model_5_ancova_tidy <- tidy(model_5_ancova) %>% mutate(model = "aov(y ~ x + z)")
summary(model_5_ancova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        1   19.2   19.20    3.13  0.088 .
## confid1      1    2.0    2.03    0.33  0.569
## Residuals   27  165.4    6.13
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run equivalent linear model
model_5_lm <- lm(dep_slope ~ group + confid1, data = edata)
model_5_lm_tidy <- tidy(model_5_lm) %>% mutate(model = "lm(y ~ x + z)")
summary(model_5_lm)
##
## Call:
## lm(formula = dep_slope ~ group + confid1, data = edata)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -4.485 -2.041  0.424  1.523  4.909
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   3.4637     1.7375    1.99    0.056 .
## group2        1.6132     0.9041    1.78    0.086 .
## confid1      -0.0493     0.0856   -0.58    0.569
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.48 on 27 degrees of freedom
## Multiple R-squared:  0.114,  Adjusted R-squared:  0.0481
## F-statistic: 1.73 on 2 and 27 DF,  p-value: 0.196
# we combine the two model outputs, rowwise
results <- bind_rows(model_5_ancova_tidy, model_5_lm_tidy) %>%
select(model, term, estimate, statistic, p.value)
model term estimate statistic p.value
aov(y ~ x + z) group NA 3.1336 0.0880
confid1 NA 0.3316 0.5695
Residuals NA NA NA
lm(y ~ x + z) (Intercept) 3.4637 1.9935 0.0564
group2 1.6132 1.7842 0.0856
confid1 -0.0493 -0.5758 0.5695

aov模型中，group整体的p-value是 ~0.088， lm模型中检验group2偏离group1的p.value是0.086. confidence 这个变量p.value都是0.56993。

# or, square the lm stat
model_5_lm_tidy$statistic[-1]^2 ## [1] 3.1832 0.3316 # same as anova stat model_5_ancova_tidy$statistic
## [1] 3.1336 0.3316     NA

## 30.5 双因素方差分析

# Run anova
model_6_anova <- aov(dep_slope ~ group * sex, data = edata)
model_6_anova_tidy <- tidy(model_6_anova) %>% mutate(model = "aov(y ~ x * z)")
summary(model_6_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        1   19.2   19.20    3.33  0.079 .
## sex          1    5.1    5.15    0.89  0.353
## group:sex    1   12.5   12.51    2.17  0.153
## Residuals   26  149.8    5.76
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run equivalent linear model
model_6_lm <- lm(dep_slope ~ group * sex, data = edata)
model_6_lm_tidy <- tidy(model_6_lm) %>% mutate(model = "lm(y ~ x * z)")
summary(model_6_lm)
##
## Call:
## lm(formula = dep_slope ~ group * sex, data = edata)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -5.125 -1.821  0.482  1.719  3.875
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)       2.286      0.907    2.52    0.018 *
## group2            2.839      1.242    2.29    0.031 *
## sexmale           0.464      1.242    0.37    0.712
## group2:sexmale   -2.589      1.757   -1.47    0.153
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.4 on 26 degrees of freedom
## Multiple R-squared:  0.197,  Adjusted R-squared:  0.105
## F-statistic: 2.13 on 3 and 26 DF,  p-value: 0.12
# we combine the two model outputs, rowwise
results <- bind_rows(model_6_anova_tidy, model_6_lm_tidy) %>%
select(model, term, estimate, statistic, p.value)
model term estimate statistic p.value
aov(y ~ x * z) group NA 3.3324 0.0794
sex NA 0.8935 0.3532
group:sex NA 2.1721 0.1525
Residuals NA NA NA
lm(y ~ x * z) (Intercept) 2.2857 2.5194 0.0182
group2 2.8393 2.2855 0.0307
sexmale 0.4643 0.3737 0.7116
group2:sexmale -2.5893 -1.4738 0.1525

aov()lm() 公式写法是一样，输出结果又一次不一样，但结论是相同的（尤其是分组变量只有两个水平时候）

aov评估的是 （这些变量作为整体），是否对Y产生差异；而lm 模型，我们评估的是分类变量中的某个因子水平是否与基线水平之间是否有著差异。

edata_mock <- edata %>%
# Add 2 to numeric version of groups
mutate(group = as.numeric(group) + 2) %>%
# bind this by row to the origincal eData (with group as numeric)
bind_rows(edata %>%
mutate(group = as.numeric(group))) %>%
# make group a factor again so the correct test is applied
mutate(group = as.factor(group))

# Run  anova
model_7_anova <- aov(dep_slope ~ group * sex, data = edata_mock)
model_7_anova_tidy <- tidy(model_7_anova) %>% mutate(model = "aov(y ~ x * z)")
summary(model_7_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        3   38.4   12.80    2.22  0.097 .
## sex          1   10.3   10.30    1.79  0.187
## group:sex    3   25.0    8.34    1.45  0.239
## Residuals   52  299.6    5.76
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run equivalent linear model
model_7_lm <- lm(dep_slope ~ group * sex, data = edata_mock)
model_7_lm_tidy <- tidy(model_7_lm) %>% mutate(model = "lm(y ~ x * z)")
summary(model_7_lm)
##
## Call:
## lm(formula = dep_slope ~ group * sex, data = edata_mock)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -5.125 -2.000  0.482  1.875  3.875
##
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)     2.29e+00   9.07e-01    2.52    0.015 *
## group2          2.84e+00   1.24e+00    2.29    0.026 *
## group3         -2.81e-15   1.28e+00    0.00    1.000
## group4          2.84e+00   1.24e+00    2.29    0.026 *
## sexmale         4.64e-01   1.24e+00    0.37    0.710
## group2:sexmale -2.59e+00   1.76e+00   -1.47    0.147
## group3:sexmale  1.69e-15   1.76e+00    0.00    1.000
## group4:sexmale -2.59e+00   1.76e+00   -1.47    0.147
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.4 on 52 degrees of freedom
## Multiple R-squared:  0.197,  Adjusted R-squared:  0.0894
## F-statistic: 1.83 on 7 and 52 DF,  p-value: 0.101
# we combine the two model outputs, rowwise
results <- bind_rows(model_7_anova_tidy, model_7_lm_tidy) %>%
select(model, term, estimate, statistic, p.value)
model term estimate statistic p.value
aov(y ~ x * z) group NA 2.2216 0.0966
sex NA 1.7871 0.1871
group:sex NA 1.4481 0.2395
Residuals NA NA NA
lm(y ~ x * z) (Intercept) 2.2857 2.5194 0.0149
group2 2.8393 2.2855 0.0264
group3 0.0000 0.0000 1.0000
group4 2.8393 2.2855 0.0264
sexmale 0.4643 0.3737 0.7101
group2:sexmale -2.5893 -1.4738 0.1466
group3:sexmale 0.0000 0.0000 1.0000
group4:sexmale -2.5893 -1.4738 0.1466

edata %>%
mutate(sex:group = interaction(sex, group, sep = ":")) %>%
ggplot(aes(
x = sex:group,
y = dep_slope,
colour = sex:group
)) +
geom_jitter(width = .2) +
geom_boxplot(width = .3, alpha = .2) +
labs(
y = "Depression difference",
title = "Depression difference between baseline and EOS",
subtitle = "Divided by intervention group and sex"
)