# 第 48 章 模拟与抽样2

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(infer)
## Warning: package 'infer' was built under R version 4.2.2
penguins <- palmerpenguins::penguins %>% drop_na()

penguins %>%
specify(formula = bill_length_mm ~ sex) %>%
hypothesize(null = "independence") %>%
generate(reps = 5000, type = "permute") %>%
calculate(
stat = "diff in means",
order = c("male", "female")
) %>%
visualize()

## 48.1 重复infer中diff in means的抽样过程

• 独立性假设。假设有 y 和 x 两者独立，即y 与 x 无关，一个怎么变，都不会影响另一个。
• 零假设，x下有两个组（male, female），每组对应的y的均值是相等的，均值之差为0.

### 48.1.1 抽样

• 解释变量x列不动
• 响应变量y这一列，洗牌后放回。

tbl <- tibble(
y = 1:4,
x = c("a", "a", "b", "b")
)
tbl
## # A tibble: 4 × 2
##       y x
##   <int> <chr>
## 1     1 a
## 2     2 a
## 3     3 b
## 4     4 b

y <- tbl[[1]]
y_prime <- sample(y, size = length(y), replace = FALSE)

tbl[1] <- y_prime
tbl
## # A tibble: 4 × 2
##       y x
##   <int> <chr>
## 1     4 a
## 2     2 a
## 3     1 b
## 4     3 b

permute_once <- function(df) {
y <- df[[1]]
y_prime <- sample(y, size = length(y), replace = FALSE)

df[1] <- y_prime
return(df)
}

tbl %>% permute_once()
## # A tibble: 4 × 2
##       y x
##   <int> <chr>
## 1     2 a
## 2     3 a
## 3     1 b
## 4     4 b

1:100 %>%
purrr::map(~ permute_once(tbl)) 

permuate_repeat <- function(df, reps = 30){
df_out <-
purrr::map_dfr(.x = 1:reps, .f = ~ permute_once(df)) %>%
dplyr::mutate(replicate = rep(1:reps, each = nrow(df))) %>%
dplyr::group_by(replicate)

return(df_out)
}

tbl %>% permuate_repeat(reps = 1000)
## # A tibble: 4,000 × 3
## # Groups:   replicate [1,000]
##        y x     replicate
##    <int> <chr>     <int>
##  1     1 a             1
##  2     2 a             1
##  3     3 b             1
##  4     4 b             1
##  5     4 a             2
##  6     2 a             2
##  7     3 b             2
##  8     1 b             2
##  9     3 a             3
## 10     1 a             3
## # ℹ 3,990 more rows

### 48.1.2 计算null假设分布

null_dist <- tbl %>%
permuate_repeat(reps = 1000) %>%

group_by(replicate, x) %>%
summarise(ybar = mean(y)) %>%
group_by(replicate) %>%
summarise(
stat = ybar[x == "a"] - ybar[x == "b"]
)
## summarise() has grouped output by 'replicate'. You can override using the
## .groups argument.
null_dist
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1    -2
##  2         2     1
##  3         3     0
##  4         4     0
##  5         5     2
##  6         6     2
##  7         7     0
##  8         8     1
##  9         9     2
## 10        10    -1
## # ℹ 990 more rows

### 48.1.3 可视化

null_dist %>%
ggplot(aes(x = stat)) +
geom_histogram(bins = 15, color = "white")

### 48.1.4 应用penguins

samples <- penguins %>%
select(bill_length_mm, sex) %>%
permuate_repeat(reps = 5000)

null_dist <- samples %>%
group_by(replicate, sex) %>%
summarise(ybar = mean(bill_length_mm)) %>%
group_by(replicate) %>%
summarise(
stat = ybar[sex == "male"] - ybar[sex == "female"]
)
## summarise() has grouped output by 'replicate'. You can override using the
## .groups argument.
null_dist %>%
ggplot(aes(x = stat)) +
geom_histogram(bins = 15, color = "white")
penguins %>%
group_by(sex) %>%
summarize(avg_rating = mean(bill_length_mm, na.rm = TRUE)) %>%
mutate(diff_means = avg_rating - lag(avg_rating))
## # A tibble: 2 × 3
##   sex    avg_rating diff_means
##   <fct>       <dbl>      <dbl>
## 1 female       42.1      NA
## 2 male         45.9       3.76
p_value <- sum(null_dist$stat > 3.757792) / length(null_dist$stat)
p_value
## [1] 0