# 坑 6 数据处理

## 6.1 探索性数据分析

• 一维数据看分布与异常值
• 二维数据看关系与趋势
• 三维看维度间两两关系
• 高维数据降维
• 相似个体聚类与网络结构

# Load the iris data set: 150 samples with sepal length/width,
# petal length/width, and species.
data("iris")
# 1-D view: histogram of sepal length (distribution / outliers)
hist(iris$Sepal.Length, xlab = '花萼长度', ylab = '频数', main = '')
# 2-D view: sepal length against sepal width
# (the original source collapsed everything below onto one line after a
# '#' comment, so none of it actually executed; restored here)
plot(iris$Sepal.Length ~ iris$Sepal.Width,
     xlab = '花萼宽度', ylab = '花萼长度', pch = 19)
# 3-D view: sepal length, sepal width, and species
# (color and point shape both encode the species factor)
plot(iris$Sepal.Length ~ iris$Sepal.Width,
     xlab = '花萼宽度', ylab = '花萼长度',
     col = iris$Species,
     pch = c(15, 17, 19)[as.numeric(iris$Species)])
legend('topright', legend = unique(iris$Species),
       col = unique(iris$Species), pch = c(15, 17, 19))
# Explore sample-to-sample similarity via correlation: mtcars has
# 32 cars measured on 11 variables; correlate the transposed matrix
# so each entry compares two cars. Named cmat to avoid shadowing
# stats::cor.
cmat <- cor(t(mtcars))
# Keep only correlations at or above the 0.99 threshold
cmat[cmat < 0.99] <- 0
# Build an edge list (from, to, cor) from the lower triangle
idx <- which(lower.tri(cmat), arr.ind = TRUE)
df <- data.frame(from = rownames(mtcars)[idx[, 1]],
                 to = rownames(mtcars)[idx[, 2]],
                 cor = cmat[lower.tri(cmat)])
# Drop the zeroed (below-threshold) pairs
df <- df[abs(df$cor) > 0, ]
# Visualize the high-correlation pairs as a network
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
##     decompose, spectrum
## The following object is masked from 'package:base':
##
##     union
library(ggraph)
## Loading required package: ggplot2
# Undirected graph from the (from, to, cor) edge list built above
net <- graph_from_data_frame(df, directed = FALSE)
# autograph() gives a quick default ggraph plot; label the nodes,
# hide the size legend, and use an empty theme.
# (Original had a trailing comma in autograph() and an unclosed
# geom_node_text() parenthesis — both syntax errors, fixed here.)
autograph(net, layout = 'fr', node_size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  guides(size = 'none') +
  theme_void()

## 6.2 统计推断

$\begin{eqnarray*} E\left[\sum_{i=1}^n (X_i - \bar X)^2\right] & = & \sum_{i=1}^n E\left[X_i^2\right] - n E\left[\bar X^2\right] \\ \\ & = & \sum_{i=1}^n \left\{Var(X_i) + \mu^2\right\} - n \left\{Var(\bar X) + \mu^2\right\} \\ \\ & = & \sum_{i=1}^n \left\{\sigma^2 + \mu^2\right\} - n \left\{\sigma^2 / n + \mu^2\right\} \\ \\ & = & n \sigma^2 + n \mu ^ 2 - \sigma^2 - n \mu^2 \\ \\ & = & (n - 1) \sigma^2 \end{eqnarray*}$

$\begin{eqnarray*} Var(\bar X) & = & Var \left( \frac{1}{n}\sum_{i=1}^n X_i \right)\\ \\ & = & \frac{1}{n^2} Var\left(\sum_{i=1}^n X_i \right)\\ \\ & = & \frac{1}{n^2} \sum_{i=1}^n Var(X_i) \\ \\ & = & \frac{1}{n^2} \times n\sigma^2 \\ \\ & = & \frac{\sigma^2}{n} \end{eqnarray*}$

1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3

set.seed(42)
# 10000 t tests where both samples come from the same N(0, 1):
# every null hypothesis is true, so any "significant" result is a
# false positive.
pvalue <- NULL
for (i in 1:10000) {
  a <- rnorm(10)
  b <- rnorm(10)
  c <- t.test(a, b)
  pvalue[i] <- c$p.value
}
# Distribution of the p-values (uniform on [0, 1] under the null)
hist(
  pvalue,
  main = substitute(paste(italic('p'), "值分布")),
  xlab = substitute(paste(italic('p'), "值")),
  ylab = '频率'
)
# How many fall below 0.05
sum(pvalue < 0.05)
## [1] 477
# How many fall below 0.0001
sum(pvalue < 0.0001)
## [1] 0

这样我们会看到进行了整体的控制之后，确实是找不到有差异的了，而且此时我们也可以看到p值的分布应该是0到1之间的均匀分布。但假如里面本来就有有差异的呢？

set.seed(42)
# Now every pair genuinely differs: b is exactly a + 1, so the
# alternative hypothesis is true in all 10000 tests.
pvalue <- NULL
for (i in 1:10000) {
  a <- rnorm(10, 1)
  b <- a + 1
  c <- t.test(a, b)
  pvalue[i] <- c$p.value
}
# Distribution of the p-values (now piled up near zero)
hist(
  pvalue,
  main = substitute(paste(italic('p'), "值分布")),
  xlab = substitute(paste(italic('p'), "值")),
  ylab = '频率'
)
# Count of p-values below 0.05
sum(pvalue < 0.05)
## [1] 6559
# Count of p-values below 0.0001
sum(pvalue < 0.0001)
## [1] 45

set.seed(42)
pvalue <- NULL
# First 5000 tests: a genuine difference (b is exactly a + 1)
for (i in 1:5000) {
  a <- rnorm(10, 1)
  b <- a + 1
  c <- t.test(a, b)
  pvalue[i] <- c$p.value
}
# Last 5000 tests: no real difference (both samples drawn from N(1, 1)),
# so the null-hypothesis proportion of this experiment is 50%.
for (i in 1:5000) {
  a <- rnorm(10, 1)
  b <- rnorm(10, 1)
  c <- t.test(a, b)
  pvalue[i + 5000] <- c$p.value
}
# Distribution of the p-values: a spike near zero (real differences)
# on top of a uniform background (true nulls)
hist(
  pvalue,
  main = substitute(paste(italic('p'), "值分布")),
  xlab = substitute(paste(italic('p'), "值")),
  ylab = '频率'
)
# Count of p-values below 0.05
sum(pvalue < 0.05)
## [1] 3499
# Count of p-values below 0.0001
sum(pvalue < 0.0001)
## [1] 21

$P(至少一个显著)=1-P(无显著差异) = 1-(1-\alpha/n)^n$

# Numbers of simultaneous tests to evaluate:
# 10, 20, ..., 100 and 100, 200, ..., 1000
n <- c(1:10 %o% 10 ^ (1:2))
# Per-test significance level
alpha <- 0.05
# Family-wise error rate with no correction:
# P(at least one false positive among n independent tests)
p0 <- 1 - (1 - alpha) ^ n
# FWER when each test is instead run at the Bonferroni level alpha / n
p <- 1 - (1 - alpha / n) ^ n
# Uncorrected curve: FWER climbs rapidly toward 1 as n grows
plot(
  n, p0,
  ylim = c(0, 1),
  pch = 19,
  xlab = '测试数',
  ylab = '整体错误率（FWER）'
)
# Bonferroni-corrected curve stays at or below alpha
points(n, p, pch = 17, col = 'grey')
legend('right',
       c('Bonferroni方法', '无矫正'),
       pch = c(17, 19),
       col = c('grey', 'black'))

$P(至少一个显著)=1-P(无显著差异) = 1-(1-\alpha')^n = 0.05$

# Total discoveries R: all tests rejected at the 0.05 level
# (uses the pvalue vector from the 50/50 mixed simulation above)
R <- sum(pvalue<0.05)
R
## [1] 3499
# False positives V: rejections among the true-null half
# (indices 5001-10000, where a and b came from the same distribution)
V <- sum(pvalue[5001:10000]<0.05)
V
## [1] 225
# Empirical false discovery rate Q = V / R
Q <- V/R
Q
## [1] 0.06430409

# Same calculation at the stricter 0.01 level:
# total discoveries R among all 10000 tests
R <- sum(pvalue<0.01)
R
## [1] 999
# False positives V: rejections among the true-null half (5001-10000)
V <- sum(pvalue[5001:10000]<0.01)
V
## [1] 34
# Empirical false discovery rate drops along with the threshold
Q <- V/R
Q
## [1] 0.03403403

Benjamini-Hochberg方法跟Holm方法很像，也是先排序，但之后p值并不是简单的乘排序，而是乘检验总数后除排序：

$p_i \leq \frac{i}{m} \alpha$

# Benjamini-Hochberg adjustment: controls the false discovery rate
pbh <- p.adjust(pvalue, method = 'BH')
# Holm adjustment, for comparison. NOTE(review): the original comment
# called this an FDR control, but Holm is a step-down FWER procedure —
# which is exactly why it is so much more conservative in the plot.
ph <- p.adjust(pvalue, method = 'holm')
# Adjusted values against the raw p-values: BH (black) rises smoothly,
# Holm (grey) saturates at 1 much earlier
plot(pbh ~ pvalue,
ylab = '错误发现率（FDR）',
xlab = substitute(paste(italic('p'), "值")),
pch = 19)
points(ph ~ pvalue, pch = 17, col = 'grey')
legend('bottomright',
c('Holm 法', 'BH 法'),
col = c('grey', 'black'),
pch = c(17,19))

$\hat\pi_0 = \frac{\#\{p_i>\lambda\}}{(1-\lambda)m}$

# q-values (Storey's method) for the same p-value vector.
# qvalue is a Bioconductor package.
library(qvalue)
q <- qvalue(pvalue)
# q-values against the raw p-values
# (original source collapsed this call and the following prose/code
# onto one line after a '#' comment; restored here)
plot(
  q$qvalues ~ pvalue,
  ylab = substitute(paste(italic('q'), "值")),
  xlab = substitute(paste(italic('p'), "值"))
)

如上图所示，q值增大后会最终逼近到0.5，而我们的模拟中零假设的比例设定就是50%。我们重新模拟一个零假设比例5%的实验：

set.seed(42)
pvalue <- NULL
# First 500 tests: real difference (b is exactly a + 1)
for (i in 1:500) {
  a <- rnorm(10, 1)
  b <- a + 1
  c <- t.test(a, b)
  pvalue[i] <- c$p.value
}
# Remaining 9500 tests: true nulls (both samples from N(1, 1)),
# so the null proportion is now 95%
for (i in 1:9500) {
  a <- rnorm(10, 1)
  b <- rnorm(10, 1)
  c <- t.test(a, b)
  pvalue[i + 500] <- c$p.value
}
# Compare BH-adjusted values with q-values on this new mixture
pbh <- p.adjust(pvalue, method = 'BH')
q <- qvalue(pvalue)
plot(pbh ~ pvalue,
     ylab = '错误发现率（FDR）',
     xlab = substitute(paste(italic('p'), "值")),
     pch = 15)
# q-values overlaid in grey
points(q$qvalues ~ pvalue, pch = 16, col = 'grey')
legend(
  'bottomright',
  c(expression(paste(italic(
    "q"
  ), " 值")), 'BH 法'),
  pch = c(16,15),
  col = c('grey','black')
)

## 6.3 线性模型

# Simulate a causal chain a -> b -> c with a common response y,
# to show how omitting mediators inflates a's apparent effect.
set.seed(42)
# a: uniform signal on [0, 10] plus N(0, 1) noise
a <- runif(100,min=0,max=10)+rnorm(100)
# b depends linearly on a; c depends linearly on b
b <- a*1.2+rnorm(100)
c <- b*1.2+rnorm(100)
# y sums all three predictors (each with true direct coefficient 1)
y <- a+b+c+rnorm(100)

# Single-predictor fit: a's slope (~3.66) absorbs the indirect paths
# through b and c, so it greatly overstates a's direct effect of 1.
summary(lm(y~a))
##
## Call:
## lm.default(formula = y ~ a)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -4.1929 -1.9935 -0.2595  1.9301  5.3011
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.14295    0.44101  -0.324    0.747
## a            3.65619    0.07162  51.048   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.339 on 98 degrees of freedom
## Multiple R-squared:  0.9638, Adjusted R-squared:  0.9634
## F-statistic:  2606 on 1 and 98 DF,  p-value: < 2.2e-16
# Full fit: with b and c included, a's coefficient (~1.20) is close to
# its true direct effect, and the residual error shrinks accordingly.
summary(lm(y~a+b+c))
##
## Call:
## lm.default(formula = y ~ a + b + c)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -2.10681 -0.69646 -0.06907  0.66959  2.31841
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03088    0.18341  -0.168    0.867
## a            1.20491    0.12910   9.333 4.03e-15 ***
## b            0.69905    0.16038   4.359 3.28e-05 ***
## c            1.11364    0.10523  10.583  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9716 on 96 degrees of freedom
## Multiple R-squared:  0.9939, Adjusted R-squared:  0.9937
## F-statistic:  5194 on 3 and 96 DF,  p-value: < 2.2e-16

## 6.6 主成分分析

$t = Wx + \mu + \epsilon$