第 2 章 基础模型
2.1 概述
统计模型计算的参数模型,并不是实际数据的关系,而是基于某原则(最小二乘,最大似然)最接近实际数据的模型。
2.2 热身
library(ggplot2)
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(modelr)
2.3 简单模型
用sim1
的数据画图,观察数据分布。
ggplot(sim1, aes(x,y)) +
geom_point()
看起来随着变量X的增大,Y也增大,从形状来看二者具备线性关系,比如y = a0 + a1 * x
。好了,开始建立一个模型,利用runif()
函数随机生成a1, a2
,将随机生成的a1,a2
分配给`geom_abline()
绘制线图看看。
<- tibble(
models a1 = runif(250,-20,40),
a2 = runif(250, -5,5)
)
ggplot(sim1, aes(x,y)) +
geom_point() +
geom_abline(data = models, aes(intercept = a1, slope = a2), alpha = 0.2)
绘制了250条线,现在需要找到实际数据点距离模型距离最短的模型。
set.seed(1)
<- rnorm(30)
adj <- sim1 %>% mutate(x = x + adj)
sim1 <- tibble(x = sim1$x, y = sim1$y, xend = x, yend = 4.22 + 2.05 * x )
distance ggplot(sim1, aes(x,y)) +
geom_point(alpha = 0.5) +
geom_abline(aes(intercept = 4.22, slope = 2.05)) +
geom_segment(data =distance, aes(x = x, y = y , xend = xend, yend = yend), col = "blue")
为了计算距离,创建1个函数,生成模型预测值。
<- function(a, data){
model1 1] + a[2] * data$x
a[
}
model1(c(3,6), sim1)
## [1] 5.241277 10.101860 3.986228 24.571685 16.977047 10.077190 23.924574
## [8] 25.429948 24.454688 25.167670 36.070687 29.339059 29.272557 19.711801
## [15] 39.749586 38.730398 38.902858 44.663017 49.927327 48.563408 50.513864
## [22] 55.692818 51.447390 39.063890 60.718954 56.663228 56.065227 54.175486
## [29] 60.131100 65.507649
创建1个函数计算预测值与实际值之间的距离的残差
<- function(mod, data){
measure_dis <- data$y - model1(mod,data)
diff sqrt(mean(diff^2))
}
measure_dis(c(3,6), sim1)
## [1] 24.21746
现在可以使用purrr
计算250个模型的残差。
<- function(a1, a2){
sim1_dist measure_dis(c(a1, a2),sim1)
}
<- models %>%
models mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
%>% arrange(dist) models
## # A tibble: 250 x 3
## a1 a2 dist
## <dbl> <dbl> <dbl>
## 1 1.97 2.44 2.96
## 2 7.94 1.41 3.06
## 3 7.87 1.62 3.10
## 4 2.38 2.02 3.17
## 5 1.50 2.59 3.21
## 6 8.59 1.58 3.41
## 7 1.28 2.83 3.94
## 8 9.76 1.57 4.15
## 9 -3.63 3.18 4.63
## 10 -0.608 2.18 4.75
## # ... with 240 more rows
现在画出距离最短的10条线看看
ggplot(sim1, aes(x,y)) +
geom_point(size = 2) +
geom_abline(data = filter(models, rank(dist) <= 10) , aes(intercept = a1, slope = a2, col = -dist))
把模型的参数绘图
%>% ggplot(aes(x = a1, y = a2, col = - dist)) +
models geom_point(data = filter(models, rank(dist) <= 10 ), aes(x = a1, y = a2), col = "red", size = 4) +
geom_point() +
stat_ellipse(type = "norm", level = 0.90)
根据上图可以看出最合适的参数a1的分布范围在-5到20之间,a2在0到4之间,因此我们可以创建一个参数矩阵找到最合适的参数
<- expand.grid(
grid a1 = seq(-5,20,length = 50),
a2 = seq(0,4,length = 50)
%>% mutate(
) dist = purrr::map2_dbl(a1, a2, sim1_dist)
)%>% ggplot(aes(x = a1, y = a2)) +
grid geom_point(data = filter(grid, rank(dist) <= 10), aes(x = a1, y = a2), size = 4, col = "red")+
geom_point(aes(col = -dist))
用最终获得的10个参数绘图
ggplot(sim1,aes(x = x, y = y )) +
geom_point() +
geom_abline(data = filter(grid, rank(dist) <= 10), aes(intercept = a1, slope = a2, col = - dist))
使用线性回归创建模型
<- lm(y ~ x, data = sim1)
sim1_mod coef(sim1_mod)
## (Intercept) x
## 4.587061 1.955625
使用data_grid()
生成样本空间
data(sim1)
<- sim1 %>% data_grid(x) grid
生成预测值
<- grid %>% add_predictions(sim1_mod)
grid grid
## # A tibble: 10 x 2
## x pred
## <int> <dbl>
## 1 1 6.54
## 2 2 8.50
## 3 3 10.5
## 4 4 12.4
## 5 5 14.4
## 6 6 16.3
## 7 7 18.3
## 8 8 20.2
## 9 9 22.2
## 10 10 24.1
使用真实值添加残差
<- sim1 %>% add_residuals(sim1_mod)
sim1 sim1
## # A tibble: 30 x 3
## x y resid
## <int> <dbl> <dbl>
## 1 1 4.20 -2.34
## 2 1 7.51 0.968
## 3 1 2.13 -4.42
## 4 2 8.99 0.491
## 5 2 10.2 1.74
## 6 2 11.3 2.80
## 7 3 7.36 -3.10
## 8 3 10.5 0.0514
## 9 3 10.5 0.0577
## 10 4 12.4 0.0250
## # ... with 20 more rows
绘制残差密度分布图
%>% ggplot() +
sim1 geom_freqpoly(aes(x = resid))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
残差代替预测值的图
%>% ggplot(aes(x, resid)) +
sim1 geom_ref_line(h = 0) +
geom_point()
这些点看起来随机分布,提示模型预测效果较好。
2.3.1 练习
使用loess()
曲线拟合sim1数据
data(sim1)
<- loess(y ~ x, data = sim1)
sim1_Mod2 %>% summary() sim1_Mod2
## Call:
## loess(formula = y ~ x, data = sim1)
##
## Number of Observations: 30
## Equivalent Number of Parameters: 4.19
## Residual Standard Error: 2.262
## Trace of smoother matrix: 4.58 (exact)
##
## Control settings:
## span : 0.75
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
<- sim1 %>% data_grid(x) %>%
grid add_predictions(sim1_Mod2)
%>% ggplot(aes(x = x, y = y )) +
sim1 geom_point() +
geom_line(data = grid, aes(x = x ,y = pred),col = "blue", size = 3) +
geom_smooth(formula = y ~ x, col = "red", se = F, alpha = 0.2)
## `geom_smooth()` using method = 'loess'
%>% spread_predictions(sim1_Mod2, sim1_mod) sim1
## # A tibble: 30 x 4
## x y sim1_Mod2 sim1_mod
## <int> <dbl> <dbl> <dbl>
## 1 1 4.20 5.34 6.54
## 2 1 7.51 5.34 6.54
## 3 1 2.13 5.34 6.54
## 4 2 8.99 8.27 8.50
## 5 2 10.2 8.27 8.50
## 6 2 11.3 8.27 8.50
## 7 3 7.36 10.8 10.5
## 8 3 10.5 10.8 10.5
## 9 3 10.5 10.8 10.5
## 10 4 12.4 12.8 12.4
## # ... with 20 more rows
2.4 矩阵填充tribble
<- tribble(
df ~y, ~x1, ~x2,
4, 3, 5,
5, 2, 6,
6, 1, 7
) df
## # A tibble: 3 x 3
## y x1 x2
## <dbl> <dbl> <dbl>
## 1 4 3 5
## 2 5 2 6
## 3 6 1 7
model_matrix(df, y ~ x1 + x2 )
## # A tibble: 3 x 3
## `(Intercept)` x1 x2
## <dbl> <dbl> <dbl>
## 1 1 3 5
## 2 1 2 6
## 3 1 1 7
2.5 分类变量模型
当使用分类变量作为因变量的时候,字符型变量无法参与计算。在R中将factor
默认转换为数字计算。例如factor(c("男" ,"女"))
将男和女分别转换为0,1进行计算。
<- tribble(
df ~ sex, ~ response,
"male", 1,
"female", 2,
"male", 1
)
model.matrix(data = df, response ~ sex)
## (Intercept) sexmale
## 1 1 1
## 2 1 0
## 3 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$sex
## [1] "contr.treatment"
使用sim2
的数据建模,观察一下数据分布
data("sim2")
ggplot(sim2,aes(x, y)) + geom_point()
使用线性模型拟合
<- lm(y ~ x, data = sim2)
mod2 <- sim2 %>%
grid data_grid(x) %>%
add_predictions(mod2)
grid
## # A tibble: 4 x 2
## x pred
## <chr> <dbl>
## 1 a 1.15
## 2 b 8.12
## 3 c 6.13
## 4 d 1.91
看看模型预测值的分布
ggplot(sim2,aes(x, y)) +
geom_point(data = grid, aes(x, pred), col = "red", size = 4) +
geom_point()
看起来预测的值是自变量的均值。注意: 无法使用未观测的自变量在模型中预测。