Chapter 21 財務管理課堂數據集之迴歸建模研究專案管理
21.1 旭
21.2 預測研究專案之基本盤
21.2.1 解析問題
21.2.2 準備數據
21.2.3 特徵值工程
21.2.4 檔案管理
21.2.5 建模工程
21.2.6 預測工程
21.2.7 解讀工程
21.3 建模前的預備知識
作者小編在這裡引用波士頓數據集介紹部分建模必要的R。先準備數據:
library(MASS)
data(Boston)
::datatable(Boston,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
21.3.1 formula
# https://www.datacamp.com/community/tutorials/r-formula-tutorial
<- paste0("x", 1:25)
xnam xnam
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10" "x11" "x12"
## [13] "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20" "x21" "x22" "x23" "x24"
## [25] "x25"
<- as.formula(paste("y ~ ", paste(xnam, collapse= "+")))) (fmla
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
## x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 +
## x22 + x23 + x24 + x25
class(fmla)
## [1] "formula"
args(as.formula)
## function (object, env = parent.frame())
## NULL
21.3.2 lm
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/args.html
args(lm)
## function (formula, data, subset, weights, na.action, method = "qr",
## model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
## contrasts = NULL, offset, ...)
## NULL
21.3.3 predict
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/args.html
args(predict)
## function (object, ...)
## NULL
21.3.4 broom
21.4 傳統迴歸建模研究方法論
記住這句話:
模型千千百百種。
假設作者小編想產生以下這一張報表並且找到名列其中的最佳模型。
21.4.1 暴力解
<- colnames(Boston)[-14]
Var
<- character(0)
model <- numeric(0)
R2
# 第1個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[1]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[1]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第2個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[2]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[2]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第3個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[3]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[3]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第4個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[4]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[4]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第5個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[5]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[5]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第6個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[6]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[6]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第7個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[7]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[7]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第8個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[8]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[8]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第9個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[9]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[9]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第10個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[10]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[10]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第11個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[11]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[11]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第12個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[12]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[12]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2 # 第13個解釋變數
<- as.formula(paste(colnames(Boston)[14], "~", Var[13]))
bostonLM <- c(model, paste(colnames(Boston)[14], "~", Var[13]))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2
<- data.frame(model = model,
models R2 = R2,
stringsAsFactors = FALSE)
::datatable(models, options = list(pageLength = 13)) DT
21.4.2 迴圈解
<- character(0)
model <- numeric(0)
R2 for (Var in colnames(Boston)[-14]) {
<- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM #print(summary(lm(bostonLM, data = Boston))$r.squared)
<- c(model, paste(colnames(Boston)[14], "~", Var))
model <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
R2
}
<- data.frame(model = model,
models R2 = R2,
stringsAsFactors = FALSE)
::datatable(models, options = list(pageLength = 13)) DT
$model[which.max(models$R2)] models
## [1] "medv ~ lstat"
summary(lm(as.formula(models$model[which.max(models$R2)]),data = Boston))
##
## Call:
## lm(formula = as.formula(models$model[which.max(models$R2)]),
## data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <0.0000000000000002 ***
## lstat -0.95005 0.03873 -24.53 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 0.00000000000000022
summary(lm(medv ~ lstat, data = Boston))
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <0.0000000000000002 ***
## lstat -0.95005 0.03873 -24.53 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 0.00000000000000022
21.4.3 相關係數解
cor(x = Boston[colnames(Boston)[-14]],
y = Boston[colnames(Boston)[14]])
## medv
## crim -0.3883046
## zn 0.3604453
## indus -0.4837252
## chas 0.1752602
## nox -0.4273208
## rm 0.6953599
## age -0.3769546
## dis 0.2499287
## rad -0.3816262
## tax -0.4685359
## ptratio -0.5077867
## black 0.3334608
## lstat -0.7376627
<- cor(x = Boston[colnames(Boston)[-14]],
COR y = Boston[colnames(Boston)[14]])
COR
## medv
## crim -0.3883046
## zn 0.3604453
## indus -0.4837252
## chas 0.1752602
## nox -0.4273208
## rm 0.6953599
## age -0.3769546
## dis 0.2499287
## rad -0.3816262
## tax -0.4685359
## ptratio -0.5077867
## black 0.3334608
## lstat -0.7376627
which(abs(COR) == max(abs(COR)), arr.ind = TRUE)] COR[
## [1] -0.7376627
which.max(abs(COR))
## [1] 13
colnames(Boston)[-14][which.max(abs(COR))]
## [1] "lstat"
<- colnames(Boston)[-14][which.max(abs(COR))]
VAR VAR
## [1] "lstat"
<- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM bostonLM
## medv ~ lstat
summary(lm(bostonLM, data = Boston))
##
## Call:
## lm(formula = bostonLM, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <0.0000000000000002 ***
## lstat -0.95005 0.03873 -24.53 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 0.00000000000000022
summary(lm(medv ~ lstat, data = Boston))
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <0.0000000000000002 ***
## lstat -0.95005 0.03873 -24.53 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 0.00000000000000022
21.5 80-20迴歸建模研究方法論
set.seed(123)
<- caTools::sample.split(Boston, SplitRatio = 0.8) #assigns booleans to a new coloumn based on the split ratio
split split
## [1] TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE
## [13] TRUE TRUE
<- subset(Boston, split == TRUE)
Boston.train <- subset(Boston, split == FALSE)
Boston.test dim(Boston.train)
## [1] 398 14
dim(Boston.test)
## [1] 108 14
21.5.1 training + test
<- cor(x = Boston.train[colnames(Boston)[-14]],
COR y = Boston.train[colnames(Boston)[14]])
COR
## medv
## crim -0.4006247
## zn 0.3676476
## indus -0.5036929
## chas 0.1861409
## nox -0.4404786
## rm 0.7059564
## age -0.3914000
## dis 0.2681954
## rad -0.3936754
## tax -0.4855203
## ptratio -0.5220044
## black 0.3326074
## lstat -0.7465333
<- colnames(Boston)[-14][which.max(abs(COR))]
VAR VAR
## [1] "lstat"
<- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM bostonLM
## medv ~ lstat
summary(lm(bostonLM, data = Boston.train))
##
## Call:
## lm(formula = bostonLM, data = Boston.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.039 -3.874 -1.352 1.754 24.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5049 0.6202 55.64 <0.0000000000000002 ***
## lstat -0.9601 0.0430 -22.33 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.038 on 396 degrees of freedom
## Multiple R-squared: 0.5573, Adjusted R-squared: 0.5562
## F-statistic: 498.5 on 1 and 396 DF, p-value: < 0.00000000000000022
<- lm(bostonLM, data = Boston.train)
model1 <- predict(model1, newdata = Boston.test)
test.pred.model1
<- mean((Boston.test$medv - test.pred.model1)^2)
model1.mpse model1.mpse
## [1] 46.80385