Chapter 21 財務管理課堂數據集之迴歸建模研究專案管理

21.1

21.2 預測研究專案之基本盤

21.2.1 解析問題

21.2.2 準備數據

21.2.3 特徵值工程

21.2.4 檔案管理

21.2.5 建模工程

21.2.6 預測工程

21.2.7 解讀工程

21.3 建模前的預備知識

作者小編在這裡引用波士頓數據集介紹部分建模必要的R。先準備數據:

library(MASS)
data(Boston)
DT::datatable(Boston, 
              options = list(scrollX = TRUE, 
                             fixedColumns = TRUE))

21.3.1 formula

# https://www.datacamp.com/community/tutorials/r-formula-tutorial
xnam <- paste0("x", 1:25)
xnam
##  [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10" "x11" "x12"
## [13] "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20" "x21" "x22" "x23" "x24"
## [25] "x25"
(fmla <- as.formula(paste("y ~ ", paste(xnam, collapse= "+"))))
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + 
##     x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + 
##     x22 + x23 + x24 + x25
class(fmla)
## [1] "formula"
args(as.formula)
## function (object, env = parent.frame()) 
## NULL

21.3.2 lm

# https://stat.ethz.ch/R-manual/R-devel/library/base/html/args.html
args(lm)
## function (formula, data, subset, weights, na.action, method = "qr", 
##     model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
##     contrasts = NULL, offset, ...) 
## NULL

21.3.3 predict

# https://stat.ethz.ch/R-manual/R-devel/library/base/html/args.html
args(predict)
## function (object, ...) 
## NULL

21.3.4 broom

21.4 傳統迴歸建模研究方法論

記住這句話:

模型千千百百種。

假設作者小編想產生以下這一張報表並且找到名列其中的最佳模型。

21.4.1 暴力解

Var <- colnames(Boston)[-14]

model <- character(0)
R2 <- numeric(0)

# 第1個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[1]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[1]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第2個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[2]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[2]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第3個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[3]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[3]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第4個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[4]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[4]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第5個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[5]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[5]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第6個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[6]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[6]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第7個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[7]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[7]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第8個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[8]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[8]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第9個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[9]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[9]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第10個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[10]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[10]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第11個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[11]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[11]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第12個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[12]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[12]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第13個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[13]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[13]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)

models <- data.frame(model = model, 
                     R2 = R2, 
                     stringsAsFactors = FALSE)

DT::datatable(models, options = list(pageLength = 13))

21.4.2 迴圈解

model <- character(0)
R2 <- numeric(0)
for (Var in colnames(Boston)[-14]) {
  bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var))
  #print(summary(lm(bostonLM, data = Boston))$r.squared)
  model <- c(model, paste(colnames(Boston)[14], "~", Var))
  R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
}

models <- data.frame(model = model, 
                     R2 = R2, 
                     stringsAsFactors = FALSE)
DT::datatable(models, options = list(pageLength = 13))
models$model[which.max(models$R2)]
## [1] "medv ~ lstat"
summary(lm(as.formula(models$model[which.max(models$R2)]),data = Boston))
## 
## Call:
## lm(formula = as.formula(models$model[which.max(models$R2)]), 
##     data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022
summary(lm(medv ~ lstat, data = Boston))
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022

21.4.3 相關係數解

cor(x = Boston[colnames(Boston)[-14]], 
    y = Boston[colnames(Boston)[14]])
##               medv
## crim    -0.3883046
## zn       0.3604453
## indus   -0.4837252
## chas     0.1752602
## nox     -0.4273208
## rm       0.6953599
## age     -0.3769546
## dis      0.2499287
## rad     -0.3816262
## tax     -0.4685359
## ptratio -0.5077867
## black    0.3334608
## lstat   -0.7376627
COR <- cor(x = Boston[colnames(Boston)[-14]], 
           y = Boston[colnames(Boston)[14]])
COR
##               medv
## crim    -0.3883046
## zn       0.3604453
## indus   -0.4837252
## chas     0.1752602
## nox     -0.4273208
## rm       0.6953599
## age     -0.3769546
## dis      0.2499287
## rad     -0.3816262
## tax     -0.4685359
## ptratio -0.5077867
## black    0.3334608
## lstat   -0.7376627
COR[which(abs(COR) == max(abs(COR)), arr.ind = TRUE)]
## [1] -0.7376627
which.max(abs(COR))
## [1] 13
colnames(Boston)[-14][which.max(abs(COR))]
## [1] "lstat"
VAR <- colnames(Boston)[-14][which.max(abs(COR))]
VAR
## [1] "lstat"
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM
## medv ~ lstat
summary(lm(bostonLM, data = Boston))
## 
## Call:
## lm(formula = bostonLM, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022
summary(lm(medv ~ lstat, data = Boston))
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022

21.5 80-20迴歸建模研究方法論

set.seed(123)
split <- caTools::sample.split(Boston, SplitRatio = 0.8) #assigns booleans to a new coloumn based on the split ratio
split
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
## [13]  TRUE  TRUE
Boston.train <- subset(Boston, split == TRUE)
Boston.test <- subset(Boston, split == FALSE)
dim(Boston.train)
## [1] 398  14
dim(Boston.test)
## [1] 108  14

21.5.1 training + test

COR <- cor(x = Boston.train[colnames(Boston)[-14]], 
           y = Boston.train[colnames(Boston)[14]])
COR
##               medv
## crim    -0.4006247
## zn       0.3676476
## indus   -0.5036929
## chas     0.1861409
## nox     -0.4404786
## rm       0.7059564
## age     -0.3914000
## dis      0.2681954
## rad     -0.3936754
## tax     -0.4855203
## ptratio -0.5220044
## black    0.3326074
## lstat   -0.7465333
VAR <- colnames(Boston)[-14][which.max(abs(COR))]
VAR
## [1] "lstat"
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM
## medv ~ lstat
summary(lm(bostonLM, data = Boston.train))
## 
## Call:
## lm(formula = bostonLM, data = Boston.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.039  -3.874  -1.352   1.754  24.021 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  34.5049     0.6202   55.64 <0.0000000000000002 ***
## lstat        -0.9601     0.0430  -22.33 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.038 on 396 degrees of freedom
## Multiple R-squared:  0.5573, Adjusted R-squared:  0.5562 
## F-statistic: 498.5 on 1 and 396 DF,  p-value: < 0.00000000000000022
model1 <- lm(bostonLM, data = Boston.train)
test.pred.model1 <- predict(model1, newdata = Boston.test)

model1.mpse <- mean((Boston.test$medv - test.pred.model1)^2)
model1.mpse
## [1] 46.80385

21.5.2 training + validation + test