Chapter 21 財務管理課堂數據集之迴歸建模研究專案管理

21.1 旭

21.2 預測研究專案之基本盤

21.2.1 解析問題

21.2.2 準備數據

21.2.3 特徵值工程

21.2.4 檔案管理

21.2.5 建模工程

21.2.6 預測工程

21.2.7 解讀工程

21.3 建模前的預備知識

作者小編在這裡引用波士頓數據集介紹部分建模必要的R。先準備數據：

library(MASS)
data(Boston)
DT::datatable(Boston, 
              options = list(scrollX = TRUE, 
                             fixedColumns = TRUE))

Show entries

Search:

	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
1	0.00632	18	2.31	0.538	6.575	65.2	4.09	1	296	15.3	396.9	4.98	24
2	0.02731	0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.9	9.14	21.6
3	0.02729	0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
4	0.03237	0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
5	0.06905	0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.9	5.33	36.2
6	0.02985	0	2.18	0.458	6.43	58.7	6.0622	3	222	18.7	394.12	5.21	28.7
7	0.08829	12.5	7.87	0.524	6.012	66.6	5.5605	5	311	15.2	395.6	12.43	22.9
8	0.14455	12.5	7.87	0.524	6.172	96.1	5.9505	5	311	15.2	396.9	19.15	27.1
9	0.21124	12.5	7.87	0.524	5.631	100	6.0821	5	311	15.2	386.63	29.93	16.5
10	0.17004	12.5	7.87	0.524	6.004	85.9	6.5921	5	311	15.2	386.71	17.1	18.9

Showing 1 to 10 of 506 entries

Previous1 2 3 4 5…51Next

21.3.1 `formula`

# https://www.datacamp.com/community/tutorials/r-formula-tutorial
xnam <- paste0("x", 1:25)
xnam

##  [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10" "x11" "x12"
## [13] "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20" "x21" "x22" "x23" "x24"
## [25] "x25"

(fmla <- as.formula(paste("y ~ ", paste(xnam, collapse= "+"))))

## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + 
##     x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + 
##     x22 + x23 + x24 + x25

class(fmla)

## [1] "formula"

args(as.formula)

## function (object, env = parent.frame()) 
## NULL

21.3.2 `lm`

# https://stat.ethz.ch/R-manual/R-devel/library/base/html/args.html
args(lm)

## function (formula, data, subset, weights, na.action, method = "qr", 
##     model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
##     contrasts = NULL, offset, ...) 
## NULL

21.3.3 `predict`

# https://stat.ethz.ch/R-manual/R-devel/library/base/html/args.html
args(predict)

## function (object, ...) 
## NULL

21.3.4 `broom`

21.4 傳統迴歸建模研究方法論

記住這句話：

模型千千百百種。

假設作者小編想產生以下這一張報表並且找到名列其中的最佳模型。

Show entries

Search:

	model	R2
1	medv ~ crim	0.150780469049757
2	medv ~ zn	0.129920844894289
3	medv ~ indus	0.233990030444475
4	medv ~ chas	0.0307161297087748
5	medv ~ nox	0.182603042501699
6	medv ~ rm	0.483525455991334
7	medv ~ age	0.142094744077804
8	medv ~ dis	0.0624643721217825
9	medv ~ rad	0.145638579912325
10	medv ~ tax	0.219525921044219
11	medv ~ ptratio	0.257847318009224
12	medv ~ black	0.111196118246363
13	medv ~ lstat	0.54414629758648

Showing 1 to 13 of 13 entries

Previous1Next

21.4.1 暴力解

Var <- colnames(Boston)[-14]

model <- character(0)
R2 <- numeric(0)

# 第1個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[1]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[1]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第2個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[2]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[2]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第3個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[3]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[3]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第4個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[4]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[4]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第5個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[5]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[5]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第6個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[6]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[6]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第7個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[7]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[7]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第8個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[8]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[8]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第9個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[9]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[9]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第10個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[10]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[10]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第11個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[11]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[11]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第12個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[12]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[12]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
# 第13個解釋變數
bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var[13]))
model <- c(model, paste(colnames(Boston)[14], "~", Var[13]))
R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)

models <- data.frame(model = model, 
                     R2 = R2, 
                     stringsAsFactors = FALSE)

DT::datatable(models, options = list(pageLength = 13))

Show entries

Search:

	model	R2
1	medv ~ crim	0.150780469049757
2	medv ~ zn	0.129920844894289
3	medv ~ indus	0.233990030444475
4	medv ~ chas	0.0307161297087748
5	medv ~ nox	0.182603042501699
6	medv ~ rm	0.483525455991334
7	medv ~ age	0.142094744077804
8	medv ~ dis	0.0624643721217825
9	medv ~ rad	0.145638579912325
10	medv ~ tax	0.219525921044219
11	medv ~ ptratio	0.257847318009224
12	medv ~ black	0.111196118246363
13	medv ~ lstat	0.54414629758648

Showing 1 to 13 of 13 entries

Previous1Next

21.4.2 迴圈解

model <- character(0)
R2 <- numeric(0)
for (Var in colnames(Boston)[-14]) {
  bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var))
  #print(summary(lm(bostonLM, data = Boston))$r.squared)
  model <- c(model, paste(colnames(Boston)[14], "~", Var))
  R2 <- c(R2, summary(lm(bostonLM, data = Boston))$r.squared)
}

models <- data.frame(model = model, 
                     R2 = R2, 
                     stringsAsFactors = FALSE)
DT::datatable(models, options = list(pageLength = 13))

Show entries

Search:

	model	R2
1	medv ~ crim	0.150780469049757
2	medv ~ zn	0.129920844894289
3	medv ~ indus	0.233990030444475
4	medv ~ chas	0.0307161297087748
5	medv ~ nox	0.182603042501699
6	medv ~ rm	0.483525455991334
7	medv ~ age	0.142094744077804
8	medv ~ dis	0.0624643721217825
9	medv ~ rad	0.145638579912325
10	medv ~ tax	0.219525921044219
11	medv ~ ptratio	0.257847318009224
12	medv ~ black	0.111196118246363
13	medv ~ lstat	0.54414629758648

Showing 1 to 13 of 13 entries

Previous1Next

models$model[which.max(models$R2)]

## [1] "medv ~ lstat"

summary(lm(as.formula(models$model[which.max(models$R2)]),data = Boston))

## 
## Call:
## lm(formula = as.formula(models$model[which.max(models$R2)]), 
##     data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022

summary(lm(medv ~ lstat, data = Boston))

## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022

21.4.3 相關係數解

cor(x = Boston[colnames(Boston)[-14]], 
    y = Boston[colnames(Boston)[14]])

##               medv
## crim    -0.3883046
## zn       0.3604453
## indus   -0.4837252
## chas     0.1752602
## nox     -0.4273208
## rm       0.6953599
## age     -0.3769546
## dis      0.2499287
## rad     -0.3816262
## tax     -0.4685359
## ptratio -0.5077867
## black    0.3334608
## lstat   -0.7376627

COR <- cor(x = Boston[colnames(Boston)[-14]], 
           y = Boston[colnames(Boston)[14]])
COR

##               medv
## crim    -0.3883046
## zn       0.3604453
## indus   -0.4837252
## chas     0.1752602
## nox     -0.4273208
## rm       0.6953599
## age     -0.3769546
## dis      0.2499287
## rad     -0.3816262
## tax     -0.4685359
## ptratio -0.5077867
## black    0.3334608
## lstat   -0.7376627

COR[which(abs(COR) == max(abs(COR)), arr.ind = TRUE)]

## [1] -0.7376627

which.max(abs(COR))

## [1] 13

colnames(Boston)[-14][which.max(abs(COR))]

## [1] "lstat"

VAR <- colnames(Boston)[-14][which.max(abs(COR))]
VAR

## [1] "lstat"

bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM

## medv ~ lstat

summary(lm(bostonLM, data = Boston))

## 
## Call:
## lm(formula = bostonLM, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022

summary(lm(medv ~ lstat, data = Boston))

## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41 <0.0000000000000002 ***
## lstat       -0.95005    0.03873  -24.53 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 0.00000000000000022

21.5 80-20迴歸建模研究方法論

set.seed(123)
split <- caTools::sample.split(Boston, SplitRatio = 0.8) #assigns booleans to a new coloumn based on the split ratio
split

##  [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
## [13]  TRUE  TRUE

Boston.train <- subset(Boston, split == TRUE)
Boston.test <- subset(Boston, split == FALSE)
dim(Boston.train)

## [1] 398  14

dim(Boston.test)

## [1] 108  14

21.5.1 training + test

COR <- cor(x = Boston.train[colnames(Boston)[-14]], 
           y = Boston.train[colnames(Boston)[14]])
COR

##               medv
## crim    -0.4006247
## zn       0.3676476
## indus   -0.5036929
## chas     0.1861409
## nox     -0.4404786
## rm       0.7059564
## age     -0.3914000
## dis      0.2681954
## rad     -0.3936754
## tax     -0.4855203
## ptratio -0.5220044
## black    0.3326074
## lstat   -0.7465333

VAR <- colnames(Boston)[-14][which.max(abs(COR))]
VAR

## [1] "lstat"

bostonLM <- as.formula(paste(colnames(Boston)[14], "~", Var))
bostonLM

## medv ~ lstat

summary(lm(bostonLM, data = Boston.train))

## 
## Call:
## lm(formula = bostonLM, data = Boston.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.039  -3.874  -1.352   1.754  24.021 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  34.5049     0.6202   55.64 <0.0000000000000002 ***
## lstat        -0.9601     0.0430  -22.33 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.038 on 396 degrees of freedom
## Multiple R-squared:  0.5573, Adjusted R-squared:  0.5562 
## F-statistic: 498.5 on 1 and 396 DF,  p-value: < 0.00000000000000022

model1 <- lm(bostonLM, data = Boston.train)
test.pred.model1 <- predict(model1, newdata = Boston.test)

model1.mpse <- mean((Boston.test$medv - test.pred.model1)^2)
model1.mpse

## [1] 46.80385

Chapter 21 財務管理課堂數據集之迴歸建模研究專案管理

21.1 旭

21.2 預測研究專案之基本盤

21.2.1 解析問題

21.2.2 準備數據

21.2.3 特徵值工程

21.2.4 檔案管理

21.2.5 建模工程

21.2.6 預測工程

21.2.7 解讀工程

21.3 建模前的預備知識

21.3.1 formula

21.3.2 lm

21.3.3 predict

21.3.4 broom

21.4 傳統迴歸建模研究方法論

21.4.1 暴力解

21.4.2 迴圈解

21.4.3 相關係數解

21.5 80-20迴歸建模研究方法論

21.5.1 training + test

21.5.2 training + validation + test

21.3.1 `formula`

21.3.2 `lm`

21.3.3 `predict`

21.3.4 `broom`