Chapter 22 背景表現預測學期總成績
22.2 「背景表現預測學期總成績」之專案管理
22.2.1 解析問題
### 期初調查
library(readxl) # 有請套件「readxl」幫忙!
<- read_excel("data/Q1_insurance_2014-09-24.xlsx")
Q1
::datatable(Q1[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
到底問了什麼呢?
光看「欄位名稱」,絕對無法得知,當年這一位財務管理授課師長到底問了什麼?為此,當年作者小編跟研究生第一次研究這一組數據集的時候,邊寫過一份所謂的「編碼簿」,紀錄著每一個欄位相關的資訊。作者小編抓出「期初調查」來,
22.2.2 準備數據
### 學期總成績
<- read_excel("data/grade_insurance_2015-01-07.xlsx")
grade
<- data.frame(ID = Q1$ID,
backDF calculus = Q1$Q1V13,
economics = Q1$Q1V14,
accounting = Q1$Q1V15,
grade = grade$semester_grade,
stringsAsFactors = FALSE)
::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
執行以下幾個簡單的檢查動作:
class(backDF)
## [1] "data.frame"
sapply(backDF, class)
## ID calculus economics accounting grade
## "character" "character" "character" "character" "numeric"
dim(backDF)
## [1] 85 5
colnames(backDF)
## [1] "ID" "calculus" "economics" "accounting" "grade"
sapply(backDF[,-1], unique)
## $calculus
## [1] "75" "85" "65" "45" "NA" "90" "55" "19" "35"
##
## $economics
## [1] "85" "75" "65" "45" "NA" "0" "55" "90"
##
## $accounting
## [1] "75" "65" "85" "55" "NA" "90" "45" "0" "19"
##
## $grade
## [1] 86 78 77 85 83 81 76 69 79 73 87 56 80 82 71 75 72 70 65 68 64 63 61 84 67
## [26] 57 66 93 60 74 53
觀察過後,看到了「好朋友遺漏值」,所以作者小編啟動「清洗工程」
<- as.data.frame(backDF, stringsAsFactors = FALSE)
backDF which(backDF == "NA", arr.ind = TRUE)] <- NA
backDF[
sapply(backDF[,-1], unique)
## $calculus
## [1] "75" "85" "65" "45" NA "90" "55" "19" "35"
##
## $economics
## [1] "85" "75" "65" "45" NA "0" "55" "90"
##
## $accounting
## [1] "75" "65" "85" "55" NA "90" "45" "0" "19"
##
## $grade
## [1] 86 78 77 85 83 81 76 69 79 73 87 56 80 82 71 75 72 70 65 68 64 63 61 84 67
## [26] 57 66 93 60 74 53
<- backDF[complete.cases(backDF),]
backDF
$calculus <- as.numeric(backDF$calculus)
backDF$economics <- as.numeric(backDF$economics)
backDF$accounting <- as.numeric(backDF$accounting)
backDF
sapply(backDF[,-1], unique)
## $calculus
## [1] 75 85 65 45 90 55 19 35
##
## $economics
## [1] 85 75 65 45 0 55 90
##
## $accounting
## [1] 75 65 85 55 90 45 0 19
##
## $grade
## [1] 86 78 77 85 83 81 76 79 73 87 56 80 82 71 75 72 70 65 68 64 63 61 84 67 57
## [26] 66 93 60 74 53 69
::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
確實清乾淨了。但是
紀錄裡有「0分」,通常表示這一位同學「未完成」,所以作者小編決定再清一次數據。
which(backDF == 0, arr.ind = TRUE)] <- NA
backDF[<- backDF[complete.cases(backDF),]
backDF ::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.2.3 特徵值工程
<- prcomp(backDF[,-c(1,5)])
PCAfm summary(PCAfm)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 20.8699 11.1213 7.16964
## Proportion of Variance 0.7133 0.2026 0.08418
## Cumulative Proportion 0.7133 0.9158 1.00000
names(summary(PCAfm))
## [1] "sdev" "rotation" "center" "scale" "x"
## [6] "importance"
::datatable(summary(PCAfm)$x,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
$PC1 <- summary(PCAfm)$x[,"PC1"]
backDF$PC2 <- summary(PCAfm)$x[,"PC2"]
backDF::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.2.4 檔案管理
set.seed(123)
<- caTools::sample.split(backDF, SplitRatio = 0.8)
split <- subset(backDF, split == TRUE)
backDF.train <- subset(backDF, split == FALSE)
backDF.test
::datatable(backDF.train[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
::datatable(backDF.test[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.2.5 建模工程
<- lm(grade ~ calculus + economics + accounting, data = backDF.train)
modelback summary(modelback)
##
## Call:
## lm(formula = grade ~ calculus + economics + accounting, data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.854 -3.931 0.409 4.229 12.860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.73304 6.17679 8.052 0.0000000000814 ***
## calculus 0.15239 0.05627 2.708 0.00904 **
## economics 0.08976 0.11757 0.763 0.44854
## accounting 0.11222 0.08185 1.371 0.17601
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.613 on 54 degrees of freedom
## Multiple R-squared: 0.3222, Adjusted R-squared: 0.2845
## F-statistic: 8.555 on 3 and 54 DF, p-value: 0.00009636
<- lm(grade ~ PC1 + PC2, data = backDF.train)
modelbackPC summary(modelbackPC)
##
## Call:
## lm(formula = grade ~ PC1 + PC2, data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8530 -3.9317 0.4088 4.2293 12.8620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.79078 0.86255 85.550 < 0.0000000000000002 ***
## PC1 -0.20723 0.04065 -5.098 0.00000436 ***
## PC2 0.03036 0.07305 0.416 0.679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.553 on 55 degrees of freedom
## Multiple R-squared: 0.3222, Adjusted R-squared: 0.2975
## F-statistic: 13.07 on 2 and 55 DF, p-value: 0.00002269
<- lm(grade ~ calculus * economics * accounting, data = backDF.train)
modelbackInter summary(modelbackInter)
##
## Call:
## lm(formula = grade ~ calculus * economics * accounting, data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.2078 -3.4393 0.9152 3.9658 12.8765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 160.1956109 89.8020521 1.784 0.0805 .
## calculus -1.9250889 1.4678097 -1.312 0.1957
## economics -1.4326375 1.4734422 -0.972 0.3356
## accounting -1.5794857 1.4464885 -1.092 0.2801
## calculus:economics 0.0284435 0.0226035 1.258 0.2141
## calculus:accounting 0.0305456 0.0222524 1.373 0.1760
## economics:accounting 0.0230175 0.0226259 1.017 0.3139
## calculus:economics:accounting -0.0004133 0.0003308 -1.249 0.2173
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.715 on 50 degrees of freedom
## Multiple R-squared: 0.353, Adjusted R-squared: 0.2624
## F-statistic: 3.896 on 7 and 50 DF, p-value: 0.001847
<- lm(grade ~ PC1 * PC2, data = backDF.train)
modelbackPCInter summary(modelbackPCInter)
##
## Call:
## lm(formula = grade ~ PC1 * PC2, data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8346 -4.0052 0.4275 4.1558 12.8655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.7920444 0.8709251 84.728 < 0.0000000000000002 ***
## PC1 -0.2075697 0.0416938 -4.978 0.00000692 ***
## PC2 0.0282430 0.0873281 0.323 0.748
## PC1:PC2 0.0001583 0.0034931 0.045 0.964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.613 on 54 degrees of freedom
## Multiple R-squared: 0.3222, Adjusted R-squared: 0.2845
## F-statistic: 8.556 on 3 and 54 DF, p-value: 0.00009626
22.2.6 預測工程
<- predict(modelback, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 49.83167
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
<- predict(modelbackPC, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 49.8403
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
<- predict(modelbackInter, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 51.58819
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
<- predict(modelbackPCInter, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 49.85057
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.2.7 解讀工程
22.3 「背景表現預測學期總成績」之「男女有別」
22.3.1 解析問題
22.3.2 準備數據
::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.3.2.1 如何把「男女有別」的性別欄位加進來?
<- Q1$Q1V1[which(Q1$ID %in% backDF$ID)]
sex sex
## [1] 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1
## [39] 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0
## [77] 1 1 0 1
$sex <- sex
backDFsapply(backDF, class)
## ID calculus economics accounting grade PC1
## "character" "numeric" "numeric" "numeric" "numeric" "numeric"
## PC2 sex
## "numeric" "numeric"
$sex <- factor(sex)
backDFsapply(backDF, class)
## ID calculus economics accounting grade PC1
## "character" "numeric" "numeric" "numeric" "numeric" "numeric"
## PC2 sex
## "numeric" "factor"
::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.3.3 特徵值工程
library(fastDummies)
<- dummy_cols(backDF, select_columns = "sex")
backDF
sapply(backDF, class)
## ID calculus economics accounting grade PC1
## "character" "numeric" "numeric" "numeric" "numeric" "numeric"
## PC2 sex sex_0 sex_1
## "numeric" "factor" "integer" "integer"
::datatable(backDF[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
作者小編建議讀者諸君一定要看這一份救命符:
22.3.4 檔案管理
set.seed(123)
<- caTools::sample.split(backDF, SplitRatio = 0.8)
split <- subset(backDF, split == TRUE)
backDF.train <- subset(backDF, split == FALSE)
backDF.test
::datatable(backDF.train[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
::datatable(backDF.test[,-1],
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
22.3.5 建模工程
<- lm(grade ~ calculus + economics + accounting + sex, data = backDF.train)
modelback summary(modelback)
##
## Call:
## lm(formula = grade ~ calculus + economics + accounting + sex,
## data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.5548 -2.9334 0.8704 3.7161 12.6614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.64970 6.04411 6.560 0.000000015 ***
## calculus 0.22162 0.05376 4.122 0.000119 ***
## economics 0.12858 0.09895 1.299 0.198854
## accounting 0.14577 0.06926 2.105 0.039587 *
## sex1 1.38086 1.61447 0.855 0.395843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.961 on 59 degrees of freedom
## Multiple R-squared: 0.4906, Adjusted R-squared: 0.4561
## F-statistic: 14.21 on 4 and 59 DF, p-value: 0.00000003522
<- lm(grade ~ PC1 + PC2 + sex, data = backDF.train)
modelbackPC summary(modelbackPC)
##
## Call:
## lm(formula = grade ~ PC1 + PC2 + sex, data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.5066 -2.9013 0.8326 3.6652 12.7229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.27670 1.04320 70.242 < 0.0000000000000002 ***
## PC1 -0.29304 0.03953 -7.414 0.000000000492 ***
## PC2 0.02953 0.06688 0.442 0.660
## sex1 1.36930 1.59089 0.861 0.393
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.912 on 60 degrees of freedom
## Multiple R-squared: 0.4906, Adjusted R-squared: 0.4651
## F-statistic: 19.26 on 3 and 60 DF, p-value: 0.000000007252
<- lm(grade ~ calculus * economics * accounting + sex, data = backDF.train)
modelbackInter summary(modelbackInter)
##
## Call:
## lm(formula = grade ~ calculus * economics * accounting + sex,
## data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.407 -3.428 1.440 3.262 12.587
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.72382163 80.60570535 0.580 0.565
## calculus -0.30913293 1.20828900 -0.256 0.799
## economics 0.50321152 1.39449427 0.361 0.720
## accounting 0.16291712 1.26849792 0.128 0.898
## sex1 1.51620193 1.64248469 0.923 0.360
## calculus:economics 0.00023204 0.01930431 0.012 0.990
## calculus:accounting 0.00530191 0.01831578 0.289 0.773
## economics:accounting -0.00720555 0.02073069 -0.348 0.729
## calculus:economics:accounting 0.00003006 0.00027985 0.107 0.915
##
## Residual standard error: 5.942 on 55 degrees of freedom
## Multiple R-squared: 0.5283, Adjusted R-squared: 0.4597
## F-statistic: 7.699 on 8 and 55 DF, p-value: 0.0000007366
<- lm(grade ~ PC1 * PC2 + sex, data = backDF.train)
modelbackPCInter summary(modelbackPCInter)
##
## Call:
## lm(formula = grade ~ PC1 * PC2 + sex, data = backDF.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.5005 -2.8763 0.6485 3.9965 12.6687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.214436 1.063544 68.840 < 0.0000000000000002 ***
## PC1 -0.292771 0.039819 -7.353 0.000000000684 ***
## PC2 0.044563 0.078210 0.570 0.571
## sex1 1.472131 1.625264 0.906 0.369
## PC1:PC2 -0.001225 0.003237 -0.378 0.707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.954 on 59 degrees of freedom
## Multiple R-squared: 0.4918, Adjusted R-squared: 0.4574
## F-statistic: 14.28 on 4 and 59 DF, p-value: 0.00000003293
22.3.6 預測工程
<- predict(modelback, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 85.06121
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
<- predict(modelbackPC, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 85.22221
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
<- predict(modelbackInter, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 91.49328
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))
<- predict(modelbackPCInter, newdata = backDF.test)
test.pred.model <- data.frame(ID = names(test.pred.model),
PREDgrade grade = backDF.test$grade,
pred = test.pred.model,
pse = abs(backDF.test$grade - test.pred.model))
<- mean((backDF.test$grade - test.pred.model)^2)
model.mpse model.mpse
## [1] 85.89551
::datatable(PREDgrade,
DToptions = list(scrollX = TRUE,
fixedColumns = TRUE))