Chapter 5 建模
5.1 划分选来集合测试集合
bins <- scorecard::woebin(cdata_reduced_2,y = 'label')
dt_woe <- scorecard::woebin_ply(cdata_reduced_2,bins)## Woe transformating on 322715 rows and 11 columns in 00:00:12
dt_woe$label <- as.factor(dt_woe$label)
div_part_1 <- createDataPartition(y = dt_woe$label, p = 0.7, list = F)
# Training Sample
train_1 <- dt_woe[div_part_1,] # 70% here
pct(train_1$label)| Count | Percentage | |
|---|---|---|
| 0 | 199708 | 88.41 |
| 1 | 26193 | 11.59 |
# Test Sample
test_1 <- dt_woe[-div_part_1,] # rest of the 30% data goes here
pct(test_1$label)| Count | Percentage | |
|---|---|---|
| 0 | 85589 | 88.41 |
| 1 | 11225 | 11.59 |
5.2 训练模型以及模型选择
5.2.1 逻辑回归以及逐步回归
m1 <- glm(label~.,data=train_1,family=binomial())
m1 <- step(m1)## Start: AIC=141064.5
## label ~ 地址种类_woe + app1_woe + 下单与付款时间间隔_woe +
## cod运费_woe + 修改后金额_woe + 原始来单金额_woe +
## 金额差异_woe + 付款到派送_woe + 发货方式_woe +
## 用户性别_woe + 州_woe
##
##
## Step: AIC=141064.5
## label ~ 地址种类_woe + app1_woe + 下单与付款时间间隔_woe +
## cod运费_woe + 修改后金额_woe + 原始来单金额_woe +
## 付款到派送_woe + 发货方式_woe + 用户性别_woe +
## 州_woe
##
## Df Deviance AIC
## - 修改后金额_woe 1 141042 141062
## <none> 141042 141064
## - 原始来单金额_woe 1 141053 141073
## - cod运费_woe 1 141170 141190
## - 下单与付款时间间隔_woe 1 141309 141329
## - 州_woe 1 141389 141409
## - 发货方式_woe 1 141516 141536
## - 用户性别_woe 1 142610 142630
## - 付款到派送_woe 1 145987 146007
## - app1_woe 1 146280 146300
## - 地址种类_woe 1 146451 146471
##
## Step: AIC=141062.5
## label ~ 地址种类_woe + app1_woe + 下单与付款时间间隔_woe +
## cod运费_woe + 原始来单金额_woe + 付款到派送_woe +
## 发货方式_woe + 用户性别_woe + 州_woe
##
## Df Deviance AIC
## <none> 141042 141062
## - 原始来单金额_woe 1 141123 141141
## - cod运费_woe 1 141172 141190
## - 下单与付款时间间隔_woe 1 141309 141327
## - 州_woe 1 141389 141407
## - 发货方式_woe 1 141516 141534
## - 用户性别_woe 1 142610 142628
## - 付款到派送_woe 1 145988 146006
## - app1_woe 1 146280 146298
## - 地址种类_woe 1 146451 146469
summary(m1)##
## Call:
## glm(formula = label ~ 地址种类_woe + app1_woe + 下单与付款时间间隔_woe +
## cod运费_woe + 原始来单金额_woe + 付款到派送_woe +
## 发货方式_woe + 用户性别_woe + 州_woe, family = binomial(),
## data = train_1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5995 -0.5347 -0.3976 -0.2337 3.8521
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.036266 0.007616 -267.35 <2e-16 ***
## 地址种类_woe 1.011125 0.021069 47.99 <2e-16 ***
## app1_woe 1.017471 0.015451 65.85 <2e-16 ***
## 下单与付款时间间隔_woe 0.952940 0.058721 16.23 <2e-16 ***
## cod运费_woe 0.529066 0.046369 11.41 <2e-16 ***
## 原始来单金额_woe 0.398816 0.044658 8.93 <2e-16 ***
## 付款到派送_woe 1.410965 0.021934 64.33 <2e-16 ***
## 发货方式_woe 0.807609 0.037204 21.71 <2e-16 ***
## 用户性别_woe 0.795311 0.019860 40.05 <2e-16 ***
## 州_woe 0.559377 0.030152 18.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 162095 on 225900 degrees of freedom
## Residual deviance: 141042 on 225891 degrees of freedom
## AIC: 141062
##
## Number of Fisher Scoring iterations: 7
significant.variables <- summary(m1)$coeff[-1,4] < 0.01
names(significant.variables)[significant.variables == TRUE]## [1] "地址种类_woe" "app1_woe" "下单与付款时间间隔_woe"
## [4] "cod运费_woe" "原始来单金额_woe" "付款到派送_woe"
## [7] "发货方式_woe" "用户性别_woe" "州_woe"
dt_pred = predict(m1, type='response', test_1)
perf_eva(test_1$label, dt_pred, type = c("ks","lift","roc","pr"))## Warning: Removed 1 rows containing missing values (geom_path).

## $KS
## [1] 0.337
##
## $AUC
## [1] 0.7436
##
## $Gini
## [1] 0.4872
##
## $pic
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## pks 1 (1-1,1-1) arrange gtable[layout]
## plift 2 (1-1,2-2) arrange gtable[layout]
## proc 3 (2-2,1-1) arrange gtable[layout]
## ppr 4 (2-2,2-2) arrange gtable[layout]
5.2.2 随即森林
m3 <- randomForest(label ~ ., data = train_1)
par(family='STKaiti')
varImpPlot(m3, main="Random Forest: Variable Importance")
dt_pred = predict(m3, type='prob', test_1)[,1]
perf_eva(test_1$label, dt_pred, type = c("ks","lift","roc","pr"))## Warning: Removed 1 rows containing missing values (geom_path).

## $KS
## [1] 0.1644
##
## $AUC
## [1] 0.4067
##
## $Gini
## [1] -0.1866
##
## $pic
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## pks 1 (1-1,1-1) arrange gtable[layout]
## plift 2 (1-1,2-2) arrange gtable[layout]
## proc 3 (2-2,1-1) arrange gtable[layout]
## ppr 4 (2-2,2-2) arrange gtable[layout]
不平衡的数据会造成非常低AUC,需要尝试解决样本不平衡的问题
5.2.3 欠抽样
load('/Users/milin/COD\ 建模/model_rf_under.RData')
load('/Users/milin/COD\ 建模/dt_woe.RData')
require(scorecard)
dt_pred = predict(model_rf_under, type = 'prob', dt_woe)
perf_eva(dt_woe$label, dt_pred$`1`)
## $KS
## [1] 0.3986
##
## $AUC
## [1] 0.7641
##
## $Gini
## [1] 0.5281
##
## $pic
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## pks 1 (1-1,1-1) arrange gtable[layout]
## proc 2 (1-1,2-2) arrange gtable[layout]
5.2.3.1 重抽样
load('/Users/milin/COD\ 建模/model_rf_under1.RData')
dt_pred = predict(model_rf_under, type = 'prob', dt_woe)
perf_eva(dt_woe$label, dt_pred$`1`)
## $KS
## [1] 0.3986
##
## $AUC
## [1] 0.7641
##
## $Gini
## [1] 0.5281
##
## $pic
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## pks 1 (1-1,1-1) arrange gtable[layout]
## proc 2 (1-1,2-2) arrange gtable[layout]