6 简单的机器学习
代码提供:
PartI 徐乐瑶 黄舜尧 黄恺潼
PartII 曾子尧
主要内容:
PartI -1.决策树模型 -2.随机森林模型
PartII 应用:根据FIFA数据预测足球运动员身价
6.1 决策树模型
预测鸢尾花品种对其植株特征影响的决策树模型分析
#读取环境
library(rpart)
library(rpart.plot)
library(readr)
library(ggplot2)
#iris
data("iris")
attach(iris)
#data split 数据划分
= sample(c(1:150),120,replace = F)
s = iris[s,]
trainset = iris[-s,]
testset
#train 模型构建
= rpart(Species ~ .,data = trainset)
fit1 summary(fit1)
## Call:
## rpart(formula = Species ~ ., data = trainset)
## n= 120
##
## CP nsplit rel error xerror xstd
## 1 0.5000000 0 1.00000000 1.00000000 0.06945906
## 2 0.4605263 1 0.50000000 0.47368421 0.06605211
## 3 0.0100000 2 0.03947368 0.06578947 0.02880247
##
## Variable importance
## Petal.Width Petal.Length Sepal.Length Sepal.Width
## 35 32 20 14
##
## Node number 1: 120 observations, complexity param=0.5
## predicted class=versicolor expected loss=0.6333333 P(node) =1
## class counts: 38 44 38
## probabilities: 0.317 0.367 0.317
## left son=2 (38 obs) right son=3 (82 obs)
## Primary splits:
## Petal.Length < 2.45 to the left, improve=39.01951, (0 missing)
## Petal.Width < 0.8 to the left, improve=39.01951, (0 missing)
## Sepal.Length < 5.45 to the left, improve=25.57920, (0 missing)
## Sepal.Width < 3.35 to the right, improve=17.24325, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.000, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.917, adj=0.737, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.858, adj=0.553, (0 split)
##
## Node number 2: 38 observations
## predicted class=setosa expected loss=0 P(node) =0.3166667
## class counts: 38 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 82 observations, complexity param=0.4605263
## predicted class=versicolor expected loss=0.4634146 P(node) =0.6833333
## class counts: 0 44 38
## probabilities: 0.000 0.537 0.463
## left son=6 (45 obs) right son=7 (37 obs)
## Primary splits:
## Petal.Width < 1.75 to the left, improve=35.012320, (0 missing)
## Petal.Length < 4.75 to the left, improve=31.943280, (0 missing)
## Sepal.Length < 6.15 to the left, improve=11.990980, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 3.287154, (0 missing)
## Surrogate splits:
## Petal.Length < 4.75 to the left, agree=0.927, adj=0.838, (0 split)
## Sepal.Length < 6.15 to the left, agree=0.732, adj=0.405, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.646, adj=0.216, (0 split)
##
## Node number 6: 45 observations
## predicted class=versicolor expected loss=0.04444444 P(node) =0.375
## class counts: 0 43 2
## probabilities: 0.000 0.956 0.044
##
## Node number 7: 37 observations
## predicted class=virginica expected loss=0.02702703 P(node) =0.3083333
## class counts: 0 1 36
## probabilities: 0.000 0.027 0.973
rpart.plot(fit1,type = 2)
#predict
= predict(fit1,testset)
pre
library(rpart)
<-rpart(Species~.,data = iris)
m
plot(m,compress = T,margin = 0.2)
text(m,cex=1.5)
library(rpart.plot)
prp(m,type=4,extra=2,digits=3)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
<-ctree(Species~.,data=iris)
m
plot(m)
levels(iris$Species)
## [1] "setosa" "versicolor" "virginica"
6.2 随机森林模型
预测华沙的公寓价格的随机森林模型分析
##读取环境
library("DALEX")
## Welcome to DALEX (version: 2.3.0).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library("ggplot2")
library("rms")
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'rms'
## The following object is masked from 'package:modeltools':
##
## Predict
##读取数据
<- apartments
apartments head(apartments)
## m2.price construction.year surface floor no.rooms district
## 1 5897 1953 25 3 1 Srodmiescie
## 2 1818 1992 143 9 5 Bielany
## 3 3643 1937 56 1 2 Praga
## 4 3517 1995 93 7 3 Ochota
## 5 3013 1992 144 6 5 Mokotow
## 6 5795 1926 61 6 2 Srodmiescie
##使用并解释随机森林模型
set.seed(72)
<- randomForest(m2.price ~ ., data = apartments)
apartments_rf <- DALEX::explain(model = apartments_rf,
explainer_apartments_rf data = apartments_test[,-1],
y = apartments_test$m2.price,
label = "Random Forest")
## Preparation of a new explainer is initiated
## -> model label : Random Forest
## -> data : 9000 rows 5 cols
## -> target variable : 9000 values
## -> predict function : yhat.randomForest will be used ( [33m default [39m )
## -> predicted values : No value for predict function target column. ( [33m default [39m )
## -> model_info : package randomForest , ver. 4.6.14 , task regression ( [33m default [39m )
## -> predicted values : numerical, min = 1985.837 , mean = 3506.107 , max = 5788.052
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -762.3422 , mean = 5.416971 , max = 1318.093
## [32m A new explainer has been created! [39m
## 模型可视化
<- model_performance(explainer_apartments_rf )
model_performance_ranger_apartments model_performance_ranger_apartments
## Measures for: regression
## mse : 80061.77
## rmse : 282.9519
## r2 : 0.9012564
## mad : 169.0738
##
## Residuals:
## 0% 10% 20% 30% 40% 50%
## -762.342192 -289.095065 -215.594108 -156.778789 -109.479193 -57.127519
## 60% 70% 80% 90% 100%
## 5.668714 87.171569 200.940812 402.184651 1318.092741
plot(model_performance_ranger_apartments)
## 变量重要性度量
<- model_parts(explainer_apartments_rf)
vi_rf plot(vi_rf)
##使用表面和施工年份两个重要变量和 PD 配置文件来评估模型性能
<- model_profile(explainer = explainer_apartments_rf, variables = "surface")
pdp_surface_rf <- model_profile(explainer = explainer_apartments_rf, variables = "construction.year")
pdp_constrction.year_rf plot(pdp_surface_rf)
plot(pdp_constrction.year_rf)
##预测房价
predict(apartments_rf, apartments_test[1:72,])
## 1001 1002 1003 1004 1005 1006 1007 1008
## 4214.084 3178.061 2695.787 2744.775 2951.069 2999.450 3325.692 2662.586
## 1009 1010 1011 1012 1013 1014 1015 1016
## 2827.747 4231.489 3198.826 2737.938 4375.699 3109.098 3938.727 3184.121
## 1017 1018 1019 1020 1021 1022 1023 1024
## 4307.783 4069.748 4452.784 2286.641 3297.630 3672.018 4332.968 2830.935
## 1025 1026 1027 1028 1029 1030 1031 1032
## 3382.471 4059.084 2736.103 2986.361 3762.799 4135.743 2533.184 3706.525
## 1033 1034 1035 1036 1037 1038 1039 1040
## 5117.775 3719.939 3945.399 4439.944 3564.239 2337.870 3035.085 3705.500
## 1041 1042 1043 1044 1045 1046 1047 1048
## 4470.851 3219.128 2809.955 2688.750 4847.457 3657.901 4405.674 4157.527
## 1049 1050 1051 1052 1053 1054 1055 1056
## 3650.222 3188.809 2780.834 2519.497 3230.555 2637.206 3169.776 3448.061
## 1057 1058 1059 1060 1061 1062 1063 1064
## 3076.687 3642.569 3544.335 2402.510 3278.932 4280.645 3941.493 3231.770
## 1065 1066 1067 1068 1069 1070 1071 1072
## 3430.896 3548.905 3888.528 4020.859 3991.719 2182.934 2213.622 3075.885
##模型的预测值与观察值间的关系
<- model_diagnostics(explainer_apartments_rf)
md_rf plot(md_rf, variable = "y", yvariable = "y_hat") +
scale_x_continuous("Ture value of m2.price") +
scale_y_continuous("Predicted m2.price") +
geom_abline(colour = "red", intercept = 0, slope = 1) +
ggtitle("Predicted and observed house prices", "")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
##总结预测性能
<- predict(apartments_rf, apartments_test)
predicted_apartments_rf sqrt(mean((predicted_apartments_rf - apartments_test$m2.price)^2))
## [1] 282.9519
6.3 预测身价
本页面为Rmarkdown用法和DALEX包用法的一个示例。本页面使用DALEX包,基于FIFA20数据,依据GBM deep算法,预测球员身价。
## 读取环境
library("DALEX")
library(gbm)
## Loaded gbm 2.1.8
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
#读取、处理数据
<- fifa
fifa $LogValue <- log10(fifa$value_eur)
fifa<- fifa[,-c(1, 2, 3, 4, 5)]
fifa_small
##建立预测模型
<- gbm(LogValue~., data = fifa_small, n.trees = 250,
fifa_gbm_deep interaction.depth = 4, distribution = "gaussian")
<- DALEX::explain(fifa_gbm_deep,
fifa_gbm_exp_deep data = fifa_small, y = 10^fifa_small$LogValue,
predict_function = function(m,x) 10^predict(m, x, n.trees = 250),
label = "GBM deep")
## Preparation of a new explainer is initiated
## -> model label : GBM deep
## -> data : 5000 rows 38 cols
## -> target variable : 5000 values
## -> predict function : function(m, x) 10^predict(m, x, n.trees = 250)
## -> predicted values : No value for predict function target column. ( [33m default [39m )
## -> model_info : package gbm , ver. 2.1.8 , task regression ( [33m default [39m )
## -> predicted values : numerical, min = 244198.5 , mean = 7329481 , max = 104789967
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -22474906 , mean = 143805.9 , max = 22083559
## [32m A new explainer has been created! [39m
##查看模型
model_performance(fifa_gbm_exp_deep)
## Measures for: regression
## mse : 4.116765e+12
## rmse : 2028981
## r2 : 0.9476419
## mad : 588852.9
##
## Residuals:
## 0% 10% 20% 30% 40% 50%
## -22474906.5 -1279068.3 -661008.8 -364965.6 -157909.1 24781.0
## 60% 70% 80% 90% 100%
## 233663.9 505202.7 913586.5 1652537.1 22083559.0
##模型可视化
<- model_diagnostics(fifa_gbm_exp_deep)
fifa_md_gbm_deep plot(fifa_md_gbm_deep,
variable = "y", yvariable = "y_hat") +
scale_x_continuous("Value in Euro", trans = "log10",
labels = dollar_format(suffix = "€", prefix = "")) +
scale_y_continuous("Predicted value in Euro", trans = "log10",
labels = dollar_format(suffix = "€", prefix = "")) +
geom_abline(slope = 1) +
ggtitle("Predicted and observed players' values", "")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## 球员各数据重要度展示
<- model_parts(fifa_gbm_exp_deep)
fifa_mp_gbm_deep plot(fifa_mp_gbm_deep, max_vars = 20,
bar_width = 4, show_boxplots = FALSE)
##球员 反应、控球、终结能力、年龄对身价影响
<- c("movement_reactions", "skill_ball_control", "attacking_finishing", "age")
selected_variables <- model_profile(fifa_gbm_exp_deep,
fifa19_pd_deep variables = selected_variables)
plot(fifa19_pd_deep)
#预测梅西身价
<- predict_parts(fifa_gbm_exp_deep,
fifa_bd_gbm_M new_observation = fifa["L. Messi",],
type = "break_down")
plot(fifa_bd_gbm_M) +
scale_y_continuous("Predicted value in Euro",
labels = dollar_format(suffix = "€", prefix = "")) +
ggtitle("Break-down plot for L.Messi","")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
<- predict_parts(fifa_gbm_exp_deep,
fifa_shap_gbm_M new_observation = fifa["L. Messi",],
type = "shap")
plot(fifa_shap_gbm_M, show_boxplots = FALSE) +
scale_y_continuous("Estimated value in Euro",
labels = dollar_format(suffix = "€", prefix = "")) +
ggtitle("Shapley values for L.Messi","")
#预测姆巴佩身价
<- predict_parts(fifa_gbm_exp_deep,
fifa_bd_gbm_MBP new_observation = fifa["K. Mbappe",],
type = "break_down")
plot(fifa_bd_gbm_MBP) +
scale_y_continuous("Predicted value in Euro",
labels = dollar_format(suffix = "€", prefix = "")) +
ggtitle("Break-down plot for MBP","")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
<- predict_parts(fifa_gbm_exp_deep,
fifa_shap_gbm_MBP new_observation = fifa["K. Mbappe",],
type = "shap")
plot(fifa_shap_gbm_MBP, show_boxplots = FALSE) +
scale_y_continuous("Estimated value in Euro",
labels = dollar_format(suffix = "€", prefix = "")) +
ggtitle("Shapley values for MBP","")
#预测内马尔
<- predict_parts(fifa_gbm_exp_deep,
fifa_bd_gbm_NM new_observation = fifa["Neymar Jr",],
type = "break_down")
plot(fifa_bd_gbm_NM) +
scale_y_continuous("Predicted value in Euro",
labels = dollar_format(suffix = "€", prefix = "")) +
ggtitle("Break-down plot for neymar","")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.