Chapter 5 决策树

计算训练数据集D的经验熵H(D),定义HD()函数

# Load required packages, failing fast with a clear error if any is missing.
# (The original sapply(..., require, ...) only printed TRUE/FALSE and let the
# script continue even when a package failed to load.)
pkgs <- c("magrittr", "dplyr", "purrr", "purrrlyr")
loaded <- vapply(pkgs, require, logical(1), character.only = TRUE)
if (!all(loaded)) {
  stop("Missing packages: ", paste(pkgs[!loaded], collapse = ", "),
       call. = FALSE)
}
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: purrr
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:magrittr':
## 
##     set_names
## Loading required package: purrrlyr
## magrittr    dplyr    purrr purrrlyr 
##     TRUE     TRUE     TRUE     TRUE
# Empirical entropy H(D) of a training set D:
#   H(D) = - sum_k (|C_k|/|D|) * log2(|C_k|/|D|)
# where the class label C is taken from the LAST column of DF.
#
# DF: a data frame whose last column holds the class labels.
# Returns a list:
#   entropy  - the empirical entropy H(D)
#   Pro_Y_li - named list of class proportions, names "P_Y_<class>"
#
# Fixes vs. the original: no longer renames the last column to "Classes"
# (which creates duplicate names when DF already has a "Classes" column)
# and drops the assign()-in-a-loop misuse when building the proportion
# list; implemented with base R only.
HD <- function(DF) {
  
  stopifnot(is.data.frame(DF))
  
  y <- DF[[ncol(DF)]]                      # class-label column
  counts <- table(y)                       # per-class frequencies (sorted)
  pro_y <- as.numeric(counts) / length(y)  # class proportions |C_k|/|D|
  
  # 0 * log2(0) cannot occur: table() reports only observed classes.
  entropy <- -sum(pro_y * log2(pro_y))
  
  Pro_Y_li <- as.list(pro_y)
  names(Pro_Y_li) <- paste("P", "Y", names(counts), sep = "_")
  
  list(entropy = entropy, Pro_Y_li = Pro_Y_li)
}

计算特征A对训练数据集D的经验条件熵H(D|A),定义HDA()函数

# Empirical conditional entropy H(D|A) of training set D given feature A:
#   H(D|A) = sum_i (|D_i|/|D|) * H(D_i)
# where D_i is the subset of rows taking the i-th value of feature A.
#
# DF:        data frame, class label in the last column (see HD()).
# char_name: column name of feature A (a string).
# Returns H(D|A) as a single numeric value.
#
# Fixes vs. the original: the expression `vec / sum(vec) %>% round(., 4)`
# parses as `vec / round(sum(vec), 4)` because %>% binds tighter than `/`
# — the round() was a no-op on an integer sum, not the intended rounding
# of the proportions; also replaces the 1:length() accumulation loop.
HDA <- function(DF, char_name) {
  
  stopifnot(is.data.frame(DF))
  stopifnot(is.character(char_name))
  
  # Partition D by the values of feature A; weight each part's entropy
  # by its relative size. Empty parts (unused factor levels) get weight
  # zero and entropy zero, so they contribute nothing.
  subsets <- split(DF, DF[[char_name]])
  weights <- vapply(subsets, nrow, integer(1)) / nrow(DF)
  entropies <- vapply(subsets, function(d) HD(d)[["entropy"]], numeric(1))
  
  sum(weights * entropies)
}

计算特征A对训练数据集D的信息增益,定义info_gain()函数

# Information gain of feature A over training set D:
#   g(D, A) = H(D) - H(D|A)
# DF: data frame with the class label in the last column;
# char_var: name of feature A. Returns the gain as a numeric scalar.
info_gain <- function(DF, char_var) {
  HD(DF)[["entropy"]] - HDA(DF, char_var)
}

计算训练数据集D关于特征A的值的熵,定义AHD()函数

# Entropy of training set D with respect to the VALUES of feature A:
#   H_A(D) = - sum_i (|D_i|/|D|) * log2(|D_i|/|D|)
# i.e. the entropy of the distribution of feature A itself (the "split
# information" used as the denominator of the information gain ratio).
#
# DF:       a data frame containing feature A.
# char_var: column name of feature A (a string).
# Returns a list:
#   entropy    - H_A(D)
#   Pro_TAR_li - named list of value proportions, names "P_TAR_<value>"
#
# Fixes vs. the original: no longer renames the target column to
# "Classes" (duplicate-name collision when DF already has such a column)
# and drops the assign()-in-a-loop misuse; implemented with base R only.
AHD <- function(DF, char_var) {
  
  stopifnot(is.data.frame(DF))
  stopifnot(is.character(char_var))
  
  vals <- DF[[char_var]]                        # feature-A column
  counts <- table(vals)                         # per-value frequencies
  pro_tar <- as.numeric(counts) / length(vals)  # value proportions
  
  # 0 * log2(0) cannot occur: table() reports only observed values.
  entropy <- -sum(pro_tar * log2(pro_tar))
  
  Pro_TAR_li <- as.list(pro_tar)
  names(Pro_TAR_li) <- paste("P", "TAR", names(counts), sep = "_")
  
  list(entropy = entropy, Pro_TAR_li = Pro_TAR_li)
}

计算特征A对训练数据集D的信息增益比,定义info_gain_ratio()函数

# Information gain ratio of feature A over training set D:
#   g_R(D, A) = g(D, A) / H_A(D)
# where g(D, A) = H(D) - H(D|A) and H_A(D) is the entropy of the
# distribution of feature A's values (AHD). Returns a numeric scalar.
info_gain_ratio <- function(DF, char_var) {
  
  gain <- HD(DF)[["entropy"]] - HDA(DF, char_var)
  split_info <- AHD(DF, char_var)[["entropy"]]
  gain / split_info
}

定义returnlabel()函数,返回某一数据集DF的所属类别,基于少数服从多数原则

# Majority-vote class label of data set DF: the most frequent value in
# the last (class) column. Ties resolve to the first value in table()
# order, exactly as which.max() does.
returnlabel <- function(DF) {
  counts <- table(DF[[ncol(DF)]])
  names(counts)[which.max(counts)]
}

例5.1,例5.2,58页,62页

chap5_1

chap5_1

chap5_2

chap5_2

# Sample data set (15 rows): features age / has_job / has_house / credit,
# with the class label `category` in the LAST column, as HD(), HDA() and
# info_gain() above expect. Presumably the loan-application example from
# the textbook (see the 例5.1/例5.2 references above) — values are
# Chinese category strings ("青年" young / "中年" middle-aged / "老年"
# elderly, "是" yes / "否" no, credit "一般"/"好"/"非常好").
sampletest <- data.frame(
  age = c(rep("青年",5),rep("中年",5),rep("老年",5)),
  has_job = c("否","否","是","是","否",
              "否","否","是","否","否",
              "否","否","是","是","否"),
  has_house = c("否","否","否","是","否",
                "否","否","是","是","是",
                "是","是","否","否","否"),
  credit = c("一般","好","好","一般","一般",
             "一般","好","好","非常好","非常好",
             "非常好","好","好","非常好","一般"),
  category = c("否","否","是","是","否",
               "否","否","是","是","是",
               "是","是","是","是","否"),
  stringsAsFactors = FALSE
)

#计算信息增益
HD(sampletest)
## $entropy
## [1] 0.9709506
## 
## $Pro_Y_li
## $Pro_Y_li$P_Y_否
## [1] 0.4
## 
## $Pro_Y_li$P_Y_是
## [1] 0.6
info_gain(sampletest, "age")
## [1] 0.0830075
info_gain(sampletest, "has_job")
## [1] 0.3236502
info_gain(sampletest, "has_house")
## [1] 0.4199731
info_gain(sampletest, "credit")
## [1] 0.3629896
#计算信息增益比
info_gain_ratio(sampletest, "age")
## [1] 0.0523719
info_gain_ratio(sampletest, "has_job")
## [1] 0.3524465
info_gain_ratio(sampletest, "has_house")
## [1] 0.4325381
info_gain_ratio(sampletest, "credit")
## [1] 0.2318539

ID3算法、C4.5算法实现

chap5_3

chap5_3

chap5_4

chap5_4

# Recursive ID3 / C4.5 decision-tree builder (printed, not returned).
# DF     - data frame with the class label in the LAST column.
# func   - splitting criterion: info_gain (ID3) or info_gain_ratio (C4.5).
# func_e - threshold on the criterion; below it the node becomes a
#          majority-vote leaf.
# Side effects only: the tree structure is printed via cat() as it
# recurses; nothing is returned.
createTree_if <- function(DF, func, func_e = 0.08) {  

  nc <- ncol(DF)
  labels_all <- colnames(DF)
  last_col_nam <- colnames(DF)[nc]
  # number of distinct class labels remaining at this node
  length_lab_las <- DF[, last_col_nam] %>% 
    unique(.) %>% length(.)
  
  if ( length_lab_las == 1) {
    
    label <- DF[, last_col_nam] %>% unique(.)
    # "----全部" marks a pure leaf: every row has the same class
    paste0("叶节点, belong to ", label, " ----全部") %>% 
      cat(., "\n") 
  } else if (nc == 1) {

    label <- returnlabel(DF)
    # only the class column is left: label by majority vote (returnlabel)
    paste0("叶节点, should be ", label, " ----少数服从多数") %>%
      cat(., "\n")
  } else {
  # evaluate the splitting criterion for every remaining feature
  info_G <- c()
  for (i in seq(nc - 1)) {
    char_var <- colnames(DF)[i]
    info_Gain <- func(DF, char_var)
    info_G <- append(info_G, info_Gain)
  }
  
  names(info_G) <- colnames(DF)[1:(nc - 1)]
  max_ <- which.max(info_G) %>% names(info_G)[.]
  cat(max_, "\n")
  cat(info_G[max_], "\n")
  
  # criterion below threshold: stop splitting, majority-vote leaf
  if (info_G[max_] < func_e) {
    label <- returnlabel(DF)
    paste0("叶节点,  * ", label, " *", " ----少数服从多数") %>% 
      cat(., "\n")
  } else {
    # purrrlyr: slice_rows() groups DF by the chosen feature; by_slice()
    # is used here only to get one row per group to iterate over
    subsets <- DF %>% slice_rows(max_)
    out <- subsets %>% by_slice(returnlabel)
    #cat(out[[2]])
    labels_leave <- setdiff(labels_all,max_)
    for (i in 1:nrow(out)) {
      cat("\n")
      # attr(subsets, "indices") holds 0-based row indices, hence the + 1
      subset_idx <- attr(subsets,"indices")[[i]] + 1 
      # recurse on the subset with the used feature column removed
      createTree_if(DF[subset_idx, labels_leave], func, func_e)
    }
  }
 }
}

# 例5.3
createTree_if(DF = sampletest, func = info_gain, func_e = 0.08)
## has_house 
## 0.4199731 
## 
## has_job 
## 0.9182958 
## 
## 叶节点, belong to 否 ----全部 
## 
## 叶节点, belong to 是 ----全部 
## 
## 叶节点, belong to 是 ----全部
#导入数据集housevotes.csv,该数据集位于h2o包的extdata文件夹中
tes <- read.csv("c:/housevotes.csv", header = TRUE, 
                stringsAsFactors = FALSE)
tes <- tes[, c(2:ncol(tes), 1)]   #把类别列置于最后一列
dim(tes)
## [1] 232  17
head(tes)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16      Class
## 1  n  y  y  n  y  y  n  n  n   n   n   n   y   y   y   y   democrat
## 2  n  y  n  y  y  y  n  n  n   n   n   y   y   y   n   y republican
## 3  y  y  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
## 4  y  y  y  n  n  n  y  y  y   n   n   n   n   n   y   y   democrat
## 5  y  n  y  n  n  n  y  y  y   y   n   n   n   n   y   y   democrat
## 6  y  n  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
#基于ID3算法生成决策树
createTree_if(DF = tes, func = info_gain, func_e = 0.08)
## V4 
## 0.8148211 
## 
## V3 
## 0.02466213 
## 叶节点,  * democrat * ----少数服从多数 
## 
## V11 
## 0.130869 
## 
## 叶节点, belong to republican ----全部 
## 
## V9 
## 0.08051445 
## 
## V16 
## 0.09186438 
## 
## V1 
## 0.204434 
## 
## V2 
## 0.1908745 
## 
## 叶节点, belong to democrat ----全部 
## 
## V3 
## 0.3219281 
## 
## V13 
## 0.8112781 
## 
## 叶节点, belong to democrat ----全部 
## 
## 叶节点, belong to republican ----全部 
## 
## 叶节点, belong to democrat ----全部 
## 
## 叶节点, belong to republican ----全部 
## 
## V3 
## 0.1842429 
## 
## 叶节点, belong to republican ----全部 
## 
## V2 
## 0.9182958 
## 
## 叶节点, belong to republican ----全部 
## 
## 叶节点, belong to democrat ----全部 
## 
## V1 
## 0.2516292 
## 
## 叶节点, belong to democrat ----全部 
## 
## V3 
## 1 
## 
## 叶节点, belong to democrat ----全部 
## 
## 叶节点, belong to republican ----全部

整理下输出结果,生成的树如下; 基于生成的树,找几条if-then规则验证

Dt

Dt

(idx_1 <- with(tes,which(V4 == "y" & 
                         V11 == "y" & 
                         V9 == "n" &
                         V16 == "n" & 
                         V1 == "n" &
                         V2 == "y" &
                         V3 == "n" &
                         V13 == "n")))
## [1] 201
tes[idx_1, "Class"]
## [1] "democrat"
(idx_2 <- with(tes,which(V4 == "y" & 
                         V11 == "y" & 
                         V9 == "n" &
                         V16 == "n" & 
                         V1 == "n" &
                         V2 == "y" &
                         V3 == "n" &
                         V13 == "y")))
## [1]  24 171 211
tes[idx_2, "Class"]
## [1] "republican" "republican" "republican"
(idx_3 <- with(tes,which(V4 == "y" & 
                         V11 == "y" & 
                         V9 == "n" &
                         V16 == "y" &
                         V3 == "n")))
## [1]  25  31  66  73 166 180 196 208 217
tes[idx_3, "Class"]
## [1] "republican" "republican" "republican" "republican" "republican"
## [6] "republican" "republican" "republican" "republican"
#类似的,也可以基于C4.5算法生成决策树,就不演示了
#createTree_if(DF = tes, func = info_gain_ratio, func_e = 0.01)

找出一个矩阵中取值最小的元素的行列索引,定义idx_min()函数

# Row/column index of the smallest element of a numeric matrix (or data
# frame), i.e. c(i, j) with store_min[i, j] == min(store_min). Used by
# createLeastSquareTree() to pick the best (split point, feature) pair.
# When the minimum is tied, the first position in column-major order is
# returned.
#
# Fixes vs. the original: the nested loop recomputed min(row)/min(col)
# for every cell (quadratic extra work) and its tie-breaking condition
# (`which.min(store_min[i, ]) == j`) could skip a true minimum that is
# not the first minimum of its row. which(..., arr.ind = TRUE) does the
# same job directly and robustly.
idx_min <- function(store_min) {
  
  m <- as.matrix(store_min)
  pos <- which(m == min(m), arr.ind = TRUE)[1, ]
  c(pos[["row"]], pos[["col"]])
}

最小二乘回归树生成算法

chap5_5

chap5_5

# Least-squares regression tree generator (CART-style, Algorithm 5.5).
# DF  - data frame; the LAST column is the numeric response y, all other
#       columns are candidate splitting variables.
# sse - residual-sum-of-squares threshold: a node is split only while
#       its best split's SSE exceeds sse.
# Side effects only: prints the chosen variable, split point, and child
# summaries, then recurses into each child; nothing is returned.
createLeastSquareTree <- function(DF, sse) {
  nc <- ncol(DF)
  nr <- nrow(DF)
  
  # store_min[j, i] = total SSE of splitting feature i at value DF[j, i]
  store_min <- as.data.frame(matrix(NA, nr, nc - 1))
  
  for (i in seq(nc - 1)) {
    for (j in seq(nr)) {
      R1 <- filter(DF, DF[, i] <  DF[j, i]) # s = DF[j, i]
      R2 <- filter(DF, DF[, i] >= DF[j, i])
      
      # NOTE(review): %>% binds tighter than `/`, so this parses as
      # sum(...) / round(length(...), 4); round() is a no-op on the
      # integer length, i.e. c1/c2 are plain (unrounded) group means.
      c1 <- sum(R1[, nc]) / length(R1[, nc]) %>% round(., 4)
      minc1 <- (R1[, nc] - c1)^2 %>% sum(.)
      
      c2 <- sum(R2[, nc]) / length(R2[, nc]) %>% round(., 4)
      minc2 <- (R2[, nc] - c2)^2 %>% sum(.)
      # NOTE(review): this also binds a local variable named `min`,
      # shadowing base::min() in this scope — harmless here, but risky.
      store_min[j, i] <- min <- minc1 + minc2
    }
  }
  
  idx_best <- idx_min(store_min)     # (row, col) of the smallest SSE
  colum <- idx_best[2]
  ro <- idx_best[1]
  
  print(paste0("如果满足数据集切分条件,将选取变量:", colnames(DF)[colum]))
  print(paste0("切分点(值):", DF[ro, colum]))
  
  # report whether the best split clears the SSE threshold
  if (store_min[ro, colum] %>% round(.,4) > sse) {
    print(paste0("残差平方和为:", 
                 store_min[ro, colum] %>% 
                   round(.,4), " > ", sse, ", 满足切分条件"))
  } else {
    print(paste0("残差平方和为:", 
                 store_min[ro, colum] %>% 
                   round(.,4), " <= ", sse, ", 不满足切分条件"))
  }
  
  if (store_min[ro,colum] > sse) {
    
    # perform the chosen split and summarize both children
    R1 <- filter(DF, DF[, colum] <  DF[ro, colum])
    R2 <- filter(DF, DF[, colum] >= DF[ro, colum])
    
    print(paste0("组R1的样本点个数:", nrow(R1)))
    print(paste0("组R1的y均值:", R1[, nc] %>% 
                   mean(.) %>% round(., 4)))
    print(paste0("组R2的样本点个数:", nrow(R2)))
    print(paste0("组R2的y均值:", R2[, nc] %>% 
                   mean(.) %>% round(., 4)))
    
    # recurse only into children that can still be split (more than one
    # row, and strictly smaller than the parent to guarantee progress)
    if (nrow(R1) != 1 && 
        nrow(R1) != 0 && 
        nrow(R1) != nrow(DF)) {
      createLeastSquareTree(R1, sse)
    }
    if (nrow(R2) != 1 && 
        nrow(R2) != 0 && 
        nrow(R2) != nrow(DF)) {
      createLeastSquareTree(R2, sse)
    }
  }
}

#用iris1数据集测试最小二乘回归树生成算法
iris1 <- iris[, 1:4]
head(iris1)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
createLeastSquareTree(iris1, sse = 3)
## [1] "如果满足数据集切分条件,将选取变量:Petal.Length"
## [1] "切分点(值):3"
## [1] "残差平方和为:18.4066 > 3, 满足切分条件"
## [1] "组R1的样本点个数:50"
## [1] "组R1的y均值:0.246"
## [1] "组R2的样本点个数:100"
## [1] "组R2的y均值:1.676"
## [1] "如果满足数据集切分条件,将选取变量:Sepal.Length"
## [1] "切分点(值):5"
## [1] "残差平方和为:0.4575 <= 3, 不满足切分条件"
## [1] "如果满足数据集切分条件,将选取变量:Petal.Length"
## [1] "切分点(值):4.8"
## [1] "残差平方和为:6.2953 > 3, 满足切分条件"
## [1] "组R1的样本点个数:45"
## [1] "组R1的y均值:1.3"
## [1] "组R2的样本点个数:55"
## [1] "组R2的y均值:1.9836"
## [1] "如果满足数据集切分条件,将选取变量:Petal.Length"
## [1] "切分点(值):4.2"
## [1] "残差平方和为:0.8058 <= 3, 不满足切分条件"
## [1] "如果满足数据集切分条件,将选取变量:Petal.Length"
## [1] "切分点(值):5.1"
## [1] "残差平方和为:3.6867 > 3, 满足切分条件"
## [1] "组R1的样本点个数:13"
## [1] "组R1的y均值:1.7308"
## [1] "组R2的样本点个数:42"
## [1] "组R2的y均值:2.0619"
## [1] "如果满足数据集切分条件,将选取变量:Sepal.Length"
## [1] "切分点(值):6.8"
## [1] "残差平方和为:0.2814 <= 3, 不满足切分条件"
## [1] "如果满足数据集切分条件,将选取变量:Sepal.Width"
## [1] "切分点(值):3.1"
## [1] "残差平方和为:2.473 <= 3, 不满足切分条件"
#调用rpart()函数,作对比
require(rpart)
## Loading required package: rpart
rpart(Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length,iris1)
## n= 150 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 150 86.5699300 1.199333  
##    2) Petal.Length< 2.45 50  0.5442000 0.246000 *
##    3) Petal.Length>=2.45 100 17.8624000 1.676000  
##      6) Petal.Length< 4.75 45  1.5200000 1.300000 *
##      7) Petal.Length>=4.75 55  4.7752730 1.983636  
##       14) Petal.Length< 5.05 13  0.4676923 1.730769 *
##       15) Petal.Length>=5.05 42  3.2190480 2.061905 *

计算样本集合D的基尼系数

# Gini index of sample set D:
#   Gini(D) = 1 - sum_k p_k^2
# where p_k are the class proportions taken from the LAST column of DF.
calcu_gini_D <- function(DF) {
  
  class_props <- prop.table(table(DF[[ncol(DF)]]))
  1 - sum(class_props^2)
}

calcu_gini_D(iris)
## [1] 0.6666667

计算在特征A的条件下,样本集合D的基尼指数,返回基尼指数最小的对应的特征A取值

# Gini index of data set df conditioned on feature A. For each value a
# of A it computes the weighted Gini of the binary split
#   Gini(D, A=a) = |D1|/|D| * Gini(D1) + (1 - |D1|/|D|) * Gini(D2)
# (D1: rows with A == a; D2: the rest) and returns the single row
# (characteristic, value, gini) with the SMALLEST Gini index.
# Accepts a factor column for A (coerced to character first).
min_gini_baseA <- function(df, A) { 
  
  # coerce column A to character so factor inputs behave like strings
  value <- df[, A] %>% as.character(.)   
  class(df[, A]) <- "character"
  df[, A] <- value
  
  # per-value counts and (rounded) proportions of feature A
  # NOTE(review): count_() is deprecated in current dplyr; the modern
  # equivalent is count(.data[[A]]) — verify before upgrading dplyr.
  group_cou <- df %>% count_(., A)
  group_cou[, "prop"] <- (group_cou[, 2] / 
                            sum(group_cou[, 2])) %>% 
    round(., 4)
  group_cou <- tibble::column_to_rownames(
    group_cou %>% as.data.frame(.), var = A)

  # one result row per distinct value of A
  nr <- nrow(group_cou)
  gini_df <- matrix(NA, nr, 3) %>% as.data.frame(.)
  colnames(gini_df) <- c("characteristic", "value", "gini")
  gini_df[, "characteristic"] <- A
  
  for (i in seq(nr)) {
    ii <- rownames(group_cou)[i]
    D1 <- filter(df, df[, A] == ii)
    D2 <- filter(df, df[, A] != ii)
    
    # weighted Gini of the two-way split on A == ii
    gini_df[i, "gini"] <- group_cou[i, "prop"] * calcu_gini_D(D1) + 
                          (1 - group_cou[i, "prop"]) * calcu_gini_D(D2)
    gini_df[i, "value"] <- ii
  }
  # smallest Gini first; return only the best row
  arrange(gini_df, gini)[1, ]
}

min_gini_baseA(iris, "Species")
##   characteristic  value    gini
## 1        Species setosa 0.33335
min_gini_baseA(iris, "Petal.Length")
##   characteristic value      gini
## 1   Petal.Length   1.4 0.6033843
# Same sample data as `sampletest`, except `credit` is a factor (and
# stringsAsFactors is left at its default) — used below to check that
# min_gini_baseA() also accepts factor columns.
sampletest_fac <- data.frame(
  age = c(rep("青年",5),rep("中年",5),rep("老年",5)),
  has_job = c("否","否","是","是","否",
              "否","否","是","否","否",
              "否","否","是","是","否"),
  has_house = c("否","否","否","是","否",
                "否","否","是","是","是",
                "是","是","否","否","否"),
  credit = factor(c("一般","好","好","一般","一般",
                    "一般","好","好","非常好","非常好",
                    "非常好","好","好","非常好","一般")),
  category = c("否","否","是","是","否",
               "否","否","是","是","是",
               "是","是","是","是","否"))

class(sampletest_fac[, "credit"])
## [1] "factor"
min_gini_baseA(sampletest_fac, "credit")
##   characteristic value gini
## 1         credit  一般 0.32
min_gini_baseA(sampletest_fac, "age")
##   characteristic value     gini
## 1            age  青年 0.439998

CART回归树生成算法

chap5_6

chap5_6

# CART classification tree generator (printed, not returned).
# DF        - data frame with the class label in the LAST column.
# mini_gini - Gini-index threshold: a child node is split further only
#             while its Gini index exceeds this value.
# Side effects only: prints the chosen (feature, value) split and the
# label/Gini of each child, then recurses.
creatCARTtree <- function(DF, mini_gini = 0.05) {
  
  nc <- ncol(DF)
  colnam <- colnames(DF)
  colnam_ <- colnam[-nc]   # candidate features: all but the label column
  
  # best (value, gini) per feature, then the overall best split
  min_gini_df <- map_df(colnam_, min_gini_baseA, df = DF)
  
  MIN <- arrange(min_gini_df, gini)[1, ]
  cha <- MIN[1, "characteristic"]
  val <- MIN[1, "value"]
  
  print(MIN[1, c("characteristic", "value")])
  
  colnam_leave <- setdiff(colnam, cha)
  
  # binary split on the chosen feature/value; the used feature column
  # is dropped from both children
  D1_ <- filter(DF,DF[,cha] == val)[, colnam_leave]
  D2_ <- filter(DF,DF[,cha] != val)[, colnam_leave]
  
  print(paste0("子节点D1_划归:", returnlabel(D1_)))
  print(paste0("子节点D2_划归:", returnlabel(D2_)))
  
  print(paste0("样本集D1_的基尼指数:", calcu_gini_D(D1_)))
  print(paste0("样本集D2_的基尼指数:", calcu_gini_D(D2_)))
  
  # BUG FIX: propagate the caller's threshold into the recursion. The
  # original called creatCARTtree(D1_) / creatCARTtree(D2_) without the
  # second argument, silently resetting mini_gini to its 0.05 default
  # at every level whenever the caller had passed a different value.
  if (calcu_gini_D(D1_) > mini_gini) {
    creatCARTtree(D1_, mini_gini)
  }
  if (calcu_gini_D(D2_) > mini_gini) {
    creatCARTtree(D2_, mini_gini)
  }
}

# 例5.4
creatCARTtree(sampletest_fac, mini_gini = 0.05)
##   characteristic value
## 1      has_house    否
## [1] "子节点D1_划归:否"
## [1] "子节点D2_划归:是"
## [1] "样本集D1_的基尼指数:0.444444444444444"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1        has_job    否
## [1] "子节点D1_划归:否"
## [1] "子节点D2_划归:是"
## [1] "样本集D1_的基尼指数:0"
## [1] "样本集D2_的基尼指数:0"
#导入数据集housevotes.csv,该数据集位于h2o包的extdata文件夹中
tes <- read.csv("c:/housevotes.csv", header = TRUE, 
                stringsAsFactors = FALSE)
tes <- tes[, c(2:ncol(tes), 1)]   #把类别列置于最后一列
dim(tes)
## [1] 232  17
head(tes)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16      Class
## 1  n  y  y  n  y  y  n  n  n   n   n   n   y   y   y   y   democrat
## 2  n  y  n  y  y  y  n  n  n   n   n   y   y   y   n   y republican
## 3  y  y  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
## 4  y  y  y  n  n  n  y  y  y   n   n   n   n   n   y   y   democrat
## 5  y  n  y  n  n  n  y  y  y   y   n   n   n   n   y   y   democrat
## 6  y  n  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
creatCARTtree(tes, mini_gini = 0.05)
##   characteristic value
## 1             V4     n
## [1] "子节点D1_划归:democrat"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0.0166654897253018"
## [1] "样本集D2_的基尼指数:0.100556034145195"
##   characteristic value
## 1            V11     y
## [1] "子节点D1_划归:republican"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0.385633270321361"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1             V9     n
## [1] "子节点D1_划归:republican"
## [1] "子节点D2_划归:democrat"
## [1] "样本集D1_的基尼指数:0.32"
## [1] "样本集D2_的基尼指数:0.444444444444444"
##   characteristic value
## 1             V3     n
## [1] "子节点D1_划归:republican"
## [1] "子节点D2_划归:democrat"
## [1] "样本集D1_的基尼指数:0.21875"
## [1] "样本集D2_的基尼指数:0.5"
##   characteristic value
## 1            V16     n
## [1] "子节点D1_划归:republican"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0.408163265306122"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1            V13     n
## [1] "子节点D1_划归:democrat"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0"
## [1] "样本集D2_的基尼指数:0.277777777777778"
##   characteristic value
## 1             V2     n
## [1] "子节点D1_划归:democrat"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0.5"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1             V1     n
## [1] "子节点D1_划归:democrat"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1             V2     n
## [1] "子节点D1_划归:republican"
## [1] "子节点D2_划归:democrat"
## [1] "样本集D1_的基尼指数:0"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1             V1     y
## [1] "子节点D1_划归:democrat"
## [1] "子节点D2_划归:democrat"
## [1] "样本集D1_的基尼指数:0.5"
## [1] "样本集D2_的基尼指数:0"
##   characteristic value
## 1             V3     n
## [1] "子节点D1_划归:democrat"
## [1] "子节点D2_划归:republican"
## [1] "样本集D1_的基尼指数:0"
## [1] "样本集D2_的基尼指数:0"