# Chapter 4: Naive Bayes (朴素贝叶斯)
#
# Sections:
#   chap4_1 — Examples 4.1 and 4.2 on the toy data set
#   chap4_2 — UCI house-votes data and lambda tuning

# Note the use of assign(), get(), Reduce(), and list() below.
# library() errors immediately if the package is missing, unlike require(),
# which merely returns FALSE and lets the script fail later.
library(magrittr)
#' Naive Bayes classifier for categorical features (Li Hang, ch. 4).
#'
#' @param DF     Training data.frame; the LAST column is the class label and
#'               must be a factor. All other columns are categorical features.
#' @param Pred   data.frame of query rows; same feature columns as DF minus
#'               the label column (checked by column count).
#' @param lambda Laplace smoothing constant (0 = maximum likelihood,
#'               1 = classic Laplace smoothing).
#' @return A character matrix of dimension nrow(Pred) x 1 holding the
#'         predicted class label (as a string) for each query row.
pred_naive_bayes <- function(DF, Pred, lambda = 0) {
  stopifnot(is.data.frame(DF), is.data.frame(Pred))

  nc <- ncol(DF)
  stopifnot(ncol(Pred) == nc - 1)
  # levels() on a non-factor returns NULL and would silently empty every
  # loop below, so insist on a factor label column up front.
  stopifnot(is.factor(DF[[nc]]))

  y     <- DF[[nc]]
  lev   <- levels(y)
  n_obs <- length(y)
  colna <- colnames(DF)

  # Class priors P(Y = cls), smoothed; keyed "P_<ycol>_<level>".
  # Rounding to 4 decimals matches the book's worked examples.
  P_Y <- list()
  for (cls in lev) {
    prior <- (sum(y == cls) + lambda) / (n_obs + lambda * length(lev))
    P_Y[[paste("P", colna[nc], cls, sep = "_")]] <- round(prior, 4)
  }

  # Conditional probabilities P(X_k = v | Y = cls) for every feature value
  # seen in training; keyed "P_<xcol>_<value>_<ycol>_<class>".
  P_x <- list()
  for (cls in lev) {
    in_cls <- y == cls
    for (k in seq_len(nc - 1)) {
      vals  <- unique(DF[[colna[k]]])
      denom <- sum(in_cls) + lambda * length(vals)
      for (v in vals) {
        num <- sum(DF[[colna[k]]] == v & in_cls) + lambda
        key <- paste("P", colna[k], v, colna[nc], cls, sep = "_")
        P_x[[key]] <- round(num / denom, 4)
      }
    }
  }

  # Predict each query row: argmax over classes of prior * prod(conditionals).
  labels_all <- matrix(NA_character_, nrow(Pred), 1)
  for (h in seq_len(nrow(Pred))) {
    posterior <- numeric(length(lev))
    names(posterior) <- lev
    for (cls in lev) {
      prod_cond <- 1
      for (k in seq_len(nc - 1)) {
        key <- paste("P", colna[k], Pred[h, colna[k]],
                     colna[nc], cls, sep = "_")
        p <- P_x[[key]]
        if (is.null(p)) {
          # A value never seen in training has no stored probability; fail
          # with an informative message instead of get()'s cryptic error.
          stop("unseen feature value in Pred (no estimate for ", key, ")",
               call. = FALSE)
        }
        prod_cond <- prod_cond * p
      }
      posterior[cls] <- prod_cond *
        P_Y[[paste("P", colna[nc], cls, sep = "_")]]
    }
    # which.max keeps the original first-winner tie-break.
    labels_all[h] <- lev[which.max(posterior)]
  }
  labels_all
}

# Toy training set from Li Hang, "Statistical Learning Methods", Table 4.1:
# x1 takes values in {1, 2, 3}, x2 in {S, M, L}, class label y in {0, 1}.
DF <- data.frame(
  x1 = rep(c(1, 2, 3), each = 5),
  x2 = c("S", "M", "M", "S", "S",
         "S", "M", "M", "L", "L",
         "L", "M", "M", "L", "L"),
  y  = factor(c(0, 0, 1, 1, 0,
                0, 0, 1, 1, 1,
                1, 1, 1, 1, 0), levels = c(0, 1))
)
# Single query point x = (2, S) used by Examples 4.1 and 4.2.
tes <- data.frame(x1 = 2, x2 = "S")
# Example 4.1: maximum-likelihood estimates (lambda = 0) predict class "0"
# for x = (2, S).
pred_naive_bayes(DF = DF, Pred = tes, lambda = 0)
##      [,1]
## [1,] "0"
# Example 4.2: Laplace smoothing (lambda = 1) gives the same prediction.
pred_naive_bayes(DF = DF, Pred = tes, lambda = 1)
##      [,1]
## [1,] "0"
# Resubstitution check: re-predict the training rows themselves (lambda = 0).
pred_naive_bayes(DF = DF, Pred = DF[, -ncol(DF)], lambda = 0)
##       [,1]
##  [1,] "0" 
##  [2,] "0" 
##  [3,] "0" 
##  [4,] "0" 
##  [5,] "0" 
##  [6,] "0" 
##  [7,] "1" 
##  [8,] "1" 
##  [9,] "1" 
## [10,] "1" 
## [11,] "1" 
## [12,] "1" 
## [13,] "1" 
## [14,] "1" 
## [15,] "1"
# Same resubstitution with smoothing; rows 2-3 flip relative to lambda = 0.
pred_naive_bayes(DF = DF, Pred = DF[, -ncol(DF)], lambda = 1)
##       [,1]
##  [1,] "0" 
##  [2,] "1" 
##  [3,] "1" 
##  [4,] "0" 
##  [5,] "0" 
##  [6,] "0" 
##  [7,] "1" 
##  [8,] "1" 
##  [9,] "1" 
## [10,] "1" 
## [11,] "1" 
## [12,] "1" 
## [13,] "1" 
## [14,] "1" 
## [15,] "1"
# Load the house-votes data (presumably UCI House Votes '84 — verify).
# NOTE(review): hardcoded Windows path; adjust for your machine.
test_1 <- read.csv(file = "c:/housevotes.csv", header = TRUE,
                   stringsAsFactors = FALSE)
head(test_1)
##        Class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1   democrat  n  y  y  n  y  y  n  n  n   n   n   n   y   y   y   y
## 2 republican  n  y  n  y  y  y  n  n  n   n   n   y   y   y   n   y
## 3   democrat  y  y  y  n  n  n  y  y  y   n   y   n   n   n   y   y
## 4   democrat  y  y  y  n  n  n  y  y  y   n   n   n   n   n   y   y
## 5   democrat  y  n  y  n  n  n  y  y  y   y   n   n   n   n   y   y
## 6   democrat  y  n  y  n  n  n  y  y  y   n   y   n   n   n   y   y
# Move Class to the last column: pred_naive_bayes() expects the label there.
test_1 <- test_1[, c(2:ncol(test_1), 1)]
# The label column must be a factor (the function calls levels() on it).
test_1$Class <- factor(test_1$Class)
head(test_1)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16      Class
## 1  n  y  y  n  y  y  n  n  n   n   n   n   y   y   y   y   democrat
## 2  n  y  n  y  y  y  n  n  n   n   n   y   y   y   n   y republican
## 3  y  y  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
## 4  y  y  y  n  n  n  y  y  y   n   n   n   n   n   y   y   democrat
## 5  y  n  y  n  n  n  y  y  y   y   n   n   n   n   y   y   democrat
## 6  y  n  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
# Resubstitution accuracy on the house-votes data, lambda = 0.
labels <- pred_naive_bayes(DF = test_1, Pred = test_1[, -ncol(test_1)], lambda = 0)
# Confusion matrix: rows = true class, columns = predicted class.
(cl <- table(test_1$Class, labels))
##             labels
##              democrat republican
##   democrat        111         13
##   republican        6        102
# Accuracy = trace / total.
(cl[1, 1] + cl[2, 2]) / sum(cl)
## [1] 0.9181034
# Same evaluation with Laplace smoothing (lambda = 1).
labels_1 <- pred_naive_bayes(DF = test_1,Pred = test_1[,-ncol(test_1)],lambda = 1)
(cl_1 <- table(test_1$Class,labels_1))
##             labels_1
##              democrat republican
##   democrat        110         14
##   republican        6        102
(cl_1[1,1] + cl_1[2,2]) / sum(cl_1)
## [1] 0.9137931
# Tune the smoothing parameter: sweep lambda over [1, 5] and record the
# resubstitution accuracy on the house-votes data at each value.
lamb <- seq(1, 5, by = 0.2)
accuracy <- numeric(length(lamb))  # preallocate; don't grow with append()
for (i in seq_along(lamb)) {
  labels_i <- pred_naive_bayes(DF = test_1, Pred = test_1[, -ncol(test_1)],
                               lambda = lamb[i])
  cl_i <- table(test_1$Class, labels_i)
  accuracy[i] <- (cl_i[1, 1] + cl_i[2, 2]) / sum(cl_i)
}

library(ggplot2)  # library() errors if missing, unlike require()
df_plot <- data.frame(x = lamb, y = accuracy)
# Map the plot to df_plot's own columns (not the global vectors) so the
# figure is self-contained; label the axes explicitly.
ggplot(df_plot, aes(x, y)) +
  geom_point() +
  labs(x = "lambda", y = "accuracy") +
  scale_y_continuous(limits = c(0.85, 0.95))