# Chapter 4 朴素贝叶斯

``````#注意assign(), get(), Reduce(), list()的应用方法
require(magrittr)
pred_naive_bayes <- function(DF, Pred, lambda = 0) {
  # Naive Bayes classifier with optional Laplace smoothing
  # (Li Hang, "Statistical Learning Methods", Chapter 4).
  #
  # DF:     training data.frame; the LAST column must be a factor label,
  #         all other columns are categorical features.
  # Pred:   data.frame of observations to classify; same feature columns
  #         as DF, label column excluded.
  # lambda: smoothing constant (0 = maximum likelihood, 1 = Laplace).
  #
  # Returns an nrow(Pred) x 1 character matrix of predicted labels.
  # Stops with an informative error if Pred contains a feature value
  # never seen in DF (the original assign()/get() version died with an
  # opaque "object not found" error in that case).
  stopifnot(is.data.frame(DF), is.data.frame(Pred))
  nc <- ncol(DF)
  stopifnot(ncol(Pred) == nc - 1)
  stopifnot(is.factor(DF[[nc]]))

  y     <- DF[[nc]]
  lev   <- levels(y)
  n     <- length(y)
  feats <- colnames(DF)[seq_len(nc - 1)]

  # Prior P(Y = c), smoothed; each probability is rounded to 4 decimal
  # places to reproduce the textbook's worked numbers.
  prior <- vapply(
    lev,
    function(cl) round((sum(y == cl) + lambda) / (n + lambda * length(lev)), 4),
    numeric(1)
  )

  # Conditional P(X_j = v | Y = c), stored in a plain named list keyed
  # "feature_value_class" (no assign()/get() into the function frame).
  # unique() is hoisted out of the class loop: it does not depend on the class.
  cond <- list()
  for (f in feats) {
    vals <- unique(DF[[f]])
    for (cl in lev) {
      in_class <- y == cl
      denom    <- sum(in_class) + lambda * length(vals)
      for (v in vals) {
        key <- paste(f, v, cl, sep = "_")
        cond[[key]] <- round((sum(DF[[f]] == v & in_class) + lambda) / denom, 4)
      }
    }
  }

  labels_all <- matrix(NA_character_, nrow(Pred), 1)
  for (h in seq_len(nrow(Pred))) {
    # Unnormalized posterior per class: prior * product of conditionals.
    post <- vapply(lev, function(cl) {
      p <- prior[[cl]]
      for (f in feats) {
        key <- paste(f, Pred[h, f], cl, sep = "_")
        if (is.null(cond[[key]])) {
          stop("Unseen feature value in Pred: ", key, call. = FALSE)
        }
        p <- p * cond[[key]]
      }
      p
    }, numeric(1))
    # which.max() keeps the FIRST maximum, matching the original tie-break.
    labels_all[h] <- lev[which.max(post)]
  }
  labels_all
}

# Training set of Example 4.1 (Li Hang, "Statistical Learning Methods"):
# two categorical features x1, x2 and a binary factor label y.
DF <- data.frame(
  x1 = rep(c(1, 2, 3), each = 5),
  x2 = c("S", "M", "M", "S", "S",
         "S", "M", "M", "L", "L",
         "L", "M", "M", "L", "L"),
  y  = factor(c(0, 0, 1, 1, 0,
                0, 0, 1, 1, 1,
                1, 1, 1, 1, 0), levels = c(0, 1))
)
# Single query point to classify.
tes <- data.frame(x1 = 2, x2 = "S")
# Example 4.1: prediction with maximum-likelihood estimates (lambda = 0)
pred_naive_bayes(DF = DF, Pred = tes, lambda = 0)``````
``````##      [,1]
## [1,] "0"``````
``````# Example 4.2: prediction with Laplace smoothing (lambda = 1)
pred_naive_bayes(DF = DF, Pred = tes, lambda = 1)``````
``````##      [,1]
## [1,] "0"``````
``pred_naive_bayes(DF = DF, Pred = DF[, -ncol(DF)], lambda = 0)``
``````##       [,1]
##  [1,] "0"
##  [2,] "0"
##  [3,] "0"
##  [4,] "0"
##  [5,] "0"
##  [6,] "0"
##  [7,] "1"
##  [8,] "1"
##  [9,] "1"
## [10,] "1"
## [11,] "1"
## [12,] "1"
## [13,] "1"
## [14,] "1"
## [15,] "1"``````
``pred_naive_bayes(DF = DF, Pred = DF[, -ncol(DF)], lambda = 1)``
``````##       [,1]
##  [1,] "0"
##  [2,] "1"
##  [3,] "1"
##  [4,] "0"
##  [5,] "0"
##  [6,] "0"
##  [7,] "1"
##  [8,] "1"
##  [9,] "1"
## [10,] "1"
## [11,] "1"
## [12,] "1"
## [13,] "1"
## [14,] "1"
## [15,] "1"``````
``````test_1 <- read.csv(file = "c:/housevotes.csv", header = TRUE,
stringsAsFactors = FALSE) # house-votes data; machine-specific path -- adjust locally
``````##        Class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1   democrat  n  y  y  n  y  y  n  n  n   n   n   n   y   y   y   y
## 2 republican  n  y  n  y  y  y  n  n  n   n   n   y   y   y   n   y
## 3   democrat  y  y  y  n  n  n  y  y  y   n   y   n   n   n   y   y
## 4   democrat  y  y  y  n  n  n  y  y  y   n   n   n   n   n   y   y
## 5   democrat  y  n  y  n  n  n  y  y  y   y   n   n   n   n   y   y
## 6   democrat  y  n  y  n  n  n  y  y  y   n   y   n   n   n   y   y``````
``````test_1 <- test_1[, c(2:ncol(test_1), 1)]  # move the Class label to the last column
test_1$Class <- factor(test_1$Class)  # pred_naive_bayes requires a factor label
``````##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16      Class
## 1  n  y  y  n  y  y  n  n  n   n   n   n   y   y   y   y   democrat
## 2  n  y  n  y  y  y  n  n  n   n   n   y   y   y   n   y republican
## 3  y  y  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat
## 4  y  y  y  n  n  n  y  y  y   n   n   n   n   n   y   y   democrat
## 5  y  n  y  n  n  n  y  y  y   y   n   n   n   n   y   y   democrat
## 6  y  n  y  n  n  n  y  y  y   n   y   n   n   n   y   y   democrat``````
``````# Resubstitution predictions on the full data set, no smoothing
labels <- pred_naive_bayes(DF = test_1, Pred = test_1[, -ncol(test_1)], lambda = 0)
(cl <- table(test_1$Class, labels))``````
``````##             labels
##              democrat republican
##   democrat        111         13
##   republican        6        102``````
``(cl[1, 1] + cl[2, 2]) / sum(cl)``
``## [1] 0.9181034``
``````# Same resubstitution check with Laplace smoothing (lambda = 1)
labels_1 <- pred_naive_bayes(DF = test_1, Pred = test_1[, -ncol(test_1)], lambda = 1)
(cl_1 <- table(test_1$Class, labels_1))``````
``````##             labels_1
##              democrat republican
##   democrat        110         14
##   republican        6        102``````
``(cl_1[1,1] + cl_1[2,2]) / sum(cl_1)``
``## [1] 0.9137931``
``````# Resubstitution accuracy as a function of the smoothing constant lambda
lamb <- seq(1, 5, by = 0.2)
accuracy <- numeric(length(lamb))  # preallocate; append() in a loop grows O(n^2)
for (i in seq_along(lamb)) {
  labels <- pred_naive_bayes(DF = test_1, Pred = test_1[, -ncol(test_1)],
                             lambda = lamb[i])
  cl <- table(test_1$Class, labels)
  accuracy[i] <- (cl[1, 1] + cl[2, 2]) / sum(cl)
}

library(ggplot2)  # library() fails loudly if the package is missing; require() only warns

# Plot accuracy against lambda. aes() must reference the df_plot columns
# (x, y); the original aes(lamb, accuracy) silently read the globals and
# left df_plot unused. labs() keeps the original axis titles.
df_plot <- data.frame(x = lamb, y = accuracy)
ggplot(df_plot, aes(x, y)) +
  geom_point() +
  labs(x = "lamb", y = "accuracy") +
  scale_y_continuous(limits = c(0.85, 0.95))``````