Chapter 3 Digit Model
In preparation for neural networks, we take a brief chapter to run other models on MNIST hand-written data. First we will run a binomial GLM on each digit and keep the maximum outputted likelihood as the predicted digit, then we will run a multinomial GLM to assess the likelihood of every digit simultaneously.
This chapter can be safely skipped / ignored.
3.1 Linear Model
# Import dataset
pixel_numbers <- seq(1, 784)
x_column_names <- paste("pixel", pixel_numbers, sep = "")
y_column_names <- c("Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine")
X.train <- read.csv('X_train.csv', header = FALSE, col.names = x_column_names)
X.test <- read.csv('X_test.csv', header = FALSE, col.names = x_column_names)
Y.train <- read.csv('Y_train.csv', header = FALSE, col.names = y_column_names)
Y.test <- read.csv('Y_test.csv', header = FALSE, col.names = y_column_names)
train_set <- cbind(X.train, Y.train)
test_set <- cbind(X.test, Y.test)
# Build logistic regression for each number
lm.fit0 <- glm(Zero ~. -One - Two - Three - Four - Five - Six - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit1 <- glm(One ~. - Zero - Two - Three - Four - Five - Six - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit2 <- glm(Two ~. - Zero - One - Three - Four - Five - Six - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit3 <- glm(Three ~. - Zero - One - Two - Four - Five - Six - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit4 <- glm(Four ~. - Zero - One - Two - Three - Five - Six - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit5 <- glm(Five ~. - Zero - One - Two - Three - Four - Six - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit6 <- glm(Six ~. - Zero - One - Two - Three - Four - Five - Seven - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit7 <- glm(Seven ~. - Zero - One - Two - Three - Four - Five - Six - Eight - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit8 <- glm(Eight ~. - Zero - One - Two - Three - Four - Five - Six - Seven - Nine,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit9 <- glm(Nine ~. - Zero - One - Two - Three - Four - Five - Six - Seven - Eight,
data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Define softmax function
softmax <- function(x) {
exp_x <- exp(x)
return(exp_x / rowSums(exp_x))
}
# Function to predict using the model and new data
predict_response <- function(model, newdata = NULL) {
as.vector(predict(model, type = "response", newdata = newdata))
}
# --- Training set predictions ---
train.preds <- lapply(list(lm.fit0, lm.fit1, lm.fit2, lm.fit3, lm.fit4, lm.fit5, lm.fit6, lm.fit7, lm.fit8, lm.fit9), predict_response)
train.result.matrix <- do.call(cbind, train.preds)
# Apply softmax and find the predicted classes
softmax_predictions_train <- softmax(train.result.matrix)
predicted_classes_train <- max.col(softmax_predictions_train) - 1
actual_train <- max.col(Y.train) - 1
train_accuracy <- sum(predicted_classes_train == actual_train) / length(actual_train)
# --- Testing set predictions ---
test.preds <- lapply(list(lm.fit0, lm.fit1, lm.fit2, lm.fit3, lm.fit4, lm.fit5, lm.fit6, lm.fit7, lm.fit8, lm.fit9), function(model) predict_response(model, newdata = test_set))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test.result.matrix <- do.call(cbind, test.preds)
# Apply softmax and find the predicted classes
softmax_predictions_test <- softmax(test.result.matrix)
predicted_classes_test <- max.col(softmax_predictions_test) - 1
actual_test <- max.col(Y.test) - 1
test_accuracy <- sum(predicted_classes_test == actual_test) / length(actual_test)
# Print the accuracies
print(paste("Training Set Accuracy:", round(train_accuracy, 3)))
## [1] "Training Set Accuracy: 0.656"
## [1] "Testing Set Accuracy: 0.633"
3.2 Binomial Model I
3.2.1 Setup
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'keras' was built under R version 4.2.3
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
## Warning: package 'MASS' was built under R version 4.2.3
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'logistf' was built under R version 4.2.3
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## Warning: package 'Metrics' was built under R version 4.2.3
## Warning: package 'prediction' was built under R version 4.2.3
# Loads the MNIST dataset, saves as an .RData file if not in WD
if (!(file.exists("mnist_data.RData"))) {
# ## installs older python version
# reticulate::install_python("3.10:latest")
# keras::install_keras(python_version = "3.10")
# ## re-loads keras
# library(keras)
## get MNIST data
mnist <- dataset_mnist()
## save to WD as .RData
save(mnist, file = "mnist_data.RData")
} else {
## read-in MNIST data
load(file = "mnist_data.RData")
}
# Access the training and testing sets
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y
rm(mnist)
## plot function, from OG data
plot_mnist <- function(plt) {
## create image
image(x = 1:28,
y = 1:28,
## image is oriented incorrectly, this fixes it
z = t(apply(plt, 2, rev)),
## 255:0 puts black on white canvas,
## changing to 0:255 puts white on black canvas
col = gray((255:0)/255),
axes = FALSE)
## create plot border
rect(xleft = 0.5,
ybottom = 0.5,
xright = 28 + 0.5,
ytop = 28 + 0.5,
border = "black",
lwd = 1)
}
## train data
# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
ncol = 28*28)
## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
## get each layer's matrix image, stretch to 28^2 x 1
x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}
x_train_2 <- x_train_2 %>%
as.data.frame()
## test data
x_test_2 <- matrix(nrow = nrow(x_test),
ncol = 28*28)
for (i in 1:nrow(x_test)) {
x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}
x_test_2 <- x_test_2 %>%
as.data.frame()
## re-scale data
x_train_2 <- x_train_2 / 256
x_test_2 <- x_test_2 / 256
## response
# x_test_2$y <- y_test
# x_train_2$y <- y_train
3.2.2 Model I
## for speed
# n <- nrow(x_train_2)
n <- 100
indices <- sample(x = 1:nrow(x_train_2),
size = n)
## init data
x_glm <- x_train_2[indices, ]
y_glm <- y_train[indices]
train_pred <- list()
## drop cols with all 0s
x_glm <- x_glm[, (colSums(x_glm) > 0)]
## 10 model method
for (i in 0:9) {
print(i)
y_glm_i = (y_glm == i)
init_model <- cv.glmnet(x = x_glm %>% as.matrix,
y = y_glm_i,
family = binomial,
alpha = 1)
train_pred[[i + 1]] <- predict(init_model,
x_glm %>% as.matrix,
s = init_model$lambda.min,
type = "response")
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## format results
predictions <- data.frame(train_pred)
names(predictions) <- c("zero",
"one",
"two",
"three",
"four",
"five",
"six",
"seven",
"eight",
"nine")
#write.csv(predictions, "pred.csv", row.names = FALSE)
## convert to numeric
max_col <- apply(X = predictions,
MARGIN = 1,
FUN = function(x) names(x)[which.max(x)])
word_to_number <- c("zero" = 0,
"one" = 1,
"two" = 2,
"three" = 3,
"four" = 4,
"five" = 5,
"six" = 6,
"seven" = 7,
"eight" = 8,
"nine" = 9)
preds <- word_to_number[max_col] %>% as.numeric
## confusion matrix
table(y_glm, preds)
## preds
## y_glm 0 1 2 3 4 6 7 8 9
## 0 12 0 0 0 0 0 0 0 0
## 1 0 9 0 0 0 0 0 0 0
## 2 0 0 11 0 0 0 0 0 1
## 3 0 1 1 2 0 0 0 1 0
## 4 0 0 0 0 11 0 0 0 0
## 5 1 1 0 0 0 0 1 0 1
## 6 0 0 0 0 0 9 0 0 0
## 7 0 0 0 0 0 0 5 0 3
## 8 0 0 0 0 0 0 0 11 0
## 9 0 0 0 0 0 0 0 0 19
## [1] 0.11
3.2.3 Model II
## train data
# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
ncol = 28*28)
## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
## get each layer's matrix image, stretch to 28^2 x 1
x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}
x_train_2 <- x_train_2 %>%
as.data.frame()
## test data
x_test_2 <- matrix(nrow = nrow(x_test),
ncol = 28*28)
for (i in 1:nrow(x_test)) {
x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}
x_test_2 <- x_test_2 %>%
as.data.frame()
# Initialize a list to store the logistic regression models
logistic_models <- list()
# Initialize a list to store the predicted probabilities for each digit
probs_train_list <- list()
probs_test_list <- list()
# Initialize a list to store the binary predictions for each digit
pred_train_list <- list()
pred_test_list <- list()
# Initialize lists to store log loss and classification error for each digit
log_loss_train_list <- list()
classification_error_train_list <- list()
log_loss_test_list <- list()
classification_error_test_list <- list()
# Iterate over digits from 0 to 9
for (digit in 0:9) {
# Apply the transformation to y_train and y_test for the current digit
y_train_digit <- ifelse(y_train == digit, 1, 0)
y_test_digit <- ifelse(y_test == digit, 1, 0)
# Fit logistic regression model for the current digit
logistic_model <- glm(y_train_digit ~ ., family = binomial(link = "logit"), data = x_train_2)
logistic_models[[as.character(digit)]] <- logistic_model
# Make predictions on the training set for the current digit
probs_train <- predict(logistic_model, x_train_2, type = "response")
probs_train_list[[as.character(digit)]] <- probs_train
# Convert predicted probabilities to binary predictions (0 or 1)
pred_train <- ifelse(probs_train > 0.5, 1, 0)
pred_train_list[[as.character(digit)]] <- pred_train
# Make predictions on the test set for the current digit
probs_test <- predict(logistic_model, x_test_2, type = "response")
probs_test_list[[as.character(digit)]] <- probs_test
# Convert predicted probabilities to binary predictions (0 or 1)
pred_test <- ifelse(probs_test > 0.5, 1, 0)
pred_test_list[[as.character(digit)]] <- pred_test
# Calculate log loss on the training set for the current digit
loss_entropy_train <- logLoss(y_train_digit, probs_train)
cat("Training set log loss for digit", digit, ":", round(loss_entropy_train, 3), "\n")
# Calculate log loss on the test set for the current digit
loss_entropy_test <- logLoss(y_test_digit, probs_test)
cat("Test set log loss for digit", digit, ":", round(loss_entropy_test, 3), "\n")
# Calculate classification error on the training set for the current digit
classification_error_train <- 1 - sum(pred_train == y_train_digit) / length(y_train_digit)
cat("Average number of classification errors on training set for digit", digit, ":", round(classification_error_train, 3), "\n")
# Calculate classification error on the test set for the current digit
classification_error_test <- 1 - sum(pred_test == y_test_digit) / length(y_test_digit)
cat("Average number of classification errors on test set for digit", digit, ":", round(classification_error_test, 3), "\n")
# Store log loss and classification error in lists
log_loss_train_list[[as.character(digit)]] <- loss_entropy_train
classification_error_train_list[[as.character(digit)]] <- classification_error_train
log_loss_test_list[[as.character(digit)]] <- loss_entropy_test
classification_error_test_list[[as.character(digit)]] <- classification_error_test
}
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 0 : 0.255
## Test set log loss for digit 0 : 0.404
## Average number of classification errors on training set for digit 0 : 0.007
## Average number of classification errors on test set for digit 0 : 0.011
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 1 : 0.285
## Test set log loss for digit 1 : 0.371
## Average number of classification errors on training set for digit 1 : 0.008
## Average number of classification errors on test set for digit 1 : 0.01
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 2 : 0.696
## Test set log loss for digit 2 : 0.811
## Average number of classification errors on training set for digit 2 : 0.019
## Average number of classification errors on test set for digit 2 : 0.022
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 3 : 0.914
## Test set log loss for digit 3 : 1.006
## Average number of classification errors on training set for digit 3 : 0.025
## Average number of classification errors on test set for digit 3 : 0.028
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 4 : 0.569
## Test set log loss for digit 4 : 0.681
## Average number of classification errors on training set for digit 4 : 0.016
## Average number of classification errors on test set for digit 4 : 0.019
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 5 : 1.049
## Test set log loss for digit 5 : 1.089
## Average number of classification errors on training set for digit 5 : 0.029
## Average number of classification errors on test set for digit 5 : 0.03
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 6 : 0.571
## Test set log loss for digit 6 : 0.761
## Average number of classification errors on training set for digit 6 : 0.016
## Average number of classification errors on test set for digit 6 : 0.021
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 7 : 0.552
## Test set log loss for digit 7 : 0.728
## Average number of classification errors on training set for digit 7 : 0.015
## Average number of classification errors on test set for digit 7 : 0.02
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 8 : 1.427
## Test set log loss for digit 8 : 1.55
## Average number of classification errors on training set for digit 8 : 0.04
## Average number of classification errors on test set for digit 8 : 0.043
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 9 : 1.526
## Test set log loss for digit 9 : 1.507
## Average number of classification errors on training set for digit 9 : 0.042
## Average number of classification errors on test set for digit 9 : 0.042
# Print the results from lists in a single loop
for (digit in 0:9) {
cat("Digit", digit, "\n")
cat(" Training Set Log Loss:", round(log_loss_train_list[[as.character(digit)]], 3), "\n")
cat(" Training Set Classification Error:", round(classification_error_train_list[[as.character(digit)]], 3), "\n")
cat(" Test Set Log Loss:", round(log_loss_test_list[[as.character(digit)]], 3), "\n")
cat(" Test Set Classification Error:", round(classification_error_test_list[[as.character(digit)]], 3), "\n")
}
## Digit 0
## Training Set Log Loss: 0.255
## Training Set Classification Error: 0.007
## Test Set Log Loss: 0.404
## Test Set Classification Error: 0.011
## Digit 1
## Training Set Log Loss: 0.285
## Training Set Classification Error: 0.008
## Test Set Log Loss: 0.371
## Test Set Classification Error: 0.01
## Digit 2
## Training Set Log Loss: 0.696
## Training Set Classification Error: 0.019
## Test Set Log Loss: 0.811
## Test Set Classification Error: 0.022
## Digit 3
## Training Set Log Loss: 0.914
## Training Set Classification Error: 0.025
## Test Set Log Loss: 1.006
## Test Set Classification Error: 0.028
## Digit 4
## Training Set Log Loss: 0.569
## Training Set Classification Error: 0.016
## Test Set Log Loss: 0.681
## Test Set Classification Error: 0.019
## Digit 5
## Training Set Log Loss: 1.049
## Training Set Classification Error: 0.029
## Test Set Log Loss: 1.089
## Test Set Classification Error: 0.03
## Digit 6
## Training Set Log Loss: 0.571
## Training Set Classification Error: 0.016
## Test Set Log Loss: 0.761
## Test Set Classification Error: 0.021
## Digit 7
## Training Set Log Loss: 0.552
## Training Set Classification Error: 0.015
## Test Set Log Loss: 0.728
## Test Set Classification Error: 0.02
## Digit 8
## Training Set Log Loss: 1.427
## Training Set Classification Error: 0.04
## Test Set Log Loss: 1.55
## Test Set Classification Error: 0.043
## Digit 9
## Training Set Log Loss: 1.526
## Training Set Classification Error: 0.042
## Test Set Log Loss: 1.507
## Test Set Classification Error: 0.042
# Compute the average log loss & classification errors
average_log_loss_train <- mean(sapply(log_loss_train_list, function(x) x))
cat("Average Training Set Log Loss:", round(average_log_loss_train, 3), "\n")
## Average Training Set Log Loss: 0.785
average_classification_error_train <- mean(sapply(classification_error_train_list, function(x) x))
cat("Average Training Set Classification Errors:", round(average_classification_error_train, 3), "\n")
## Average Training Set Classification Errors: 0.022
average_log_loss_test <- mean(sapply(log_loss_test_list, function(x) x))
cat("Average Test Set Log Loss:", round(average_log_loss_test, 3), "\n")
## Average Test Set Log Loss: 0.891
average_classification_error_test <- mean(sapply(classification_error_test_list, function(x) x))
cat("Average Test Set Classification Errors:", round(average_classification_error_test, 3), "\n")
## Average Test Set Classification Errors: 0.025
From the log loss and the average number of classification errors, we can see that the prediction results are quite accurate(training set: loss= 0.785, classification errors= 0.022; test set: loss= 0.891, classification error: 0.025)
3.3 Binomial Model II
# Download the first 2000 observations of the MNIST dataset
mnist <- read.csv("https://pjreddie.com/media/files/mnist_train.csv", nrows = 2000)
colnames(mnist) <- c("Digit", paste("Pixel", seq(1:784), sep = ""))
save(mnist, file = "mnist_first2000.RData")
3.3.1 1 knn
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
## The following object is masked from 'package:purrr':
##
## lift
## Warning: package 'class' was built under R version 4.2.3
# Load the MNIST dataset
load("mnist_first2000.RData")
# Split the dataset into training data (first 1000 rows) and testing data (last 1000 rows)
train_data <- mnist[1:1000,]
test_data <- mnist[1001:2000,]
# Train a KNN model using 5-fold cross-validation
train_control <- trainControl(method="cv", number=5)
tune_grid <- expand.grid(.k=1:20)
knn_model <- train(as.factor(Digit)~., data=train_data, method="knn", tuneGrid=tune_grid, trControl=train_control)
# Select the value of K with the lowest cross-validation error
best_k <- knn_model$bestTune$k
print(best_k)
## [1] 1
# Use the KNN algorithm for classification
knn_fit <- knn(train=train_data[,1:785], test=test_data[,1:785], cl=train_data$Digit, k=best_k)
# Calculate the prediction error and confusion matrix
error_rate <- mean(knn_fit != test_data$Digit)
confusion_matrix <- table(knn_fit, test_data$Digit)
print(confusion_matrix)
##
## knn_fit 0 1 2 3 4 5 6 7 8 9
## 0 89 0 0 1 0 1 1 0 2 1
## 1 1 103 4 1 6 0 0 1 1 1
## 2 0 0 86 4 1 0 0 0 0 0
## 3 0 0 0 76 0 6 0 0 1 1
## 4 0 0 1 0 81 0 1 0 1 5
## 5 0 0 0 11 1 76 1 2 3 0
## 6 2 0 0 0 2 3 103 0 3 0
## 7 0 1 5 1 1 0 0 101 0 5
## 8 0 0 3 2 1 1 0 0 70 0
## 9 1 0 0 2 16 2 0 3 4 97
# Print the prediction classification error
print(paste("Prediction classification error:", error_rate))
## [1] "Prediction classification error: 0.118"
3.3.2 2 logistic regression
# Load the glmnet package
library(glmnet)
# Load the MNIST dataset
load("mnist_first2000.RData")
# Split the data into training and testing sets
train_data <- mnist[1:1000,]
test_data <- mnist[1001:2000,]
# Extract the predictors and response variables
x_train <- train_data[,2:785]
y_train <- train_data[, 1]
x_test <- test_data[,2:785]
y_test <- test_data[, 1]
# Convert non-numeric columns to numeric using model.matrix()
x_train <- model.matrix(~., data=train_data[, -1])
x_test <- model.matrix(~., data=test_data[, -1])
# Fit a multi-class logistic regression model with Lasso penalty
fit <- cv.glmnet(x_train, y_train, family="multinomial", alpha=1, type.measure="class")
# Select the best tuning parameter
best_lambda <- fit$lambda.min
# Make predictions on the testing data
pred <- predict(fit, newx=x_test, s=best_lambda, type="class")
# Compute the confusion matrix
confusion_matrix <- table(pred, y_test)
# Compute the prediction classification error
error_rate <- mean(pred != y_test)
# Print the confusion matrix and prediction classification error
print(confusion_matrix)
## y_test
## pred 0 1 2 3 4 5 6 7 8 9
## 0 83 0 1 0 0 1 2 0 3 4
## 1 0 90 2 1 1 0 1 0 3 0
## 2 0 1 75 12 3 0 4 0 2 0
## 3 1 1 1 63 1 1 0 2 1 2
## 4 1 0 3 0 82 3 1 3 0 10
## 5 5 0 1 10 2 74 3 1 1 1
## 6 0 0 4 1 3 1 94 0 1 0
## 7 1 1 6 2 3 2 0 92 2 6
## 8 2 11 4 5 2 6 1 2 68 2
## 9 0 0 2 4 12 1 0 7 4 85
## [1] "Prediction classification error: 0.194"
3.4 Binomial Model III
# Import dataset
pixel_numbers <- seq(1, 784)
x_column_names <- paste("pixel", pixel_numbers, sep = "")
y_column_names <- c("Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine")
X.train <- as.matrix(read.csv('X_train.csv', header = FALSE, col.names = x_column_names))
X.test <- as.matrix(read.csv('X_test.csv', header = FALSE, col.names = x_column_names))
Y.train <- as.matrix(read.csv('Y_train.csv', header = FALSE, col.names = y_column_names))
Y.test <- as.matrix(read.csv('Y_test.csv', header = FALSE, col.names = y_column_names))
train_set <- cbind(X.train, Y.train)
test_set <- cbind(X.test, Y.test)
# Create logistic function without using glm function
# Define sigmoid activation function
sigmoid <- function(x) {
1 / (1 + exp(-x))
}
# Using cross-entropy as the loss function
cross_entropy <- function(X, y, theta) {
m <- length(y)
predictions <- sigmoid(X %*% theta)
cost <- -(1 / m) * sum(y * log(predictions) + (1 - y) * log(1 - predictions))
return(cost)
}
# We are going to use gradient descent to find the optimal theta value
gradient_descent <- function(X, y, theta, alpha, num_iters) {
m <- length(y)
cost_history <- numeric(num_iters)
for (i in 1:num_iters) {
prediction <- sigmoid(X %*% theta)
errors <- prediction[, 1] - y
updates <- alpha * (1 / m) * (t(X) %*% errors)
theta <- theta - updates
cost_history[i] <- cross_entropy(X, y, theta)
}
list(theta = theta, cost_history = cost_history)
}
# Tune theta using training set
alpha <- 0.01
num_iters <- 1000
initial_beta <- rep(0, ncol(X.train))
# Tune logistic regression beta using
results <- lapply(1:ncol(Y.train), function(i) {
gradient_descent(X.train, Y.train[, i], theta = initial_beta,
alpha = alpha, num_iters = num_iters)
})
3.5 Multinomial Model I
3.5.1 Setup
# Loads the MNIST dataset, saves as an .RData file if not in WD
if (!(file.exists("mnist_data.RData"))) {
# ## installs older python version
# reticulate::install_python("3.10:latest")
# keras::install_keras(python_version = "3.10")
# ## re-loads keras
# library(keras)
## get MNIST data
mnist <- dataset_mnist()
## save to WD as .RData
save(mnist, file = "mnist_data.RData")
} else {
## read-in MNIST data
load(file = "mnist_data.RData")
}
# Access the training and testing sets
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y
rm(mnist)
## plot function
plot_mnist_array <- function(plt, main_label = NA, color = FALSE, dim_n = 28) {
## setup color
if (color == TRUE) {
colfunc <- colorRampPalette(c("red", "white", "blue"))
min_abs <- -max(abs(range(plt)))
max_abs <- max(abs(range(plt)))
col <- colfunc(256)
} else {
col <- gray((255:0)/255)
min_abs <- 0
max_abs <- 255
}
## create image
image(x = 1:dim_n,
y = 1:dim_n,
## image is oriented incorrectly, this fixes it
z = t(apply(plt, 2, rev)),
col = col,
zlim = c(min_abs, max_abs),
axes = FALSE,
xlab = NA,
ylab = NA)
## create plot border
rect(xleft = 0.5,
ybottom = 0.5,
xright = 28 + 0.5,
ytop = 28 + 0.5,
border = "black",
lwd = 1)
## display prediction result
text(x = 2,
y = dim_n - 3,
labels = ifelse(is.na(main_label),
"",
main_label),
col = ifelse(color == TRUE,
"black",
"red"),
cex = 1.5)
}
## train data
# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
ncol = 28*28)
## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
## get each layer's matrix image, stretch to 28^2 x 1
x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}
x_train_2 <- x_train_2 %>%
as.data.frame()
## test data
x_test_2 <- matrix(nrow = nrow(x_test),
ncol = 28*28)
for (i in 1:nrow(x_test)) {
x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}
x_test_2 <- x_test_2 %>%
as.data.frame()
## re-scale data
x_train_2 <- x_train_2 / 256
x_test_2 <- x_test_2 / 256
## response
# x_test_2$y <- y_test
# x_train_2$y <- y_train
3.5.2 Model
3.5.2.1 train
## set training data size
# n <- nrow(x_train_2)
n <- 100
indices <- sample(x = 1:nrow(x_train_2),
size = n)
## init data
x_multi <- x_train_2[indices, ]
y_multi <- y_train[indices]
## drop cols with all 0s
#x_multi <- x_multi[, (colSums(x_multi) > 0)]
## for the sake of the coefficients viz, setting alpha = 0
init_model <- cv.glmnet(x = x_multi %>% as.matrix,
y = y_multi %>% factor,
family = "multinomial",
alpha = 0)
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
multi_model <- predict(init_model,
x_multi %>% as.matrix,
s = init_model$lambda.min,
type = "response")
## format results
preds_init <- multi_model[, , 1] %>%
as.data.frame()
preds <- apply(X = preds_init,
MARGIN = 1,
FUN = function(x) names(which.max(x)) %>% as.numeric)
## TRAIN confusion matrix
table(y_multi, preds)
## preds
## y_multi 0 1 2 3 4 5 6 7 8 9
## 0 14 0 0 0 0 0 0 0 0 0
## 1 0 10 0 0 0 0 0 0 0 0
## 2 0 0 10 0 0 0 0 0 0 0
## 3 0 0 1 14 0 0 0 0 0 0
## 4 0 0 0 0 7 0 0 0 0 0
## 5 0 0 0 0 0 5 0 0 0 0
## 6 0 0 0 0 0 0 12 0 0 0
## 7 0 1 0 0 0 0 0 5 0 0
## 8 0 0 0 1 0 0 0 0 6 0
## 9 0 0 0 0 0 0 0 0 0 14
## [1] 0.03
3.5.2.2 test
## pre-process data
x_multi_test <- dplyr::select(x_test_2, all_of(names(x_multi)))
## get preds
multi_model_test <- predict(init_model,
as.matrix(x_multi_test),
s = init_model$lambda.min,
type = "response")
## format results
preds_init_test <- multi_model_test[, , 1] %>%
as.data.frame()
preds_test <- apply(X = preds_init_test,
MARGIN = 1,
FUN = function(x) names(which.max(x)) %>% as.numeric)
## TEST confusion matrix
table(y_test, preds_test)
## preds_test
## y_test 0 1 2 3 4 5 6 7 8 9
## 0 950 2 0 12 1 0 9 0 2 4
## 1 0 1074 7 34 0 0 4 0 16 0
## 2 26 67 570 236 16 3 82 1 11 20
## 3 13 9 7 947 0 4 8 2 3 17
## 4 58 35 1 11 402 0 41 0 13 421
## 5 299 104 1 239 16 30 28 4 102 69
## 6 70 26 8 3 1 0 836 0 8 6
## 7 52 55 5 123 7 1 2 393 0 390
## 8 80 82 9 295 5 6 22 0 365 110
## 9 25 25 0 45 33 0 4 19 0 858
## [1] 0.3575
## sort vectors so outputs are grouped
x_test_sort <- x_test[order(y_test), , ]
y_test_sort <- y_test[order(y_test)]
preds_test_sort <- preds_test[order(y_test)]
## get misclassified obs
wrong <- which(!(y_test_sort == preds_test_sort))
## plot a sample of misclassified obs
plot_wrong <- wrong[sample(x = 1:length(wrong), size = 3*8*6)] %>%
sort()
## plot params
par(mfcol = c(8, 6))
par(mar = c(0, 0, 0, 0))
for (i in plot_wrong) {
plot_mnist_array(plt = x_test_sort[i, , ],
main_label = preds_test_sort[i])
}
3.5.3 model heatmaps
## get coefficients into matrices
model_coef <- coef(init_model, s = init_model$lambda.min) %>%
lapply(as.matrix) %>%
lapply(function(x) matrix(x[-1, ], nrow = 28, ncol = 28)) %>%
## take sigmoid activation just to help viz
lapply(function(x) 1 / (1 + exp(-x)) - 0.5)
mapply(FUN = plot_mnist_array,
plt = model_coef,
main_label = names(model_coef),
color = TRUE)
## $`0`
## NULL
##
## $`1`
## NULL
##
## $`2`
## NULL
##
## $`3`
## NULL
##
## $`4`
## NULL
##
## $`5`
## NULL
##
## $`6`
## NULL
##
## $`7`
## NULL
##
## $`8`
## NULL
##
## $`9`
## NULL
3.5.4 no outside cells model
earlier runs of the above sections revealed that for a regularization method that does not perform variable selection, odd importance is given to outermost cell for prediction. Thus, those will be removed:
## set training data size
# n <- nrow(x_train_2)
n <- 1000
indices <- sample(x = 1:nrow(x_train_2),
size = n)
## init data
x_multi <- x_train_2[indices, ]
y_multi <- y_train[indices]
## drop outer cells
x_multi <- x_multi[, rep(seq(146, 622, 28), each = 18) + rep(0:17, times = 18)]
## for the sake of the coefficients viz, setting alpha = 0
init_model <- cv.glmnet(x = x_multi %>% as.matrix,
y = y_multi %>% factor,
family = "multinomial",
alpha = 0)
multi_model <- predict(init_model,
x_multi %>% as.matrix,
s = init_model$lambda.min,
type = "response")
## get coefficients into matrices
model_coef <- coef(init_model, s = init_model$lambda.min) %>%
lapply(as.matrix) %>%
lapply(function(x) matrix(x[-1, ], nrow = 18, ncol = 18)) %>%
## take sigmoid activation just to help viz
lapply(function(x) 1 / (1 + exp(-x)) - 0.5)
mapply(FUN = plot_mnist_array,
plt = model_coef,
main_label = names(model_coef),
color = TRUE,
dim_n = 18)
## $`0`
## NULL
##
## $`1`
## NULL
##
## $`2`
## NULL
##
## $`3`
## NULL
##
## $`4`
## NULL
##
## $`5`
## NULL
##
## $`6`
## NULL
##
## $`7`
## NULL
##
## $`8`
## NULL
##
## $`9`
## NULL
3.6 Multinomial Model II
## Warning: package 'tensorflow' was built under R version 4.2.3
##
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
##
## train
# Loads the MNIST dataset, saves as an .RData file if not in WD
#if (!(file.exists("mnist_data.RData"))) {
## installs older python version
#reticulate::install_python("3.10:latest")
#keras::install_keras(python_version = "3.10")
## re-loads keras
#library(keras)
## get MNIST data
#mnist <- dataset_mnist()
## save to WD as .RData
#save(mnist, file = "mnist_data.RData")
#} else {
## read-in MNIST data
#load(file = "mnist_data.RData")
#}
mnist <- dataset_mnist()
# Access the training and testing sets
train_images <- mnist$train$x
train_labels <- mnist$train$y
test_images <- mnist$test$x
test_labels <- mnist$test$y
# Preprocess the data
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255
# Define the neural network
model2 <- keras_model_sequential() %>%
layer_dense(units = 128, activation = "sigmoid", input_shape = c(784)) %>%
layer_dense(units = 64, activation = "sigmoid") %>%
layer_dense(units = 10, activation = "sigmoid")
# Compile the model
model2 %>% compile(
optimizer = 'adam',
loss = 'binary_crossentropy', # Using binary crossentropy for binary classification
metrics = c('accuracy')
)
# Display the model summary
summary(model2)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_2 (Dense) (None, 128) 100480
## dense_1 (Dense) (None, 64) 8256
## dense (Dense) (None, 10) 650
## ================================================================================
## Total params: 109386 (427.29 KB)
## Trainable params: 109386 (427.29 KB)
## Non-trainable params: 0 (0.00 Byte)
## ________________________________________________________________________________
# Assuming 'sample_image' is the input image
sample_image <- test_images[1, , drop = FALSE]
# Reshape the sample image back to its original 28x28 shape
sample_image_reshaped <- array_reshape(sample_image, c(28, 28))
# Get the output of the first hidden layer
layer1_output <- predict(model2, sample_image)
## 1/1 - 0s - 163ms/epoch - 163ms/step
layer1_output <- as.vector(layer1_output)
# Plot the result of the sigmoid activation for all 128 units in a line chart
ggplot() +
geom_line(aes(x = seq_along(layer1_output)-1, y = layer1_output),
color = "blue") +
labs(title = "Output of Sigmoid Activation in the Output Layer",
x = "Class Index",
y = "Output") +
theme_minimal() +
coord_cartesian(ylim = c(0, 1)) + # Set y-axis limit
scale_x_continuous(breaks = seq(0, 9, by = 1), limits = c(0, 9)) # Set x-axis breaks
\[ Sigmoid \:\: Function: \\ f(x) = \frac{1}{1+e^{-x}} \]
# Preprocess the data
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255
# Define the neural network
model1 <- keras_model_sequential() %>%
layer_dense(units = 128, activation = "sigmoid", input_shape = c(28 * 28)) %>%
layer_dense(units = 10, activation = "softmax")
# Compile the model
model1 %>% compile(
optimizer = 'Adam',
loss = 'sparse_categorical_crossentropy',
metrics = c('accuracy')
)
# Train the model
model1 %>% fit(train_images, train_labels, epochs = 10, batch_size = 64, validation_split = 0.2)
## Epoch 1/10
## 750/750 - 2s - loss: 2.2416 - accuracy: 0.2426 - val_loss: 2.1552 - val_accuracy: 0.4565 - 2s/epoch - 3ms/step
## Epoch 2/10
## 750/750 - 2s - loss: 2.0191 - accuracy: 0.5192 - val_loss: 1.8413 - val_accuracy: 0.5972 - 2s/epoch - 2ms/step
## Epoch 3/10
## 750/750 - 2s - loss: 1.6384 - accuracy: 0.6421 - val_loss: 1.4169 - val_accuracy: 0.7404 - 2s/epoch - 3ms/step
## Epoch 4/10
## 750/750 - 2s - loss: 1.2605 - accuracy: 0.7272 - val_loss: 1.0892 - val_accuracy: 0.7578 - 2s/epoch - 3ms/step
## Epoch 5/10
## 750/750 - 2s - loss: 0.9939 - accuracy: 0.7781 - val_loss: 0.8662 - val_accuracy: 0.8085 - 2s/epoch - 3ms/step
## Epoch 6/10
## 750/750 - 2s - loss: 0.8184 - accuracy: 0.8102 - val_loss: 0.7230 - val_accuracy: 0.8372 - 2s/epoch - 3ms/step
## Epoch 7/10
## 750/750 - 2s - loss: 0.7007 - accuracy: 0.8317 - val_loss: 0.6263 - val_accuracy: 0.8519 - 2s/epoch - 3ms/step
## Epoch 8/10
## 750/750 - 2s - loss: 0.6187 - accuracy: 0.8461 - val_loss: 0.5553 - val_accuracy: 0.8672 - 2s/epoch - 3ms/step
## Epoch 9/10
## 750/750 - 2s - loss: 0.5587 - accuracy: 0.8583 - val_loss: 0.5064 - val_accuracy: 0.8755 - 2s/epoch - 3ms/step
## Epoch 10/10
## 750/750 - 2s - loss: 0.5144 - accuracy: 0.8670 - val_loss: 0.4651 - val_accuracy: 0.8827 - 2s/epoch - 3ms/step
# Evaluate the model on test data
test_loss_and_accuracy <- model1 %>% evaluate(test_images, test_labels)
## 313/313 - 0s - loss: 0.4757 - accuracy: 0.8782 - 257ms/epoch - 820us/step
## Test loss and accuracy: 0.4756537 0.8782
# Sample input image for visualization
sample_image <- test_images[1, , drop = FALSE]
# Reshape the sample image back to its original 28x28 shape
sample_image_reshaped <- array_reshape(sample_image, c(28, 28))
# Predictions from the output layer (softmax activation)
output_layer_output <- predict(model1, sample_image)
## 1/1 - 0s - 27ms/epoch - 27ms/step
output_layer_output <- as.vector(output_layer_output)
# Plot the result of the sigmoid activation in a line chart
ggplot() +
geom_line(aes(x = seq_along(output_layer_output)-1, y = output_layer_output),
color = "blue") +
labs(title = "Output of Softmax Activation in the Output Layer",
x = "Class Index",
y = "Output") +
theme_minimal() +
coord_cartesian(ylim = c(0, 1)) + # Set y-axis limit
scale_x_continuous(breaks = seq(0, 9, by = 1), limits = c(0, 9)) # Set x-axis breaks
\[ Softmax \:\: Function: \\ f(x_i) = \frac{e^{-x_i}}{\sum\limits_{i=1}^Ne^{-x_i}} \]
3.7 Neural Network I
# Preprocess the data
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255
# Define the neural network
model <- keras_model_sequential() %>%
layer_dense(units = 128, activation = "sigmoid", input_shape = c(28 * 28)) %>%
layer_dense(units = 10, activation = "softmax")
# Compile the model
model %>% compile(
optimizer = 'Adam',
loss = 'sparse_categorical_crossentropy',
metrics = c('accuracy')
)
# Train the model
model %>% fit(train_images, train_labels, epochs = 10, batch_size = 64, validation_split = 0.2)
## Epoch 1/10
## 750/750 - 2s - loss: 2.3121 - accuracy: 0.1062 - val_loss: 2.3056 - val_accuracy: 0.1060 - 2s/epoch - 3ms/step
## Epoch 2/10
## 750/750 - 2s - loss: 2.3068 - accuracy: 0.1035 - val_loss: 2.3106 - val_accuracy: 0.0989 - 2s/epoch - 2ms/step
## Epoch 3/10
## 750/750 - 2s - loss: 2.3074 - accuracy: 0.1061 - val_loss: 2.3083 - val_accuracy: 0.1035 - 2s/epoch - 2ms/step
## Epoch 4/10
## 750/750 - 2s - loss: 2.3072 - accuracy: 0.1035 - val_loss: 2.3047 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 5/10
## 750/750 - 2s - loss: 2.3058 - accuracy: 0.1071 - val_loss: 2.3100 - val_accuracy: 0.0975 - 2s/epoch - 2ms/step
## Epoch 6/10
## 750/750 - 2s - loss: 2.3069 - accuracy: 0.1044 - val_loss: 2.3063 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 7/10
## 750/750 - 2s - loss: 2.3061 - accuracy: 0.1085 - val_loss: 2.3039 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 8/10
## 750/750 - 2s - loss: 2.3058 - accuracy: 0.1081 - val_loss: 2.3066 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 9/10
## 750/750 - 2s - loss: 2.3053 - accuracy: 0.1069 - val_loss: 2.3075 - val_accuracy: 0.0998 - 2s/epoch - 2ms/step
## Epoch 10/10
## 750/750 - 2s - loss: 2.3050 - accuracy: 0.1074 - val_loss: 2.3114 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
# Evaluate the model on test data
test_loss_and_accuracy <- model %>% evaluate(test_images, test_labels)
## 313/313 - 0s - loss: 2.3072 - accuracy: 0.1135 - 260ms/epoch - 832us/step
## Test loss and accuracy: 2.307231 0.1135
Reshapes the 28x28 images into flat vectors of size 784 Normalizes the pixel values to the range [0, 1] by dividing by 255
Creates a sequential model using keras_model_sequential() Adds a dense hidden layer with 128 neurons and a sigmoid activation function. Adds the output layer with 10 neurons (one for each digit) and a softmax activation function.
Compiles the model with the Adam optimizer, sparse categorical crossentropy loss function, and accuracy as the evaluation metric.
Trains the model on the training data for 5 epochs with a batch size of 64 and a validation split of 20%
Evaluates the trained model on the test data and prints the test accuracy
## 313/313 - 0s - 230ms/epoch - 735us/step
# Extract the class with the maximum probability for each sample
predicted_labels <- apply(predictions, 1, function(x) which.max(x) - 1)
# Create a data frame for plotting
plot_data <- data.frame(
TrueLabel = as.factor(test_labels),
PredictedLabel = as.factor(predicted_labels)
)
# Plot the results using ggplot2
ggplot(plot_data, aes(x = TrueLabel, fill = PredictedLabel)) +
geom_bar(position = "dodge") +
labs(title = "True vs Predicted Labels",
x = "True Label",
y = "Count",
fill = "Predicted Label")
# Define the updated plot_mnist function
plot_mnist <- function(plt, predictions) {
## create image
image(x = 1:28,
y = 1:28,
## image is oriented incorrectly, this fixes it
z = t(apply(plt, 1, rev)),
## 255:0 puts black on white canvas,
## changing to 0:255 puts white on black canvas
col = gray((255:0)/255),
axes = FALSE)
## create plot border
rect(xleft = 0.5,
ybottom = 0.5,
xright = 28 + 0.5,
ytop = 28 + 0.5,
border = "black",
lwd = 1)
## display prediction result
text(x = 15, y = 25, labels = paste("Prediction:", predictions), col = "red", cex = 1.5)
}
# Display the first 36 predictions for each digit
par(mfcol = c(6, 6))
par(mar = c(0, 0, 0, 0))
for (digit in 0:9) {
cat("Digit", digit, " Predictions:\n")
digit_indices <- which(test_labels == digit)[1:36]
for (i in digit_indices) {
# Reshape the flattened image vector into a 28x28 matrix
img <- matrix(test_images[i, ] * 255, nrow = 28, ncol = 28)
plot_mnist(img, predicted_labels[i])
}
}
## Digit 0 Predictions:
## Digit 1 Predictions:
## Digit 2 Predictions:
## Digit 3 Predictions:
## Digit 4 Predictions:
## Digit 5 Predictions:
## Digit 6 Predictions:
## Digit 7 Predictions:
## Digit 8 Predictions:
## Digit 9 Predictions:
3.8 Neural Network II
# Function to perform gradient descent
gradient_descent <- function(model, x, y, learning_rate, epochs, batch_size) {
history <- data.frame(epoch = integer(), loss = numeric())
for (epoch in 1:epochs) {
# Shuffle the data for each epoch
indices <- sample(1:nrow(x))
x <- x[indices, ]
y <- y[indices]
for (i in seq(1, nrow(x), batch_size)) {
end_index <- min(i + batch_size - 1, nrow(x)) # Ensure the end index is within bounds
x_batch <- x[i:end_index, ]
y_batch <- y[i:end_index]
# Compute gradient and update weights
gradients <- compute_gradients(model, x_batch, y_batch)
model <- update_weights(model, gradients, learning_rate)
# Record loss for plotting
loss <- model %>% evaluate(x_batch, y_batch, verbose = 0)
history <- rbind(history, data.frame(epoch = epoch, loss = loss))
}
}
return(list(model = model, history = history))
}
# Function to compute gradients using tf$GradientTape
compute_gradients <- function(model, x, y) {
with(tf$GradientTape(persistent = TRUE) %as% tape, {
# Forward pass
predictions <- model(x)
loss <- keras$losses$sparse_categorical_crossentropy(y, predictions)
loss_value <- tf$reduce_mean(loss)
# Compute gradients
grads <- tape$gradient(loss_value, model$trainable_variables)
return(list(loss = loss_value, grads = grads))
})
}
# Function to update weights using gradients
update_weights <- function(model, gradients, learning_rate) {
weights <- get_weights(model)
for (i in seq_along(weights)) {
weights[[i]] <- weights[[i]] - learning_rate * gradients$grads[[i]]
}
set_weights(model, weights)
return(model)
}
# Convert labels to one-hot encoding
train_labels_onehot <- to_categorical(train_labels, 10)
# Flatten train_images and test_images to (60000, 28 * 28)
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255
# Define the neural network
model1 <- keras_model_sequential() %>%
layer_dense(units = 128, activation = "sigmoid", input_shape = c(28 * 28)) %>%
layer_dense(units = 10, activation = "softmax")
# Set hyperparameters
learning_rate <- 0.01
epochs <- 10
batch_size <- 64
# Compile the model
model1 %>% compile(
optimizer = 'Adam',
loss = 'sparse_categorical_crossentropy',
metrics = c('accuracy')
)
# Perform gradient descent
result <- gradient_descent(model1, train_images, train_labels_onehot, learning_rate, epochs, batch_size)
# Plot the loss during training
ggplot(result$history, aes(x = epoch, y = loss)) +
geom_line(color = "blue") +
labs(title = "Gradient Descent Progress", x = "Epoch", y = "Loss") +
theme_minimal()
# Train the model
model1 %>% fit(train_images, train_labels, epochs = 10, batch_size = 64, validation_split = 0.2)
## Epoch 1/10
## 750/750 - 8s - loss: 2.5289 - accuracy: 0.1053 - val_loss: 2.3049 - val_accuracy: 0.0975 - 8s/epoch - 10ms/step
## Epoch 2/10
## 750/750 - 7s - loss: 2.3060 - accuracy: 0.1071 - val_loss: 2.3045 - val_accuracy: 0.1035 - 7s/epoch - 9ms/step
## Epoch 3/10
## 750/750 - 7s - loss: 2.3064 - accuracy: 0.1053 - val_loss: 2.3114 - val_accuracy: 0.1060 - 7s/epoch - 9ms/step
## Epoch 4/10
## 750/750 - 7s - loss: 2.3063 - accuracy: 0.1067 - val_loss: 2.3057 - val_accuracy: 0.1081 - 7s/epoch - 9ms/step
## Epoch 5/10
## 750/750 - 7s - loss: 2.3068 - accuracy: 0.1050 - val_loss: 2.3114 - val_accuracy: 0.1035 - 7s/epoch - 9ms/step
## Epoch 6/10
## 750/750 - 7s - loss: 2.3067 - accuracy: 0.1060 - val_loss: 2.3058 - val_accuracy: 0.0989 - 7s/epoch - 10ms/step
## Epoch 7/10
## 750/750 - 7s - loss: 2.3068 - accuracy: 0.1052 - val_loss: 2.3112 - val_accuracy: 0.1060 - 7s/epoch - 9ms/step
## Epoch 8/10
## 750/750 - 7s - loss: 2.3064 - accuracy: 0.1065 - val_loss: 2.3056 - val_accuracy: 0.1035 - 7s/epoch - 9ms/step
## Epoch 9/10
## 750/750 - 7s - loss: 2.3075 - accuracy: 0.1059 - val_loss: 2.3048 - val_accuracy: 0.0989 - 7s/epoch - 9ms/step
## Epoch 10/10
## 750/750 - 7s - loss: 2.3070 - accuracy: 0.1055 - val_loss: 2.3050 - val_accuracy: 0.0997 - 7s/epoch - 10ms/step
# Evaluate the model on test data
test_loss_and_accuracy <- model1 %>% evaluate(test_images, test_labels)
## 313/313 - 1s - loss: 2.3036 - accuracy: 0.1032 - 695ms/epoch - 2ms/step
## Test loss and accuracy: 2.303597 0.1032
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Function to generate background data
generate_bg_data <- function(bg_x, bg_y) {
z_container <- matrix(0, nrow = length(bg_x), ncol = length(bg_y))
for (i in seq_along(bg_x)) {
for (j in seq_along(bg_y)) {
z_container[i, j] <- z(bg_x[i], bg_y[j])
}
}
return(z_container)
}
# Function to generate max and min data
generate_max_min_data <- function(r) {
max_num <- floor(r / 2)
min_num <- floor(-1 * (r / 2))
return(c(max_num, min_num, max_num, min_num))
}
# Define the z function
z <- function(x, y) {
return((x^2 + y^2) / 2)
}
# Generate background data
plot_range <- 50
max_min_data <- generate_max_min_data(plot_range)
bg_x <- seq(max_min_data[2], max_min_data[1], length.out = (max_min_data[1] - max_min_data[2] + 1) * 10)
bg_y <- seq(max_min_data[4], max_min_data[3], length.out = (max_min_data[3] - max_min_data[4] + 1) * 10)
z_data <- generate_bg_data(bg_x, bg_y)
# Define a custom curved surface function
curved_surface <- function(x, y, W = 1) {
return(W * (x^2 + y^2) / 2)
}
# Generate data for the curved surface
x_1 <- seq(-20, 20, length.out = 500)
y_1 <- seq(-20, 20, length.out = 500)
z_1 <- outer(x_1, y_1, curved_surface)
p <- plot_ly(z = z_1, x = x_1, y = y_1, type = "surface")
# Set hyperparameters
lr <- 0.08
epochs <- 40
# Initialize variables
train_x <- 1
train_y <- 2
w_x <- 0.9
w_y <- 0.9
label_x <- 25
label_y <- 23
# Create 3D plot using plotly
plot <- plot_ly(
x = rep(bg_x, each = length(bg_y)),
y = rep(bg_y, times = length(bg_x)),
z = as.vector(z_data),
type = "surface",
colors = colorRamp(c("blue", "red")), opacity = 0.6
) %>%
layout(scene = list(aspectmode = "cube"))
# Training loop
for (e in 1:epochs) {
# Predict
prediction_x <- train_x * w_x
prediction_y <- train_y * w_y
# Calculate error
error_x <- label_x - prediction_x
error_y <- label_y - prediction_y
# Display current error
cat("epoch:", e, ", error_x:", error_x, ", error_y:", error_y, "\n")
# Calculate derivatives
current_z <- z(error_x, error_y)
# Add points to the 3D plot
plot <- plot %>% add_trace(x = error_x, y = error_y, z = current_z, type = "scatter3d", mode = "markers", marker = list(size = 5))
# Update weights
if (error_x != 0 || error_y != 0) {
w_x <- w_x + (error_x * lr) * train_x
w_y <- w_y + (error_y * lr) * train_y
} else {
cat("Error is minimum.\n")
}
}
## epoch: 1 , error_x: 24.1 , error_y: 21.2
## epoch: 2 , error_x: 22.172 , error_y: 14.416
## epoch: 3 , error_x: 20.39824 , error_y: 9.80288
## epoch: 4 , error_x: 18.76638 , error_y: 6.665958
## epoch: 5 , error_x: 17.26507 , error_y: 4.532852
## epoch: 6 , error_x: 15.88386 , error_y: 3.082339
## epoch: 7 , error_x: 14.61316 , error_y: 2.095991
## epoch: 8 , error_x: 13.4441 , error_y: 1.425274
## epoch: 9 , error_x: 12.36857 , error_y: 0.9691861
## epoch: 10 , error_x: 11.37909 , error_y: 0.6590465
## epoch: 11 , error_x: 10.46876 , error_y: 0.4481516
## epoch: 12 , error_x: 9.631261 , error_y: 0.3047431
## epoch: 13 , error_x: 8.86076 , error_y: 0.2072253
## epoch: 14 , error_x: 8.151899 , error_y: 0.1409132
## epoch: 15 , error_x: 7.499747 , error_y: 0.09582099
## epoch: 16 , error_x: 6.899767 , error_y: 0.06515827
## epoch: 17 , error_x: 6.347786 , error_y: 0.04430762
## epoch: 18 , error_x: 5.839963 , error_y: 0.03012918
## epoch: 19 , error_x: 5.372766 , error_y: 0.02048785
## epoch: 20 , error_x: 4.942945 , error_y: 0.01393173
## epoch: 21 , error_x: 4.547509 , error_y: 0.00947358
## epoch: 22 , error_x: 4.183708 , error_y: 0.006442034
## epoch: 23 , error_x: 3.849012 , error_y: 0.004380583
## epoch: 24 , error_x: 3.541091 , error_y: 0.002978797
## epoch: 25 , error_x: 3.257804 , error_y: 0.002025582
## epoch: 26 , error_x: 2.997179 , error_y: 0.001377396
## epoch: 27 , error_x: 2.757405 , error_y: 0.000936629
## epoch: 28 , error_x: 2.536813 , error_y: 0.0006369077
## epoch: 29 , error_x: 2.333868 , error_y: 0.0004330972
## epoch: 30 , error_x: 2.147158 , error_y: 0.0002945061
## epoch: 31 , error_x: 1.975386 , error_y: 0.0002002642
## epoch: 32 , error_x: 1.817355 , error_y: 0.0001361796
## epoch: 33 , error_x: 1.671966 , error_y: 9.260215e-05
## epoch: 34 , error_x: 1.538209 , error_y: 6.296946e-05
## epoch: 35 , error_x: 1.415152 , error_y: 4.281923e-05
## epoch: 36 , error_x: 1.30194 , error_y: 2.911708e-05
## epoch: 37 , error_x: 1.197785 , error_y: 1.979961e-05
## epoch: 38 , error_x: 1.101962 , error_y: 1.346374e-05
## epoch: 39 , error_x: 1.013805 , error_y: 9.155341e-06
## epoch: 40 , error_x: 0.9327007 , error_y: 6.225632e-06
# Define the layout with legend position
plot <- plot %>% layout(scene = list(aspectmode = "cube"),
legend = list(x = 0.8, y = 0.8)) # Adjust the x and y values as needed
# Combine the two plots into one
combined_plot <- subplot(
plot,
p,
nrows = 1,
shareX = TRUE,
shareY = TRUE
)
# Show the combined plot
combined_plot
## Warning: 'layout' objects don't have these attributes: 'NA'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'