Chapter 3 Digit Model

In preparation for neural networks, this brief chapter runs several baseline models on the MNIST handwritten-digit data. First we fit a binomial GLM for each digit and predict the digit whose model outputs the largest probability; then we fit a multinomial GLM to assess the likelihood of every digit simultaneously.

This chapter can be safely skipped.

3.1 Linear Model

# Import dataset
pixel_numbers <- seq(1, 784)
x_column_names <- paste("pixel", pixel_numbers, sep = "")
y_column_names <- c("Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine")
X.train <- read.csv('X_train.csv', header = FALSE, col.names = x_column_names)
X.test <- read.csv('X_test.csv', header = FALSE, col.names = x_column_names)
Y.train <- read.csv('Y_train.csv', header = FALSE, col.names = y_column_names)
Y.test <- read.csv('Y_test.csv', header = FALSE, col.names = y_column_names)

train_set <- cbind(X.train, Y.train)
test_set <- cbind(X.test, Y.test)

# Build logistic regression for each number
lm.fit0 <- glm(Zero ~. -One - Two - Three - Four - Five - Six - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit1 <- glm(One ~. - Zero - Two - Three - Four - Five - Six - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit2 <- glm(Two ~. - Zero - One - Three - Four - Five - Six - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit3 <- glm(Three ~. - Zero - One - Two - Four - Five - Six - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit4 <- glm(Four ~. - Zero - One - Two - Three - Five - Six - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit5 <- glm(Five ~. - Zero - One - Two - Three - Four - Six - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit6 <- glm(Six ~. - Zero - One - Two - Three - Four - Five - Seven - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit7 <- glm(Seven ~. - Zero - One - Two - Three - Four - Five - Six - Eight - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit8 <- glm(Eight ~. - Zero - One - Two - Three - Four - Five - Six - Seven - Nine,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.fit9 <- glm(Nine ~. - Zero - One - Two - Three - Four - Five - Six - Seven - Eight,
               data = train_set, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Define softmax function
softmax <- function(x) {
  exp_x <- exp(x)
  return(exp_x / rowSums(exp_x))
}
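
exp() can overflow for large linear predictors; a numerically stable variant (a sketch, not strictly needed for the probability-scale inputs used here) subtracts each row's maximum first:

# Numerically stable softmax: subtracting the row max leaves the result
# unchanged (the common factor cancels) but prevents exp() overflow
softmax_stable <- function(x) {
  x <- x - apply(x, 1, max)
  exp_x <- exp(x)
  exp_x / rowSums(exp_x)
}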

# Function to predict using the model and new data
predict_response <- function(model, newdata = NULL) {
  as.vector(predict(model, type = "response", newdata = newdata))
}

# --- Training set predictions ---
train.preds <- lapply(list(lm.fit0, lm.fit1, lm.fit2, lm.fit3, lm.fit4, lm.fit5, lm.fit6, lm.fit7, lm.fit8, lm.fit9), predict_response)
train.result.matrix <- do.call(cbind, train.preds)

# Apply softmax and find the predicted classes
softmax_predictions_train <- softmax(train.result.matrix)
predicted_classes_train <- max.col(softmax_predictions_train) - 1
actual_train <- max.col(Y.train) - 1
train_accuracy <- sum(predicted_classes_train == actual_train) / length(actual_train)

# --- Testing set predictions ---
test.preds <- lapply(list(lm.fit0, lm.fit1, lm.fit2, lm.fit3, lm.fit4, lm.fit5, lm.fit6, lm.fit7, lm.fit8, lm.fit9), function(model) predict_response(model, newdata = test_set))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test.result.matrix <- do.call(cbind, test.preds)

# Apply softmax and find the predicted classes
softmax_predictions_test <- softmax(test.result.matrix)
predicted_classes_test <- max.col(softmax_predictions_test) - 1
actual_test <- max.col(Y.test) - 1
test_accuracy <- sum(predicted_classes_test == actual_test) / length(actual_test)

# Print the accuracies
print(paste("Training Set Accuracy:", round(train_accuracy, 3)))
## [1] "Training Set Accuracy: 0.656"
print(paste("Testing Set Accuracy:", round(test_accuracy, 3)))
## [1] "Testing Set Accuracy: 0.633"

3.2 Binomial Model I

3.2.1 Setup

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(keras)
## Warning: package 'keras' was built under R version 4.2.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.3
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(logistf)
## Warning: package 'logistf' was built under R version 4.2.3
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.2.3
library(prediction)
## Warning: package 'prediction' was built under R version 4.2.3
# Loads the MNIST dataset, saves as an .RData file if not in WD
if (!(file.exists("mnist_data.RData"))) {
  
  # ## installs older python version
  # reticulate::install_python("3.10:latest")
  # keras::install_keras(python_version = "3.10")
  # ## re-loads keras
  # library(keras)
  
  ## get MNIST data
  mnist <- dataset_mnist()
  ## save to WD as .RData
  save(mnist, file = "mnist_data.RData")
  
} else {
  ## read-in MNIST data
  load(file = "mnist_data.RData")
}

# Access the training and testing sets
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

rm(mnist)
## plot function, from OG data
plot_mnist <- function(plt) {
  ## create image
  image(x = 1:28,
        y = 1:28,
        ## image is oriented incorrectly, this fixes it
        z = t(apply(plt, 2, rev)),
        ## 255:0 puts black on white canvas,
        ## changing to 0:255 puts white on black canvas
        col = gray((255:0)/255),
        axes = FALSE)
  
  ## create plot border
  rect(xleft = 0.5,
       ybottom = 0.5,
       xright = 28 + 0.5,
       ytop = 28 + 0.5,
       border = "black",
       lwd = 1)
}
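
For reference, a minimal usage example (assuming x_train is the 60000 x 28 x 28 array loaded above):

## draw the first training image
plot_mnist(x_train[1, , ])
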
## train data
# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
                    ncol = 28*28)

## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
  ## get each layer's matrix image, stretch to 28^2 x 1
  x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}

x_train_2 <- x_train_2 %>%
  as.data.frame()

## test data
x_test_2 <- matrix(nrow = nrow(x_test),
                   ncol = 28*28)

for (i in 1:nrow(x_test)) {
  x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}

x_test_2 <- x_test_2 %>%
  as.data.frame()
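
As the comment above concedes, the loops can be avoided entirely: R stores arrays column-major, so a single matrix() call on the 3-D array reproduces the same per-image flattening (a sketch; x_train_fast is an illustrative name):

## vectorized alternative to the reshape loops above
x_train_fast <- matrix(x_train, nrow = nrow(x_train))
## sanity check against the loop-built version (before re-scaling)
stopifnot(all.equal(x_train_fast, as.matrix(x_train_2), check.attributes = FALSE))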


## re-scale data to roughly [0, 1)
## (pixel max is 255; later sections divide by 255 instead)
x_train_2 <- x_train_2 / 256
x_test_2 <- x_test_2 / 256

## response
# x_test_2$y <- y_test
# x_train_2$y <- y_train

3.2.2 Model I

## for speed
# n <- nrow(x_train_2)
n <- 100
indices <- sample(x = 1:nrow(x_train_2),
                  size = n)

## init data
x_glm <- x_train_2[indices, ]
y_glm <- y_train[indices]
train_pred <- list()

## drop cols with all 0s
x_glm <- x_glm[, (colSums(x_glm) > 0)]
## 10-model method: one penalized binomial fit per digit
for (i in 0:9) {
  print(i)

  y_glm_i <- (y_glm == i)

  init_model <- cv.glmnet(x = x_glm %>% as.matrix,
                          y = y_glm_i,
                          family = "binomial",
                          alpha = 1)

  train_pred[[i + 1]] <- predict(init_model,
                                 x_glm %>% as.matrix,
                                 s = init_model$lambda.min,
                                 type = "response")
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## format results
predictions <- data.frame(train_pred)
names(predictions) <- c("zero",
                        "one",
                        "two",
                        "three",
                        "four",
                        "five",
                        "six",
                        "seven",
                        "eight",
                        "nine")

#write.csv(predictions, "pred.csv", row.names = FALSE)

## convert to numeric
max_col <- apply(X = predictions,
                 MARGIN = 1,
                 FUN = function(x) names(x)[which.max(x)])

word_to_number <- c("zero" = 0,
                    "one" = 1,
                    "two" = 2,
                    "three" = 3,
                    "four" = 4,
                    "five" = 5,
                    "six" = 6,
                    "seven" = 7,
                    "eight" = 8,
                    "nine" = 9)

preds <- word_to_number[max_col] %>% as.numeric
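
Equivalently, the column position itself encodes the digit, so the name lookup can be skipped (a sketch; note that max.col breaks ties at random while which.max takes the first maximum):

## column index minus one is the predicted digit
preds_alt <- max.col(as.matrix(predictions)) - 1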

## confusion matrix
table(y_glm, preds)
##      preds
## y_glm  0  1  2  3  4  6  7  8  9
##     0 12  0  0  0  0  0  0  0  0
##     1  0  9  0  0  0  0  0  0  0
##     2  0  0 11  0  0  0  0  0  1
##     3  0  1  1  2  0  0  0  1  0
##     4  0  0  0  0 11  0  0  0  0
##     5  1  1  0  0  0  0  1  0  1
##     6  0  0  0  0  0  9  0  0  0
##     7  0  0  0  0  0  0  5  0  3
##     8  0  0  0  0  0  0  0 11  0
##     9  0  0  0  0  0  0  0  0 19
## misclassification rate
mean(!(y_glm == preds))
## [1] 0.11

3.2.3 Model II

## train data

# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
                    ncol = 28*28)

## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
  ## get each layer's matrix image, stretch to 28^2 x 1
  x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}

x_train_2 <- x_train_2 %>%
  as.data.frame()

## test data
x_test_2 <- matrix(nrow = nrow(x_test),
                   ncol = 28*28)

for (i in 1:nrow(x_test)) {
  x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}

x_test_2 <- x_test_2 %>%
  as.data.frame()
# Initialize a list to store the logistic regression models
logistic_models <- list()

# Initialize a list to store the predicted probabilities for each digit
probs_train_list <- list()
probs_test_list <- list()

# Initialize a list to store the binary predictions for each digit
pred_train_list <- list()
pred_test_list <- list()

# Initialize lists to store log loss and classification error for each digit
log_loss_train_list <- list()
classification_error_train_list <- list()
log_loss_test_list <- list()
classification_error_test_list <- list()

# Iterate over digits from 0 to 9
for (digit in 0:9) {
  # Apply the transformation to y_train and y_test for the current digit
  y_train_digit <- ifelse(y_train == digit, 1, 0)
  y_test_digit <- ifelse(y_test == digit, 1, 0)

  # Fit logistic regression model for the current digit
  logistic_model <- glm(y_train_digit ~ ., family = binomial(link = "logit"), data = x_train_2)
  logistic_models[[as.character(digit)]] <- logistic_model

  # Make predictions on the training set for the current digit
  probs_train <- predict(logistic_model, x_train_2, type = "response")
  probs_train_list[[as.character(digit)]] <- probs_train

  # Convert predicted probabilities to binary predictions (0 or 1)
  pred_train <- ifelse(probs_train > 0.5, 1, 0)
  pred_train_list[[as.character(digit)]] <- pred_train

  # Make predictions on the test set for the current digit
  probs_test <- predict(logistic_model, x_test_2, type = "response")
  probs_test_list[[as.character(digit)]] <- probs_test

  # Convert predicted probabilities to binary predictions (0 or 1)
  pred_test <- ifelse(probs_test > 0.5, 1, 0)
  pred_test_list[[as.character(digit)]] <- pred_test

  # Calculate log loss on the training set for the current digit
  loss_entropy_train <- logLoss(y_train_digit, probs_train)
  cat("Training set log loss for digit", digit, ":", round(loss_entropy_train, 3), "\n")
  
  # Calculate log loss on the test set for the current digit
  loss_entropy_test <- logLoss(y_test_digit, probs_test)
  cat("Test set log loss for digit", digit, ":", round(loss_entropy_test, 3), "\n")

  # Calculate classification error on the training set for the current digit
  classification_error_train <- 1 - sum(pred_train == y_train_digit) / length(y_train_digit)
  cat("Average number of classification errors on training set for digit", digit, ":", round(classification_error_train, 3), "\n")
  
  # Calculate classification error on the test set for the current digit
  classification_error_test <- 1 - sum(pred_test == y_test_digit) / length(y_test_digit)
  cat("Average number of classification errors on test set for digit", digit, ":", round(classification_error_test, 3), "\n")
  
  # Store log loss and classification error in lists
  log_loss_train_list[[as.character(digit)]] <- loss_entropy_train
  classification_error_train_list[[as.character(digit)]] <- classification_error_train
  log_loss_test_list[[as.character(digit)]] <- loss_entropy_test
  classification_error_test_list[[as.character(digit)]] <- classification_error_test
}
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 0 : 0.255 
## Test set log loss for digit 0 : 0.404 
## Average number of classification errors on training set for digit 0 : 0.007 
## Average number of classification errors on test set for digit 0 : 0.011
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 1 : 0.285 
## Test set log loss for digit 1 : 0.371 
## Average number of classification errors on training set for digit 1 : 0.008 
## Average number of classification errors on test set for digit 1 : 0.01
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 2 : 0.696 
## Test set log loss for digit 2 : 0.811 
## Average number of classification errors on training set for digit 2 : 0.019 
## Average number of classification errors on test set for digit 2 : 0.022
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 3 : 0.914 
## Test set log loss for digit 3 : 1.006 
## Average number of classification errors on training set for digit 3 : 0.025 
## Average number of classification errors on test set for digit 3 : 0.028
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 4 : 0.569 
## Test set log loss for digit 4 : 0.681 
## Average number of classification errors on training set for digit 4 : 0.016 
## Average number of classification errors on test set for digit 4 : 0.019
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 5 : 1.049 
## Test set log loss for digit 5 : 1.089 
## Average number of classification errors on training set for digit 5 : 0.029 
## Average number of classification errors on test set for digit 5 : 0.03
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 6 : 0.571 
## Test set log loss for digit 6 : 0.761 
## Average number of classification errors on training set for digit 6 : 0.016 
## Average number of classification errors on test set for digit 6 : 0.021
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 7 : 0.552 
## Test set log loss for digit 7 : 0.728 
## Average number of classification errors on training set for digit 7 : 0.015 
## Average number of classification errors on test set for digit 7 : 0.02
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 8 : 1.427 
## Test set log loss for digit 8 : 1.55 
## Average number of classification errors on training set for digit 8 : 0.04 
## Average number of classification errors on test set for digit 8 : 0.043
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Training set log loss for digit 9 : 1.526 
## Test set log loss for digit 9 : 1.507 
## Average number of classification errors on training set for digit 9 : 0.042 
## Average number of classification errors on test set for digit 9 : 0.042
# Print the results from lists in a single loop
for (digit in 0:9) {
  cat("Digit", digit, "\n")
  cat(" Training Set Log Loss:", round(log_loss_train_list[[as.character(digit)]], 3), "\n")
  cat(" Training Set Classification Error:", round(classification_error_train_list[[as.character(digit)]], 3), "\n")
  cat(" Test Set Log Loss:", round(log_loss_test_list[[as.character(digit)]], 3), "\n")
  cat(" Test Set Classification Error:", round(classification_error_test_list[[as.character(digit)]], 3), "\n")
}
## Digit 0 
##  Training Set Log Loss: 0.255 
##  Training Set Classification Error: 0.007 
##  Test Set Log Loss: 0.404 
##  Test Set Classification Error: 0.011 
## Digit 1 
##  Training Set Log Loss: 0.285 
##  Training Set Classification Error: 0.008 
##  Test Set Log Loss: 0.371 
##  Test Set Classification Error: 0.01 
## Digit 2 
##  Training Set Log Loss: 0.696 
##  Training Set Classification Error: 0.019 
##  Test Set Log Loss: 0.811 
##  Test Set Classification Error: 0.022 
## Digit 3 
##  Training Set Log Loss: 0.914 
##  Training Set Classification Error: 0.025 
##  Test Set Log Loss: 1.006 
##  Test Set Classification Error: 0.028 
## Digit 4 
##  Training Set Log Loss: 0.569 
##  Training Set Classification Error: 0.016 
##  Test Set Log Loss: 0.681 
##  Test Set Classification Error: 0.019 
## Digit 5 
##  Training Set Log Loss: 1.049 
##  Training Set Classification Error: 0.029 
##  Test Set Log Loss: 1.089 
##  Test Set Classification Error: 0.03 
## Digit 6 
##  Training Set Log Loss: 0.571 
##  Training Set Classification Error: 0.016 
##  Test Set Log Loss: 0.761 
##  Test Set Classification Error: 0.021 
## Digit 7 
##  Training Set Log Loss: 0.552 
##  Training Set Classification Error: 0.015 
##  Test Set Log Loss: 0.728 
##  Test Set Classification Error: 0.02 
## Digit 8 
##  Training Set Log Loss: 1.427 
##  Training Set Classification Error: 0.04 
##  Test Set Log Loss: 1.55 
##  Test Set Classification Error: 0.043 
## Digit 9 
##  Training Set Log Loss: 1.526 
##  Training Set Classification Error: 0.042 
##  Test Set Log Loss: 1.507 
##  Test Set Classification Error: 0.042
# Compute the average log loss & classification errors
average_log_loss_train <- mean(sapply(log_loss_train_list, function(x) x))
cat("Average Training Set Log Loss:", round(average_log_loss_train, 3), "\n")
## Average Training Set Log Loss: 0.785
average_classification_error_train <- mean(sapply(classification_error_train_list, function(x) x))
cat("Average Training Set Classification Errors:", round(average_classification_error_train, 3), "\n")
## Average Training Set Classification Errors: 0.022
average_log_loss_test <- mean(sapply(log_loss_test_list, function(x) x))
cat("Average Test Set Log Loss:", round(average_log_loss_test, 3), "\n")
## Average Test Set Log Loss: 0.891
average_classification_error_test <- mean(sapply(classification_error_test_list, function(x) x))
cat("Average Test Set Classification Errors:", round(average_classification_error_test, 3), "\n")
## Average Test Set Classification Errors: 0.025

From the log loss and the average classification error, we can see that the predictions are quite accurate (training set: log loss = 0.785, classification error = 0.022; test set: log loss = 0.891, classification error = 0.025).

3.3 Binomial Model II

# Download the first 2000 observations of the MNIST dataset
mnist <- read.csv("https://pjreddie.com/media/files/mnist_train.csv", nrows = 2000)
colnames(mnist) <- c("Digit", paste("Pixel", 1:784, sep = ""))
save(mnist, file = "mnist_first2000.RData")

3.3.1 KNN

# Load the required packages
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
## The following object is masked from 'package:purrr':
## 
##     lift
library(class)
## Warning: package 'class' was built under R version 4.2.3
# Load the MNIST dataset
load("mnist_first2000.RData")

# Split the dataset into training data (first 1000 rows) and testing data (last 1000 rows)
train_data <- mnist[1:1000,] 
test_data <- mnist[1001:2000,]

# Train a KNN model using 5-fold cross-validation
train_control <- trainControl(method="cv", number=5)  
tune_grid <- expand.grid(.k=1:20)
knn_model <- train(as.factor(Digit)~., data=train_data, method="knn", tuneGrid=tune_grid, trControl=train_control)

# Select the value of K with the lowest cross-validation error
best_k <- knn_model$bestTune$k
print(best_k)
## [1] 1
# Use the KNN algorithm for classification
# (column 1 is the Digit label, so only the pixel columns 2:785 are used)
knn_fit <- knn(train=train_data[, 2:785], test=test_data[, 2:785], cl=train_data$Digit, k=best_k)

# Calculate the prediction error and confusion matrix
error_rate <- mean(knn_fit != test_data$Digit)
confusion_matrix <- table(knn_fit, test_data$Digit)
print(confusion_matrix)
##        
## knn_fit   0   1   2   3   4   5   6   7   8   9
##       0  89   0   0   1   0   1   1   0   2   1
##       1   1 103   4   1   6   0   0   1   1   1
##       2   0   0  86   4   1   0   0   0   0   0
##       3   0   0   0  76   0   6   0   0   1   1
##       4   0   0   1   0  81   0   1   0   1   5
##       5   0   0   0  11   1  76   1   2   3   0
##       6   2   0   0   0   2   3 103   0   3   0
##       7   0   1   5   1   1   0   0 101   0   5
##       8   0   0   3   2   1   1   0   0  70   0
##       9   1   0   0   2  16   2   0   3   4  97
# Print the prediction classification error
print(paste("Prediction classification error:", error_rate))
## [1] "Prediction classification error: 0.118"

3.3.2 Logistic regression

# Load the glmnet package
library(glmnet)

# Load the MNIST dataset
load("mnist_first2000.RData")

# Split the data into training and testing sets
train_data <- mnist[1:1000,]
test_data <- mnist[1001:2000,]

# Extract the predictors and response variables
x_train <- train_data[,2:785]
y_train <- train_data[, 1]
x_test <- test_data[,2:785]
y_test <- test_data[, 1]

# Build the design matrices glmnet expects (model.matrix adds an intercept column)
x_train <- model.matrix(~., data=train_data[, -1])
x_test  <- model.matrix(~., data=test_data[, -1])
# Fit a multi-class logistic regression model with Lasso penalty
fit <- cv.glmnet(x_train, y_train, family="multinomial", alpha=1, type.measure="class")

# Select the best tuning parameter
best_lambda <- fit$lambda.min

# Make predictions on the testing data
pred <- predict(fit, newx=x_test, s=best_lambda, type="class")

# Compute the confusion matrix
confusion_matrix <- table(pred, y_test)

# Compute the prediction classification error
error_rate <- mean(pred != y_test)

# Print the confusion matrix and prediction classification error
print(confusion_matrix)
##     y_test
## pred  0  1  2  3  4  5  6  7  8  9
##    0 83  0  1  0  0  1  2  0  3  4
##    1  0 90  2  1  1  0  1  0  3  0
##    2  0  1 75 12  3  0  4  0  2  0
##    3  1  1  1 63  1  1  0  2  1  2
##    4  1  0  3  0 82  3  1  3  0 10
##    5  5  0  1 10  2 74  3  1  1  1
##    6  0  0  4  1  3  1 94  0  1  0
##    7  1  1  6  2  3  2  0 92  2  6
##    8  2 11  4  5  2  6  1  2 68  2
##    9  0  0  2  4 12  1  0  7  4 85
print(paste("Prediction classification error:", error_rate))
## [1] "Prediction classification error: 0.194"

3.4 Binomial Model III

# Import dataset
pixel_numbers <- seq(1, 784)
x_column_names <- paste("pixel", pixel_numbers, sep = "")
y_column_names <- c("Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine")
X.train <- as.matrix(read.csv('X_train.csv', header = FALSE, col.names = x_column_names))
X.test <- as.matrix(read.csv('X_test.csv', header = FALSE, col.names = x_column_names))
Y.train <- as.matrix(read.csv('Y_train.csv', header = FALSE, col.names = y_column_names))
Y.test <- as.matrix(read.csv('Y_test.csv', header = FALSE, col.names = y_column_names))

train_set <- cbind(X.train, Y.train)
test_set <- cbind(X.test, Y.test)

# Create logistic function without using glm function
# Define sigmoid activation function
sigmoid <- function(x) {
  1 / (1 + exp(-x))
}

# Using cross-entropy as the loss function
cross_entropy <- function(X, y, theta) {
  m <- length(y)
  predictions <- sigmoid(X %*% theta)
  cost <- -(1 / m) * sum(y * log(predictions) + (1 - y) * log(1 - predictions))
  return(cost)
}
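
In symbols, writing sigma for the sigmoid above, the loss minimized over the m training cases is

\[ J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y_i \log \sigma(x_i^{\top}\theta) + (1 - y_i)\log\big(1 - \sigma(x_i^{\top}\theta)\big)\Big] \]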

# We are going to use gradient descent to find the optimal theta value
gradient_descent <- function(X, y, theta, alpha, num_iters) {
  m <- length(y)
  cost_history <- numeric(num_iters)
  
  for (i in 1:num_iters) {
    prediction <- sigmoid(X %*% theta)
    errors <- prediction[, 1] - y
    updates <- alpha * (1 / m) * (t(X) %*% errors)
    theta <- theta - updates
    cost_history[i] <- cross_entropy(X, y, theta)
  }
  
  list(theta = theta, cost_history = cost_history)
}
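
Each pass of the loop applies the corresponding gradient step:

\[ \theta \leftarrow \theta - \frac{\alpha}{m} X^{\top}\big(\sigma(X\theta) - y\big) \]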

# Tune theta using training set
alpha <- 0.01
num_iters <- 1000
initial_beta <- rep(0, ncol(X.train))

# Fit one logistic regression per digit via gradient descent
results <- lapply(1:ncol(Y.train), function(i) {
  gradient_descent(X.train, Y.train[, i], theta = initial_beta,
                   alpha = alpha, num_iters = num_iters)
})
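
The fitted coefficients are not scored above; one way to form test-set predictions from the results list (a sketch using the objects already defined; theta_mat and pred_digit are illustrative names):

## stack the 10 coefficient vectors into a 784 x 10 matrix,
## score the test set, and take the most probable digit per row
theta_mat <- sapply(results, function(r) r$theta)
test_probs <- sigmoid(X.test %*% theta_mat)
pred_digit <- max.col(test_probs) - 1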

3.5 Multinomial Model I

3.5.1 Setup

# Loads the MNIST dataset, saves as an .RData file if not in WD
if (!(file.exists("mnist_data.RData"))) {
  
  # ## installs older python version
  # reticulate::install_python("3.10:latest")
  # keras::install_keras(python_version = "3.10")
  # ## re-loads keras
  # library(keras)
  
  ## get MNIST data
  mnist <- dataset_mnist()
  ## save to WD as .RData
  save(mnist, file = "mnist_data.RData")
  
} else {
  ## read-in MNIST data
  load(file = "mnist_data.RData")
}

# Access the training and testing sets
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

rm(mnist)
## plot function
plot_mnist_array <- function(plt, main_label = NA, color = FALSE, dim_n = 28) {
  
  ## setup color
  if (color == TRUE) {
      colfunc <- colorRampPalette(c("red", "white", "blue"))
      
      min_abs <- -max(abs(range(plt)))
      max_abs <- max(abs(range(plt)))
      
      col <- colfunc(256)
  } else {
    col <- gray((255:0)/255)
    min_abs <- 0
    max_abs <- 255
    }
  
  ## create image
  image(x = 1:dim_n,
        y = 1:dim_n,
        ## image is oriented incorrectly, this fixes it
        z = t(apply(plt, 2, rev)),
        col = col,
        zlim = c(min_abs, max_abs),
        axes = FALSE,
        xlab = NA,
        ylab = NA)
  
  ## create plot border
  rect(xleft = 0.5,
       ybottom = 0.5,
       ## use dim_n so the border also fits the 18 x 18 heatmaps below
       xright = dim_n + 0.5,
       ytop = dim_n + 0.5,
       border = "black",
       lwd = 1)
  
  ## display prediction result
  text(x = 2,
       y = dim_n - 3,
       labels = ifelse(is.na(main_label),
                       "",
                       main_label),
       col = ifelse(color == TRUE,
                    "black",
                    "red"),
       cex = 1.5)
}
## train data

# initialize matrix
x_train_2 <- matrix(nrow = nrow(x_train),
                    ncol = 28*28)

## likely a faster way to do this in the future
for (i in 1:nrow(x_train)) {
  ## get each layer's matrix image, stretch to 28^2 x 1
  x_train_2[i, ] <- matrix(x_train[i, , ], 1, 28*28)
}

x_train_2 <- x_train_2 %>%
  as.data.frame()

## test data
x_test_2 <- matrix(nrow = nrow(x_test),
                   ncol = 28*28)

for (i in 1:nrow(x_test)) {
  x_test_2[i, ] <- matrix(x_test[i, , ], 1, 28*28)
}

x_test_2 <- x_test_2 %>%
  as.data.frame()


## re-scale data
x_train_2 <- x_train_2 / 256
x_test_2 <- x_test_2 / 256

## response
# x_test_2$y <- y_test
# x_train_2$y <- y_train

3.5.2 Model

3.5.2.1 train

## set training data size
# n <- nrow(x_train_2)
n <- 100

indices <- sample(x = 1:nrow(x_train_2),
                  size = n)

## init data
x_multi <- x_train_2[indices, ]
y_multi <- y_train[indices]

## drop cols with all 0s
#x_multi <- x_multi[, (colSums(x_multi) > 0)]
## for the sake of the coefficients viz, setting alpha = 0
init_model <- cv.glmnet(x = x_multi %>% as.matrix,
                        y = y_multi %>% factor,
                        family = "multinomial",
                        alpha = 0)
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
multi_model <- predict(init_model,
                       x_multi %>% as.matrix,
                       s = init_model$lambda.min,
                       type = "response")
## format results
preds_init <- multi_model[, , 1] %>%
  as.data.frame()

preds <- apply(X = preds_init,
               MARGIN = 1,
               FUN = function(x) names(which.max(x)) %>% as.numeric)

## TRAIN confusion matrix
table(y_multi, preds)
##        preds
## y_multi  0  1  2  3  4  5  6  7  8  9
##       0 14  0  0  0  0  0  0  0  0  0
##       1  0 10  0  0  0  0  0  0  0  0
##       2  0  0 10  0  0  0  0  0  0  0
##       3  0  0  1 14  0  0  0  0  0  0
##       4  0  0  0  0  7  0  0  0  0  0
##       5  0  0  0  0  0  5  0  0  0  0
##       6  0  0  0  0  0  0 12  0  0  0
##       7  0  1  0  0  0  0  0  5  0  0
##       8  0  0  0  1  0  0  0  0  6  0
##       9  0  0  0  0  0  0  0  0  0 14
## TRAIN misclassification rate
mean(!(y_multi == preds))
## [1] 0.03

3.5.2.2 test

## pre-process data
x_multi_test <- dplyr::select(x_test_2, all_of(names(x_multi)))

## get preds
multi_model_test <- predict(init_model,
                            as.matrix(x_multi_test),
                            s = init_model$lambda.min,
                            type = "response")
## format results
preds_init_test <- multi_model_test[, , 1] %>%
  as.data.frame()

preds_test <- apply(X = preds_init_test,
               MARGIN = 1,
               FUN = function(x) names(which.max(x)) %>% as.numeric)

## TEST confusion matrix
table(y_test, preds_test)
##       preds_test
## y_test    0    1    2    3    4    5    6    7    8    9
##      0  950    2    0   12    1    0    9    0    2    4
##      1    0 1074    7   34    0    0    4    0   16    0
##      2   26   67  570  236   16    3   82    1   11   20
##      3   13    9    7  947    0    4    8    2    3   17
##      4   58   35    1   11  402    0   41    0   13  421
##      5  299  104    1  239   16   30   28    4  102   69
##      6   70   26    8    3    1    0  836    0    8    6
##      7   52   55    5  123    7    1    2  393    0  390
##      8   80   82    9  295    5    6   22    0  365  110
##      9   25   25    0   45   33    0    4   19    0  858
## TEST misclassification rate
mean(!(y_test == preds_test))
## [1] 0.3575
## sort vectors so outputs are grouped
x_test_sort <- x_test[order(y_test), , ]
y_test_sort <- y_test[order(y_test)]
preds_test_sort <- preds_test[order(y_test)]

## get misclassified obs
wrong <- which(!(y_test_sort == preds_test_sort))

## plot a sample of misclassified obs
plot_wrong <- wrong[sample(x = 1:length(wrong), size = 3*8*6)] %>%
  sort()

## plot params
par(mfcol = c(8, 6))
par(mar = c(0, 0, 0, 0))

for (i in plot_wrong) {
  plot_mnist_array(plt = x_test_sort[i, , ],
                   main_label = preds_test_sort[i])
}

par(mfcol = c(1, 1))

3.5.3 model heatmaps

## get coefficients into matrices
model_coef <- coef(init_model, s = init_model$lambda.min) %>%
  lapply(as.matrix) %>%
  lapply(function(x) matrix(x[-1, ], nrow = 28, ncol = 28)) %>%
  ## take sigmoid activation just to help viz
  lapply(function(x) 1 / (1 + exp(-x)) - 0.5)
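
The transform above maps each coefficient into (-0.5, 0.5), centered at zero, so that white corresponds to a zero coefficient under the red-white-blue palette:

\[ g(\beta) = \frac{1}{1 + e^{-\beta}} - \frac{1}{2} \]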

## invisible() suppresses the list of NULLs that mapply would otherwise
## print (plot_mnist_array is called only for its plotting side effect)
invisible(mapply(FUN = plot_mnist_array,
                 plt = model_coef,
                 main_label = names(model_coef),
                 color = TRUE))


3.5.4 no outside cells model

Earlier runs of the sections above revealed that, with a regularization method that does not perform variable selection, odd importance is assigned to the outermost cells. Those cells are therefore removed:

## set training data size
# n <- nrow(x_train_2)
n <- 1000

indices <- sample(x = 1:nrow(x_train_2),
                  size = n)

## init data
x_multi <- x_train_2[indices, ]
y_multi <- y_train[indices]

## drop outer cells
x_multi <- x_multi[, rep(seq(146, 622, 28), each = 18) + rep(0:17, times = 18)]
## for the sake of the coefficients viz, setting alpha = 0
init_model <- cv.glmnet(x = x_multi %>% as.matrix,
                        y = y_multi %>% factor,
                        family = "multinomial",
                        alpha = 0)

multi_model <- predict(init_model,
                       x_multi %>% as.matrix,
                       s = init_model$lambda.min,
                       type = "response")
## get coefficients into matrices
model_coef <- coef(init_model, s = init_model$lambda.min) %>%
  lapply(as.matrix) %>%
  lapply(function(x) matrix(x[-1, ], nrow = 18, ncol = 18)) %>%
  ## take sigmoid activation just to help viz
  lapply(function(x) 1 / (1 + exp(-x)) - 0.5)

## again wrapped in invisible() to suppress the printed NULL list
invisible(mapply(FUN = plot_mnist_array,
                 plt = model_coef,
                 main_label = names(model_coef),
                 color = TRUE,
                 dim_n = 18))


3.6 Multinomial Model II

library(tensorflow)
## Warning: package 'tensorflow' was built under R version 4.2.3
## 
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
## 
##     train
# Loads the MNIST dataset, saves as an .RData file if not in WD
#if (!(file.exists("mnist_data.RData"))) {
  
  ## installs older python version
  #reticulate::install_python("3.10:latest")
  #keras::install_keras(python_version = "3.10")
  ## re-loads keras
  #library(keras)
  ## get MNIST data
  #mnist <- dataset_mnist()
  ## save to WD as .RData
  #save(mnist, file = "mnist_data.RData")
  
#} else {
  ## read-in MNIST data
  #load(file = "mnist_data.RData")
#}

mnist <- dataset_mnist()

# Access the training and testing sets
train_images <- mnist$train$x
train_labels <- mnist$train$y
test_images <- mnist$test$x
test_labels <- mnist$test$y
# Preprocess the data
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255

# Define the neural network
model2 <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "sigmoid", input_shape = c(784)) %>%
  layer_dense(units = 64, activation = "sigmoid") %>%
  layer_dense(units = 10, activation = "sigmoid")

# Compile the model
model2 %>% compile(
  optimizer = 'adam',
  loss = 'binary_crossentropy',  # treats each of the 10 sigmoid outputs as an independent binary problem
  metrics = c('accuracy')
)

# Display the model summary
summary(model2)
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  dense_2 (Dense)                    (None, 128)                     100480      
##  dense_1 (Dense)                    (None, 64)                      8256        
##  dense (Dense)                      (None, 10)                      650         
## ================================================================================
## Total params: 109386 (427.29 KB)
## Trainable params: 109386 (427.29 KB)
## Non-trainable params: 0 (0.00 Byte)
## ________________________________________________________________________________
# Assuming 'sample_image' is the input image
sample_image <- test_images[1, , drop = FALSE]

# Reshape the sample image back to its original 28x28 shape
sample_image_reshaped <- array_reshape(sample_image, c(28, 28))

# Get the 10 sigmoid outputs of the final layer
# (model2 has not been trained, so these reflect random initial weights)
layer1_output <- predict(model2, sample_image)
## 1/1 - 0s - 163ms/epoch - 163ms/step
layer1_output <- as.vector(layer1_output)

# Plot the sigmoid activations of the 10 output units in a line chart
ggplot() +
  geom_line(aes(x = seq_along(layer1_output)-1, y = layer1_output),
            color = "blue") +
  labs(title = "Output of Sigmoid Activation in the Output Layer",
       x = "Class Index",
       y = "Output") +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 1)) +  # Set y-axis limit
  scale_x_continuous(breaks = seq(0, 9, by = 1), limits = c(0, 9))  # Set x-axis breaks

\[ Sigmoid \:\: Function: \\ f(x) = \frac{1}{1+e^{-x}} \]

# Preprocess the data
# NOTE: train_images and test_images were already flattened and scaled
# to [0, 1] above, so this second division by 255 shrinks the inputs by
# another factor of 255, which slows learning (compare the losses below)
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255

# Define the neural network
model1 <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "sigmoid", input_shape = c(28 * 28)) %>%
  layer_dense(units = 10, activation = "softmax")

# Compile the model
model1 %>% compile(
  optimizer = 'Adam',
  loss = 'sparse_categorical_crossentropy',
  metrics = c('accuracy')
)

# Train the model
model1 %>% fit(train_images, train_labels, epochs = 10, batch_size = 64, validation_split = 0.2)
## Epoch 1/10
## 750/750 - 2s - loss: 2.2416 - accuracy: 0.2426 - val_loss: 2.1552 - val_accuracy: 0.4565 - 2s/epoch - 3ms/step
## Epoch 2/10
## 750/750 - 2s - loss: 2.0191 - accuracy: 0.5192 - val_loss: 1.8413 - val_accuracy: 0.5972 - 2s/epoch - 2ms/step
## Epoch 3/10
## 750/750 - 2s - loss: 1.6384 - accuracy: 0.6421 - val_loss: 1.4169 - val_accuracy: 0.7404 - 2s/epoch - 3ms/step
## Epoch 4/10
## 750/750 - 2s - loss: 1.2605 - accuracy: 0.7272 - val_loss: 1.0892 - val_accuracy: 0.7578 - 2s/epoch - 3ms/step
## Epoch 5/10
## 750/750 - 2s - loss: 0.9939 - accuracy: 0.7781 - val_loss: 0.8662 - val_accuracy: 0.8085 - 2s/epoch - 3ms/step
## Epoch 6/10
## 750/750 - 2s - loss: 0.8184 - accuracy: 0.8102 - val_loss: 0.7230 - val_accuracy: 0.8372 - 2s/epoch - 3ms/step
## Epoch 7/10
## 750/750 - 2s - loss: 0.7007 - accuracy: 0.8317 - val_loss: 0.6263 - val_accuracy: 0.8519 - 2s/epoch - 3ms/step
## Epoch 8/10
## 750/750 - 2s - loss: 0.6187 - accuracy: 0.8461 - val_loss: 0.5553 - val_accuracy: 0.8672 - 2s/epoch - 3ms/step
## Epoch 9/10
## 750/750 - 2s - loss: 0.5587 - accuracy: 0.8583 - val_loss: 0.5064 - val_accuracy: 0.8755 - 2s/epoch - 3ms/step
## Epoch 10/10
## 750/750 - 2s - loss: 0.5144 - accuracy: 0.8670 - val_loss: 0.4651 - val_accuracy: 0.8827 - 2s/epoch - 3ms/step
# Evaluate the model on test data
test_loss_and_accuracy <- model1 %>% evaluate(test_images, test_labels)
## 313/313 - 0s - loss: 0.4757 - accuracy: 0.8782 - 257ms/epoch - 820us/step
cat("Test loss and accuracy:", test_loss_and_accuracy, "\n")
## Test loss and accuracy: 0.4756537 0.8782
# Sample input image for visualization
sample_image <- test_images[1, , drop = FALSE]
# Reshape the sample image back to its original 28x28 shape
sample_image_reshaped <- array_reshape(sample_image, c(28, 28))

# Predictions from the output layer (softmax activation)
output_layer_output <- predict(model1, sample_image)
## 1/1 - 0s - 27ms/epoch - 27ms/step
output_layer_output <- as.vector(output_layer_output)

# Plot the softmax output probabilities in a line chart
ggplot() +
  geom_line(aes(x = seq_along(output_layer_output)-1, y = output_layer_output),
            color = "blue") +
  labs(title = "Output of Softmax Activation in the Output Layer",
       x = "Class Index",
       y = "Output") +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 1)) +  # Set y-axis limit
  scale_x_continuous(breaks = seq(0, 9, by = 1), limits = c(0, 9))  # Set x-axis breaks

\[ Softmax \:\: Function: \\ f(x_i) = \frac{e^{x_i}}{\sum\limits_{j=1}^{N}e^{x_j}} \]

3.7 Neural Network I

# Preprocess the data
# NOTE: this is now the third division by 255 on the same arrays; the
# inputs are vanishingly small, which is why the model below never gets
# past the ~10% chance-level accuracy
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255

# Define the neural network
model <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "sigmoid", input_shape = c(28 * 28)) %>%
  layer_dense(units = 10, activation = "softmax")

# Compile the model
model %>% compile(
  optimizer = 'Adam',
  loss = 'sparse_categorical_crossentropy',
  metrics = c('accuracy')
)

# Train the model
model %>% fit(train_images, train_labels, epochs = 10, batch_size = 64, validation_split = 0.2)
## Epoch 1/10
## 750/750 - 2s - loss: 2.3121 - accuracy: 0.1062 - val_loss: 2.3056 - val_accuracy: 0.1060 - 2s/epoch - 3ms/step
## Epoch 2/10
## 750/750 - 2s - loss: 2.3068 - accuracy: 0.1035 - val_loss: 2.3106 - val_accuracy: 0.0989 - 2s/epoch - 2ms/step
## Epoch 3/10
## 750/750 - 2s - loss: 2.3074 - accuracy: 0.1061 - val_loss: 2.3083 - val_accuracy: 0.1035 - 2s/epoch - 2ms/step
## Epoch 4/10
## 750/750 - 2s - loss: 2.3072 - accuracy: 0.1035 - val_loss: 2.3047 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 5/10
## 750/750 - 2s - loss: 2.3058 - accuracy: 0.1071 - val_loss: 2.3100 - val_accuracy: 0.0975 - 2s/epoch - 2ms/step
## Epoch 6/10
## 750/750 - 2s - loss: 2.3069 - accuracy: 0.1044 - val_loss: 2.3063 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 7/10
## 750/750 - 2s - loss: 2.3061 - accuracy: 0.1085 - val_loss: 2.3039 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 8/10
## 750/750 - 2s - loss: 2.3058 - accuracy: 0.1081 - val_loss: 2.3066 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
## Epoch 9/10
## 750/750 - 2s - loss: 2.3053 - accuracy: 0.1069 - val_loss: 2.3075 - val_accuracy: 0.0998 - 2s/epoch - 2ms/step
## Epoch 10/10
## 750/750 - 2s - loss: 2.3050 - accuracy: 0.1074 - val_loss: 2.3114 - val_accuracy: 0.1060 - 2s/epoch - 2ms/step
# Evaluate the model on test data
test_loss_and_accuracy <- model %>% evaluate(test_images, test_labels)
## 313/313 - 0s - loss: 2.3072 - accuracy: 0.1135 - 260ms/epoch - 832us/step
cat("Test loss and accuracy:", test_loss_and_accuracy, "\n")
## Test loss and accuracy: 2.307231 0.1135

Reshapes the 28x28 images into flat vectors of size 784 and normalizes the pixel values to the range [0, 1] by dividing by 255.

Creates a sequential model using keras_model_sequential(), adds a dense hidden layer with 128 neurons and a sigmoid activation function, and adds an output layer with 10 neurons (one for each digit) and a softmax activation function.

Compiles the model with the Adam optimizer, the sparse categorical crossentropy loss function, and accuracy as the evaluation metric.

Trains the model on the training data for 10 epochs with a batch size of 64 and a validation split of 20%.

Evaluates the trained model on the test data and prints the test accuracy.

# Make predictions using the trained model
predictions <- model %>% predict(test_images)
## 313/313 - 0s - 230ms/epoch - 735us/step
# Extract the class with the maximum probability for each sample
predicted_labels <- apply(predictions, 1, function(x) which.max(x) - 1)

# Create a data frame for plotting
plot_data <- data.frame(
  TrueLabel = as.factor(test_labels),
  PredictedLabel = as.factor(predicted_labels)
)

# Plot the results using ggplot2
ggplot(plot_data, aes(x = TrueLabel, fill = PredictedLabel)) +
  geom_bar(position = "dodge") +
  labs(title = "True vs Predicted Labels",
       x = "True Label",
       y = "Count",
       fill = "Predicted Label")

# Define the updated plot_mnist function
plot_mnist <- function(plt, predictions) {
  ## create image
  image(x = 1:28,
        y = 1:28,
        ## image is oriented incorrectly, this fixes it
        z = t(apply(plt, 1, rev)),
        ## 255:0 puts black on white canvas,
        ## changing to 0:255 puts white on black canvas
        col = gray((255:0)/255),
        axes = FALSE)
  
  ## create plot border
  rect(xleft = 0.5,
       ybottom = 0.5,
       xright = 28 + 0.5,
       ytop = 28 + 0.5,
       border = "black",
       lwd = 1)
  
  ## display prediction result
  text(x = 15, y = 25, labels = paste("Prediction:", predictions), col = "red", cex = 1.5)
}

# Display the first 36 predictions for each digit
par(mfcol = c(6, 6))
par(mar = c(0, 0, 0, 0))

for (digit in 0:9) {
  cat("Digit", digit, " Predictions:\n")
  digit_indices <- which(test_labels == digit)[1:36]
  
  for (i in digit_indices) {
    # Reshape the flattened image vector into a 28x28 matrix
    img <- matrix(test_images[i, ] * 255, nrow = 28, ncol = 28)
    plot_mnist(img, predicted_labels[i])
  }
}
## Digit 0  Predictions:

## Digit 1  Predictions:

## Digit 2  Predictions:

## Digit 3  Predictions:

## Digit 4  Predictions:

## Digit 5  Predictions:

## Digit 6  Predictions:

## Digit 7  Predictions:

## Digit 8  Predictions:

## Digit 9  Predictions:

par(mfcol = c(1, 1))

3.8 Neural Network II

# Function to perform gradient descent
gradient_descent <- function(model, x, y, learning_rate, epochs, batch_size) {
  history <- data.frame(epoch = integer(), loss = numeric())

  for (epoch in 1:epochs) {
    # Shuffle the data for each epoch (y is one-hot, so subset its rows)
    indices <- sample(1:nrow(x))
    x <- x[indices, ]
    y <- y[indices, , drop = FALSE]

    for (i in seq(1, nrow(x), batch_size)) {
      end_index <- min(i + batch_size - 1, nrow(x))  # Ensure the end index is within bounds
      x_batch <- x[i:end_index, ]
      y_batch <- y[i:end_index, , drop = FALSE]

      # Compute gradient and update weights
      gradients <- compute_gradients(model, x_batch, y_batch)
      model <- update_weights(model, gradients, learning_rate)

      # Record the batch loss for plotting (compute_gradients already
      # returns it; evaluate() would balk at one-hot labels given the
      # model's sparse categorical loss)
      loss <- as.numeric(gradients$loss)
      history <- rbind(history, data.frame(epoch = epoch, loss = loss))
    }
  }

  return(list(model = model, history = history))
}


# Function to compute gradients using tf$GradientTape
compute_gradients <- function(model, x, y) {
  with(tf$GradientTape(persistent = TRUE) %as% tape, {
    # Forward pass
    predictions <- model(x)
    loss <- keras$losses$categorical_crossentropy(y, predictions)  # y is one-hot, so the non-sparse loss applies
    loss_value <- tf$reduce_mean(loss)
    
    # Compute gradients
    grads <- tape$gradient(loss_value, model$trainable_variables)
    return(list(loss = loss_value, grads = grads))
  })
}

# Function to update weights using gradients
update_weights <- function(model, gradients, learning_rate) {
  weights <- get_weights(model)
  for (i in seq_along(weights)) {
    ## convert each eager-tensor gradient to an R array before updating
    weights[[i]] <- weights[[i]] - learning_rate * as.array(gradients$grads[[i]])
  }
  set_weights(model, weights)
  return(model)
}
# Convert labels to one-hot encoding
train_labels_onehot <- to_categorical(train_labels, 10)
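
As a quick illustration of the encoding (an added sanity check):

## digit 3 becomes a 1 x 10 indicator row with the 1 in column 4,
## since the columns index the digits 0 through 9
to_categorical(3, num_classes = 10)
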
# Flatten train_images and test_images to (60000, 28 * 28)
# NOTE: as in the previous section, the arrays are already flat and
# repeatedly rescaled; another division by 255 keeps the inputs tiny,
# and the fit below again stalls at chance-level accuracy
train_images <- array_reshape(train_images, c(dim(train_images)[1], 28 * 28)) / 255
test_images <- array_reshape(test_images, c(dim(test_images)[1], 28 * 28)) / 255

# Define the neural network
model1 <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "sigmoid", input_shape = c(28 * 28)) %>%
  layer_dense(units = 10, activation = "softmax")

# Set hyperparameters
learning_rate <- 0.01
epochs <- 10
batch_size <- 64

# Compile the model
model1 %>% compile(
  optimizer = 'Adam',
  loss = 'sparse_categorical_crossentropy',
  metrics = c('accuracy')
)

# Perform gradient descent
result <- gradient_descent(model1, train_images, train_labels_onehot, learning_rate, epochs, batch_size)

# Plot the loss during training
ggplot(result$history, aes(x = epoch, y = loss)) +
  geom_line(color = "blue") +
  labs(title = "Gradient Descent Progress", x = "Epoch", y = "Loss") +
  theme_minimal()

# Train the model
model1 %>% fit(train_images, train_labels, epochs = 10, batch_size = 64, validation_split = 0.2)
## Epoch 1/10
## 750/750 - 8s - loss: 2.5289 - accuracy: 0.1053 - val_loss: 2.3049 - val_accuracy: 0.0975 - 8s/epoch - 10ms/step
## Epoch 2/10
## 750/750 - 7s - loss: 2.3060 - accuracy: 0.1071 - val_loss: 2.3045 - val_accuracy: 0.1035 - 7s/epoch - 9ms/step
## Epoch 3/10
## 750/750 - 7s - loss: 2.3064 - accuracy: 0.1053 - val_loss: 2.3114 - val_accuracy: 0.1060 - 7s/epoch - 9ms/step
## Epoch 4/10
## 750/750 - 7s - loss: 2.3063 - accuracy: 0.1067 - val_loss: 2.3057 - val_accuracy: 0.1081 - 7s/epoch - 9ms/step
## Epoch 5/10
## 750/750 - 7s - loss: 2.3068 - accuracy: 0.1050 - val_loss: 2.3114 - val_accuracy: 0.1035 - 7s/epoch - 9ms/step
## Epoch 6/10
## 750/750 - 7s - loss: 2.3067 - accuracy: 0.1060 - val_loss: 2.3058 - val_accuracy: 0.0989 - 7s/epoch - 10ms/step
## Epoch 7/10
## 750/750 - 7s - loss: 2.3068 - accuracy: 0.1052 - val_loss: 2.3112 - val_accuracy: 0.1060 - 7s/epoch - 9ms/step
## Epoch 8/10
## 750/750 - 7s - loss: 2.3064 - accuracy: 0.1065 - val_loss: 2.3056 - val_accuracy: 0.1035 - 7s/epoch - 9ms/step
## Epoch 9/10
## 750/750 - 7s - loss: 2.3075 - accuracy: 0.1059 - val_loss: 2.3048 - val_accuracy: 0.0989 - 7s/epoch - 9ms/step
## Epoch 10/10
## 750/750 - 7s - loss: 2.3070 - accuracy: 0.1055 - val_loss: 2.3050 - val_accuracy: 0.0997 - 7s/epoch - 10ms/step
# Evaluate the model on test data
test_loss_and_accuracy <- model1 %>% evaluate(test_images, test_labels)
## 313/313 - 1s - loss: 2.3036 - accuracy: 0.1032 - 695ms/epoch - 2ms/step
cat("Test loss and accuracy:", test_loss_and_accuracy, "\n")
## Test loss and accuracy: 2.303597 0.1032
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Function to generate background data
generate_bg_data <- function(bg_x, bg_y) {
  z_container <- matrix(0, nrow = length(bg_x), ncol = length(bg_y))

  for (i in seq_along(bg_x)) {
    for (j in seq_along(bg_y)) {
      z_container[i, j] <- z(bg_x[i], bg_y[j])
    }
  }

  return(z_container)
}

# Function to generate plot bounds: returns c(x_max, x_min, y_max, y_min)
generate_max_min_data <- function(r) {
  max_num <- floor(r / 2)
  min_num <- floor(-1 * (r / 2))
  return(c(max_num, min_num, max_num, min_num))
}

# Define the z function: a quadratic "bowl", z = (x^2 + y^2) / 2
z <- function(x, y) {
  return((x^2 + y^2) / 2)
}

# Generate background data
plot_range <- 50
max_min_data <- generate_max_min_data(plot_range)
bg_x <- seq(max_min_data[2], max_min_data[1], length.out = (max_min_data[1] - max_min_data[2] + 1) * 10)
bg_y <- seq(max_min_data[4], max_min_data[3], length.out = (max_min_data[3] - max_min_data[4] + 1) * 10)
z_data <- generate_bg_data(bg_x, bg_y)



# Define a custom curved surface function
curved_surface <- function(x, y, W = 1) {
  return(W * (x^2 + y^2) / 2)
}

# Generate data for the curved surface
x_1 <- seq(-20, 20, length.out = 500)
y_1 <- seq(-20, 20, length.out = 500)
z_1 <- outer(x_1, y_1, curved_surface)

p <- plot_ly(z = z_1, x = x_1, y = y_1, type = "surface")


# Set hyperparameters
lr <- 0.08
epochs <- 40

# Initialize variables
train_x <- 1
train_y <- 2
w_x <- 0.9
w_y <- 0.9
label_x <- 25
label_y <- 23

# Create 3D plot using plotly (a surface trace takes z as a matrix whose
# rows index y and columns index x, hence the transpose of z_data)
plot <- plot_ly(
  x = bg_x,
  y = bg_y,
  z = t(z_data),
  type = "surface",
  colors = colorRamp(c("blue", "red")), opacity = 0.6
) %>%
  layout(scene = list(aspectmode = "cube"))
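
For reference, the weight update used in the loop below is one-dimensional gradient descent on each axis. With prediction = w * train and error = label - prediction, the squared error E = error^2 / 2 has derivative dE/dw = -error * train, so stepping against the gradient gives w <- w + lr * error * train, which is exactly the update applied to w_x and w_y. The z() bowl above is this squared-error surface, which is why the plotted points march toward its minimum as training proceeds.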

# Training loop
for (e in 1:epochs) {
  # Predict
  prediction_x <- train_x * w_x
  prediction_y <- train_y * w_y

  # Calculate error
  error_x <- label_x - prediction_x
  error_y <- label_y - prediction_y

  # Display current error
  cat("epoch:", e, ",  error_x:", error_x, ", error_y:", error_y, "\n")

  # Height of the error surface at the current (error_x, error_y)
  current_z <- z(error_x, error_y)

  # Add points to the 3D plot
  plot <- plot %>% add_trace(x = error_x, y = error_y, z = current_z, type = "scatter3d", mode = "markers", marker = list(size = 5))

  # Update weights
  if (error_x != 0 || error_y != 0) {
    w_x <- w_x + (error_x * lr) * train_x
    w_y <- w_y + (error_y * lr) * train_y
  } else {
    cat("Error is minimum.\n")
  }
}
## epoch: 1 ,  error_x: 24.1 , error_y: 21.2 
## epoch: 2 ,  error_x: 22.172 , error_y: 14.416 
## epoch: 3 ,  error_x: 20.39824 , error_y: 9.80288 
## epoch: 4 ,  error_x: 18.76638 , error_y: 6.665958 
## epoch: 5 ,  error_x: 17.26507 , error_y: 4.532852 
## epoch: 6 ,  error_x: 15.88386 , error_y: 3.082339 
## epoch: 7 ,  error_x: 14.61316 , error_y: 2.095991 
## epoch: 8 ,  error_x: 13.4441 , error_y: 1.425274 
## epoch: 9 ,  error_x: 12.36857 , error_y: 0.9691861 
## epoch: 10 ,  error_x: 11.37909 , error_y: 0.6590465 
## epoch: 11 ,  error_x: 10.46876 , error_y: 0.4481516 
## epoch: 12 ,  error_x: 9.631261 , error_y: 0.3047431 
## epoch: 13 ,  error_x: 8.86076 , error_y: 0.2072253 
## epoch: 14 ,  error_x: 8.151899 , error_y: 0.1409132 
## epoch: 15 ,  error_x: 7.499747 , error_y: 0.09582099 
## epoch: 16 ,  error_x: 6.899767 , error_y: 0.06515827 
## epoch: 17 ,  error_x: 6.347786 , error_y: 0.04430762 
## epoch: 18 ,  error_x: 5.839963 , error_y: 0.03012918 
## epoch: 19 ,  error_x: 5.372766 , error_y: 0.02048785 
## epoch: 20 ,  error_x: 4.942945 , error_y: 0.01393173 
## epoch: 21 ,  error_x: 4.547509 , error_y: 0.00947358 
## epoch: 22 ,  error_x: 4.183708 , error_y: 0.006442034 
## epoch: 23 ,  error_x: 3.849012 , error_y: 0.004380583 
## epoch: 24 ,  error_x: 3.541091 , error_y: 0.002978797 
## epoch: 25 ,  error_x: 3.257804 , error_y: 0.002025582 
## epoch: 26 ,  error_x: 2.997179 , error_y: 0.001377396 
## epoch: 27 ,  error_x: 2.757405 , error_y: 0.000936629 
## epoch: 28 ,  error_x: 2.536813 , error_y: 0.0006369077 
## epoch: 29 ,  error_x: 2.333868 , error_y: 0.0004330972 
## epoch: 30 ,  error_x: 2.147158 , error_y: 0.0002945061 
## epoch: 31 ,  error_x: 1.975386 , error_y: 0.0002002642 
## epoch: 32 ,  error_x: 1.817355 , error_y: 0.0001361796 
## epoch: 33 ,  error_x: 1.671966 , error_y: 9.260215e-05 
## epoch: 34 ,  error_x: 1.538209 , error_y: 6.296946e-05 
## epoch: 35 ,  error_x: 1.415152 , error_y: 4.281923e-05 
## epoch: 36 ,  error_x: 1.30194 , error_y: 2.911708e-05 
## epoch: 37 ,  error_x: 1.197785 , error_y: 1.979961e-05 
## epoch: 38 ,  error_x: 1.101962 , error_y: 1.346374e-05 
## epoch: 39 ,  error_x: 1.013805 , error_y: 9.155341e-06 
## epoch: 40 ,  error_x: 0.9327007 , error_y: 6.225632e-06
# Define the layout with legend position
plot <- plot %>% layout(scene = list(aspectmode = "cube"),
                        legend = list(x = 0.8, y = 0.8))  # Adjust the x and y values as needed

# Combine the two plots into one
combined_plot <- subplot(
  plot,
  p,
  nrows = 1,
  shareX = TRUE,
  shareY = TRUE
)

# Show the combined plot
combined_plot
## Warning: 'layout' objects don't have these attributes: 'NA'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

3.9 Neural Network III

Softmax regression: the final layer of a neural network.

  1. Data Preprocessing
# Load the data (the CSV has no header row, so header = FALSE)
mnist <- read.csv("https://pjreddie.com/media/files/mnist_train.csv", nrows = 20000, header = FALSE)
colnames(mnist) <- c("Digit", paste("Pixel", 1:784, sep = ""))

# Normalize the pixel values to range [0, 1]
mnist[, -1] <- mnist[, -1] / 255

# Split the dataset into features (X) and labels (y)
X <- as.matrix(mnist[, -1])
y <- mnist$Digit

Explanation: Preprocessing is crucial for machine learning models. Normalizing the pixel values helps in speeding up the convergence during training by ensuring that all features (in this case, pixels) have similar scales. Here, we divide by 255 to scale the pixel values to a [0, 1] range. We also separate the dataset into features (X) and labels (y), which will be used for training the softmax regression model.
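
As a quick sketch, a few one-liners confirm the preprocessing did what we expect:

# Sketch: sanity-check the preprocessed data
range(X)   # pixel values now lie in [0, 1]
dim(X)     # 20000 x 784
table(y)   # rough class balance across the ten digits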

  2. Initializing Parameters
# Initialize weights and biases
num_classes <- length(unique(y))
num_features <- ncol(X)
weights <- matrix(runif(num_features * num_classes) - 0.5, nrow = num_features, ncol = num_classes)
biases <- runif(num_classes) - 0.5

Explanation: Initialization plays a role in the optimization landscape that the training process will navigate. Here, weights and biases are initialized randomly, providing a starting point for optimization. The weight matrix dimensions allow for the calculation of class scores for each of the 10 digits given an input image. Each column in the weight matrix corresponds to a specific class (digit), and biases are used to adjust the output scores independently of the input features.
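
A one-line sketch of the resulting shapes:

# Sketch: one column of weights per digit class
dim(weights)    # 784 x 10
length(biases)  # 10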

  3. Softmax Function
softmax <- function(scores) {
  # Subtract each row's max before exponentiating for numerical stability
  # (softmax is invariant to a constant shift within a row)
  scores <- scores - apply(scores, 1, max)
  exp_scores <- exp(scores)
  probs <- sweep(exp_scores, 1, rowSums(exp_scores), '/')
  return(probs)
}

Explanation: The softmax function ensures that the output values are interpretable as probabilities, as they are positive and sum up to 1 for each input. This function is essential for multi-class classification tasks like MNIST, where each output corresponds to the model’s confidence in each class. Applying softmax allows us to use cross-entropy as a loss function, which measures the difference between the predicted probabilities and the actual distribution of the labels.
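
A tiny worked example (a sketch with made-up scores) shows both properties at once:

# Sketch: each output row is positive and sums to 1
scores_demo <- matrix(c(2, 1, 0.1,
                        0, 0, 0), nrow = 2, byrow = TRUE)
softmax(scores_demo)
# row 1: ~0.66 0.24 0.10; row 2: uniform thirds
rowSums(softmax(scores_demo))  # 1 1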

  4. Cross-Entropy Loss
cross_entropy_loss <- function(probs, y) {
  # Convert labels to one-hot encoding (R is 1-indexed, digits are 0-9)
  y_one_hot <- matrix(0, nrow = nrow(probs), ncol = ncol(probs))
  y_one_hot[cbind(1:nrow(probs), y + 1)] <- 1

  # Average negative log-probability of the true class
  loss <- -sum(y_one_hot * log(probs)) / nrow(probs)
  return(loss)
}

Explanation: Cross-entropy loss is a key component in training classification models, as it provides a measure of how different the predicted probabilities are from the actual labels. In this implementation, we first convert the labels into a one-hot encoded format, where each label is represented as a vector of zeros except for a single one at the index corresponding to the class label. The loss is calculated by taking the negative log of the predicted probabilities for the actual classes and averaging over all examples. This loss function encourages the model to assign high probabilities to the correct classes.
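
Again a small sketch with made-up probabilities: a confident correct prediction is cheap, an unsure one is costly.

# Sketch: loss for a confident vs. an unsure prediction of class 0
probs_demo <- matrix(c(0.90, 0.05, 0.05,
                       0.34, 0.33, 0.33), nrow = 2, byrow = TRUE)
cross_entropy_loss(probs_demo, y = c(0, 0))
# = (-log(0.90) - log(0.34)) / 2, roughly 0.59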

  5. Forward Propagation and Class Score Calculation
forward_propagation <- function(X, weights, biases) {
  # Add the bias vector to every row of the score matrix
  scores <- X %*% weights + matrix(biases, nrow = nrow(X), ncol = length(biases), byrow = TRUE)
  probs <- softmax(scores)
  return(list(scores = scores, probs = probs))
}

Explanation: Forward propagation calculates the weighted sum of inputs plus biases for each class, which are the raw class scores. The softmax function is then applied to these scores to derive probabilities. Each probability indicates the likelihood of an input belonging to a particular class. This step is crucial for both training (to compute the loss and gradients) and making predictions.
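
A shape-tracing sketch on the first few images, using the randomly initialized parameters from above (fw_demo is a throwaway name):

# Sketch: trace the shapes through one forward pass
fw_demo <- forward_propagation(X[1:5, ], weights, biases)
dim(fw_demo$scores)     # 5 x 10 raw class scores
rowSums(fw_demo$probs)  # all 1: valid probability rows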

  6. Computing the Gradient
compute_gradients <- function(X, y, probs, num_classes) {
  # Convert labels to one-hot encoding
  y_one_hot <- matrix(0, nrow = nrow(probs), ncol = num_classes)
  y_one_hot[cbind(1:nrow(probs), y + 1)] <- 1
  
  # Compute gradients
  dW <- t(X) %*% (probs - y_one_hot) / nrow(X)
  dB <- colSums(probs - y_one_hot) / nrow(X)
  
  return(list(dW=dW, dB=dB))
}

Explanation: The gradient computation is a critical step in training neural networks. Here, dW is the gradient of the loss with respect to the weights, and dB is the gradient with respect to the biases. These gradients indicate how much a change in each parameter would affect the loss. By subtracting a portion of these gradients from the parameters, we iteratively reduce the loss, making the model more accurate.
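
A classic way to gain confidence in these formulas is a finite-difference spot check: perturb one weight by a small eps and compare the numerical slope of the loss to the analytic gradient. The sketch below uses an arbitrary weight index and a small subsample.

# Sketch: analytic vs. numerical gradient for a single weight
idx_r <- 350; idx_c <- 1; eps <- 1e-5
X_s <- X[1:50, ]; y_s <- y[1:50]
probs_s <- forward_propagation(X_s, weights, biases)$probs
analytic <- compute_gradients(X_s, y_s, probs_s, num_classes)$dW[idx_r, idx_c]
w_plus <- weights;  w_plus[idx_r, idx_c]  <- w_plus[idx_r, idx_c] + eps
w_minus <- weights; w_minus[idx_r, idx_c] <- w_minus[idx_r, idx_c] - eps
loss_plus  <- cross_entropy_loss(forward_propagation(X_s, w_plus, biases)$probs, y_s)
loss_minus <- cross_entropy_loss(forward_propagation(X_s, w_minus, biases)$probs, y_s)
cat("analytic:", analytic, " numeric:", (loss_plus - loss_minus) / (2 * eps), "\n")
# the two values should agree to several decimal places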

  7. Parameter Update (Gradient Descent)
update_parameters <- function(weights, biases, dW, dB, learning_rate) {
  weights <- weights - learning_rate * dW
  biases <- biases - learning_rate * dB
  return(list(weights=weights, biases=biases))
}

Explanation: This function updates the model parameters in the direction that reduces the loss, as indicated by the gradients. The learning rate controls how big a step we take during each update. If the learning rate is too high, we might overshoot the minimum; if it’s too low, training might take too long.
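
The effect is easy to see in a sketch: take one step from the random initialization with a modest and with a deliberately aggressive learning rate, then compare the batch loss (lr_demo is a throwaway name; the large value is chosen only to illustrate overshooting).

# Sketch: one update step under two learning rates
g <- compute_gradients(X[1:256, ], y[1:256],
                       forward_propagation(X[1:256, ], weights, biases)$probs,
                       num_classes)
for (lr_demo in c(0.01, 25)) {
  upd <- update_parameters(weights, biases, g$dW, g$dB, lr_demo)
  l <- cross_entropy_loss(forward_propagation(X[1:256, ], upd$weights, upd$biases)$probs,
                          y[1:256])
  cat("lr =", lr_demo, "-> batch loss:", l, "\n")  # the huge step tends to overshoot
}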

  8. Model Evaluation
evaluate_accuracy <- function(probs, y) {
  predictions <- max.col(probs) - 1
  accuracy <- sum(predictions == y) / length(y)
  return(accuracy)
}

Explanation: After training, evaluating the model’s performance on unseen data is crucial. Accuracy measures the proportion of correctly predicted instances. It provides a straightforward metric to gauge how well our model generalizes from the training data to new, unseen data.
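
As a baseline sketch, accuracy with the still-untrained parameters should sit near chance, about 0.1 for ten roughly balanced classes:

# Sketch: accuracy before any training, on the first 1000 examples
probs0 <- forward_propagation(X[1:1000, ], weights, biases)$probs
evaluate_accuracy(probs0, y[1:1000])  # roughly 0.1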

  9. Model Encapsulation
train_model <- function(X, y, num_classes, learning_rate, epochs, batch_size) {
  num_features <- ncol(X)
  
  # Initialize weights and biases
  weights <- matrix(runif(num_features * num_classes) - 0.5, nrow = num_features, ncol = num_classes)
  biases <- runif(num_classes) - 0.5
  
  for (epoch in 1:epochs) {
    # Mini-batch gradient descent
    for (i in seq(1, nrow(X), by=batch_size)) {
      batch_indices <- i:min(i+batch_size-1, nrow(X))
      X_batch <- X[batch_indices, ]
      y_batch <- y[batch_indices]
      
      # Forward propagation
      forward_result <- forward_propagation(X_batch, weights, biases)
      probs <- forward_result$probs
      
      # Compute loss (optional here, mainly for monitoring)
      loss <- cross_entropy_loss(probs, y_batch)
      
      # Compute gradients
      gradients <- compute_gradients(X_batch, y_batch, probs, num_classes)
      
      # Update parameters
      update_result <- update_parameters(weights, biases, gradients$dW, gradients$dB, learning_rate)
      weights <- update_result$weights
      biases <- update_result$biases
    }
    
    # Optionally evaluate the model on the training set or a validation set
    if (epoch %% 5 == 0) {
      forward_result <- forward_propagation(X, weights, biases)
      accuracy <- evaluate_accuracy(forward_result$probs, y)
      cat("Epoch", epoch, ": Training accuracy =", accuracy, "\n")
    }
  }
  
  return(list(weights=weights, biases=biases))
}

Explanation:

- Initialization: We start by initializing the weights and biases with small random values.
- Training Loop: The outer loop iterates over the number of epochs. Each epoch is a full pass through the training dataset.
- Mini-Batch Gradient Descent: Within each epoch, the dataset is divided into mini-batches. This stabilizes the gradient computation and can lead to faster convergence.
- Forward Propagation: For each batch, we compute the class probabilities using the current weights and biases.
- Loss Computation: While not strictly necessary for the update step, calculating the loss lets us monitor training progress.
- Gradient Computation: We compute the gradients of the loss with respect to the weights and biases.
- Parameter Update: Using these gradients, we update the model parameters.
- Evaluation: Periodically, we evaluate the model on the entire dataset (or a separate validation set) to monitor its accuracy.

This function is a basic framework for training a softmax regression model. In practice, you might enhance it with validation checks, early stopping, learning-rate schedules, or regularization to improve performance and prevent overfitting; a small schedule sketch follows below.
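
As a sketch of one such enhancement (decayed_lr is a made-up helper, not used in the training run below), a step-decay schedule that halves the learning rate every 50 epochs looks like this:

# Sketch: step-decay learning-rate schedule
decayed_lr <- function(base_lr, epoch, drop = 0.5, every = 50) {
  base_lr * drop^floor(epoch / every)
}
decayed_lr(0.01, 0)    # 0.0100
decayed_lr(0.01, 100)  # 0.0025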

  10. Model Training
# Set hyperparameters
learning_rate <- 0.01
epochs <- 200  # Number of passes through the entire dataset
batch_size <- 100  # Number of training examples used in one iteration
num_classes <- length(unique(y))

# Train the model
model <- train_model(X, y, num_classes, learning_rate, epochs, batch_size)
## Epoch 5 : Training accuracy = 0.73405 
## Epoch 10 : Training accuracy = 0.81045 
## Epoch 15 : Training accuracy = 0.83965 
## Epoch 20 : Training accuracy = 0.85385 
## Epoch 25 : Training accuracy = 0.86305 
## Epoch 30 : Training accuracy = 0.87045 
## Epoch 35 : Training accuracy = 0.87535 
## Epoch 40 : Training accuracy = 0.8809 
## Epoch 45 : Training accuracy = 0.88395 
## Epoch 50 : Training accuracy = 0.88685 
## Epoch 55 : Training accuracy = 0.88905 
## Epoch 60 : Training accuracy = 0.8918 
## Epoch 65 : Training accuracy = 0.89385 
## Epoch 70 : Training accuracy = 0.8959 
## Epoch 75 : Training accuracy = 0.8981 
## Epoch 80 : Training accuracy = 0.8992 
## Epoch 85 : Training accuracy = 0.90045 
## Epoch 90 : Training accuracy = 0.9017 
## Epoch 95 : Training accuracy = 0.90295 
## Epoch 100 : Training accuracy = 0.90415 
## Epoch 105 : Training accuracy = 0.9053 
## Epoch 110 : Training accuracy = 0.9059 
## Epoch 115 : Training accuracy = 0.9069 
## Epoch 120 : Training accuracy = 0.9082 
## Epoch 125 : Training accuracy = 0.90905 
## Epoch 130 : Training accuracy = 0.91 
## Epoch 135 : Training accuracy = 0.9105 
## Epoch 140 : Training accuracy = 0.9114 
## Epoch 145 : Training accuracy = 0.91185 
## Epoch 150 : Training accuracy = 0.9125 
## Epoch 155 : Training accuracy = 0.91335 
## Epoch 160 : Training accuracy = 0.914 
## Epoch 165 : Training accuracy = 0.9145 
## Epoch 170 : Training accuracy = 0.9151 
## Epoch 175 : Training accuracy = 0.9157 
## Epoch 180 : Training accuracy = 0.9162 
## Epoch 185 : Training accuracy = 0.91685 
## Epoch 190 : Training accuracy = 0.9173 
## Epoch 195 : Training accuracy = 0.9176 
## Epoch 200 : Training accuracy = 0.9182
  11. Testing: Data Preprocessing
# Load the next 20000 rows as a held-out test split
mnist_test <- read.csv("https://pjreddie.com/media/files/mnist_train.csv", skip = 20000, nrows = 20000, header = FALSE)
colnames(mnist_test) <- c("Digit", paste("Pixel", 1:784, sep = ""))

# Normalize the pixel values to range [0, 1]
mnist_test[, -1] <- mnist_test[, -1] / 255

# Split the dataset into features (X) and labels (y)
X_test <- as.matrix(mnist_test[, -1])
y_test <- mnist_test$Digit
  12. Model Evaluation
# Perform forward propagation on the test set to get probabilities
forward_result_test <- forward_propagation(X_test, model$weights, model$biases)
probs_test <- forward_result_test$probs

# Make predictions by selecting the class with the highest probability for each test example
predictions_test <- max.col(probs_test) - 1  # Adjust for R's 1-indexing

# Calculate the accuracy on the test set
accuracy_test <- sum(predictions_test == y_test) / length(y_test)

# Print the test accuracy
cat("Test Accuracy:", accuracy_test, "\n")
## Test Accuracy: 0.9008
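
Beyond the single accuracy figure, a confusion matrix built from the predictions just computed shows which digits the model mixes up:

# Sketch: rows are predicted digits, columns are true digits
table(predicted = predictions_test, actual = y_test)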

3.10 Neural Network IV

  1. Data Preprocessing
# Load the data (again with header = FALSE, since the CSV has no header row)
mnist <- read.csv("https://pjreddie.com/media/files/mnist_train.csv", nrows = 20000, header = FALSE)
colnames(mnist) <- c("Digit", paste("Pixel", 1:784, sep = ""))

# Normalize the pixel values to range [0, 1]
mnist[, -1] <- mnist[, -1] / 255

# Split the dataset into features (X) and labels (y)
X <- as.matrix(mnist[, -1])
y <- mnist$Digit
num_classes <- length(unique(y))  
y_one_hot <- matrix(0, nrow = length(y), ncol = num_classes)  
y_one_hot[cbind(1:nrow(y_one_hot), y + 1)] <- 1 
  2. Initialize Parameters

First, we initialize the weights and biases for a network with two hidden layers. Each layer’s weights are initialized randomly to break symmetry, using the He scaling sd = sqrt(2 / fan_in) that suits ReLU-style activations, and biases are initialized to zero.
initialize_params <- function(input_layer_size, hidden_layer1_size, hidden_layer2_size, output_layer_size) {
  list(
    W1 = matrix(rnorm(input_layer_size * hidden_layer1_size, mean = 0, sd = sqrt(2 / input_layer_size)), nrow = input_layer_size, ncol = hidden_layer1_size),
    b1 = matrix(0, nrow = 1, ncol = hidden_layer1_size),
    W2 = matrix(rnorm(hidden_layer1_size * hidden_layer2_size, mean = 0, sd = sqrt(2 / hidden_layer1_size)), nrow = hidden_layer1_size, ncol = hidden_layer2_size),
    b2 = matrix(0, nrow = 1, ncol = hidden_layer2_size),
    W3 = matrix(rnorm(hidden_layer2_size * output_layer_size, mean = 0, sd = sqrt(2 / hidden_layer2_size)), nrow = hidden_layer2_size, ncol = output_layer_size),
    b3 = matrix(0, nrow = 1, ncol = output_layer_size)
  )
}
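
As a quick sketch, the parameter shapes for the 784-512-256-10 network trained later can be inspected directly (params_demo is a throwaway name):

# Sketch: inspect parameter dimensions
params_demo <- initialize_params(784, 512, 256, 10)
sapply(params_demo, function(p) paste(dim(p), collapse = " x "))
# W1 "784 x 512", b1 "1 x 512", W2 "512 x 256", b2 "1 x 256", W3 "256 x 10", b3 "1 x 10"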
  3. Activation Functions

Define the activation functions: a leaky ReLU for the hidden layers (a small slope alpha for negative inputs rather than a hard zero) and softmax for the output layer:
# Leaky ReLU: identity for positive inputs, slope alpha otherwise
relu <- function(z, alpha = 0.01) {
  return(ifelse(z > 0, z, alpha * z))
}

softmax <- function(z) {
  z <- z - apply(z, 1, max)  # row-wise shift for numerical stability
  exp(z) / rowSums(exp(z))
}
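
A one-line sketch of the leaky behaviour: positive inputs pass through unchanged while negative ones are damped by alpha.

# Sketch: leaky ReLU on a few sample inputs
relu(c(-2, -0.5, 0, 0.5, 2))
# -0.020 -0.005  0.000  0.500  2.000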
  4. Forward Propagation

The forward propagation function computes the activations for each layer using the weights, biases, and activation functions:
forward_propagation <- function(X, params) {
  Z1 = X %*% params$W1 + matrix(params$b1, nrow = nrow(X), ncol = ncol(params$b1), byrow = TRUE)
  A1 = relu(Z1)
  Z2 = A1 %*% params$W2 + matrix(params$b2, nrow = nrow(A1), ncol = ncol(params$b2), byrow = TRUE)
  A2 = relu(Z2)
  Z3 = A2 %*% params$W3 + matrix(params$b3, nrow = nrow(A2), ncol = ncol(params$b3), byrow = TRUE)
  A3 = softmax(Z3)
  
  return(list(A1 = A1, A2 = A2, A3 = A3, Z1 = Z1, Z2 = Z2, Z3 = Z3))
}
  5. Cross-Entropy Loss

The cross-entropy loss function evaluates the performance of the model:
cross_entropy_loss <- function(y_pred, y_true) {
  -mean(rowSums(y_true * log(y_pred)))
}
  6. Backpropagation

The backpropagation function computes gradients for each parameter in the network by applying the chain rule:
backpropagation <- function(X, Y, params, forward_outputs, learning_rate) {
  A1 <- forward_outputs$A1
  A2 <- forward_outputs$A2
  A3 <- forward_outputs$A3
  Z1 <- forward_outputs$Z1
  Z2 <- forward_outputs$Z2

  # Derivative of the leaky ReLU: 1 for positive inputs, alpha otherwise
  leaky_relu_derivative <- function(Z, alpha = 0.01) {
    dZ <- matrix(alpha, nrow = nrow(Z), ncol = ncol(Z))
    dZ[Z > 0] <- 1
    return(dZ)
  }

  # Output layer: softmax + cross-entropy gradient
  dZ3 <- A3 - Y
  dW3 <- t(A2) %*% dZ3 / nrow(X)
  db3 <- colSums(dZ3) / nrow(X)

  # Second hidden layer
  dZ2 <- (dZ3 %*% t(params$W3)) * leaky_relu_derivative(Z2)
  dW2 <- t(A1) %*% dZ2 / nrow(X)
  db2 <- colSums(dZ2) / nrow(X)

  # First hidden layer
  dZ1 <- (dZ2 %*% t(params$W2)) * leaky_relu_derivative(Z1)
  dW1 <- t(X) %*% dZ1 / nrow(X)
  db1 <- colSums(dZ1) / nrow(X)

  # Gradient descent step on every parameter
  params$W1 <- params$W1 - learning_rate * dW1
  params$b1 <- params$b1 - learning_rate * db1
  params$W2 <- params$W2 - learning_rate * dW2
  params$b2 <- params$b2 - learning_rate * db2
  params$W3 <- params$W3 - learning_rate * dW3
  params$b3 <- params$b3 - learning_rate * db3

  return(params)
}
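
One step above deserves a note: dZ3 <- A3 - Y is the entire gradient of the softmax-plus-cross-entropy output layer. When softmax probabilities are fed into the cross-entropy loss, the derivative of the loss with respect to the pre-softmax scores simplifies to the predicted probabilities minus the one-hot labels, which is why no explicit softmax derivative appears in the code.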
  7. Training the Model

Train the model over multiple epochs, adjusting the parameters using the gradients computed by backpropagation:
train_model <- function(train_data, train_labels, learning_rate, epochs, hidden_layer_sizes, batch_size) {
  input_layer_size <- ncol(train_data)
  num_classes <- ncol(train_labels)
  params <- initialize_params(input_layer_size, hidden_layer_sizes[1],hidden_layer_sizes[2], num_classes)
  
  for (epoch in 1:epochs) {
    for (i in seq(1, nrow(train_data), by=batch_size)) {
      batch_indices <- i:min(i+batch_size-1, nrow(train_data))
      X_batch <- train_data[batch_indices, ]
      Y_batch <- train_labels[batch_indices, ]
      
      forward_outputs <- forward_propagation(X_batch, params)
      params <- backpropagation(X_batch, Y_batch, params, forward_outputs, learning_rate)
    }
  
    # Evaluate training accuracy on the full training set after each epoch
    forward_outputs <- forward_propagation(train_data, params)
    predictions <- max.col(forward_outputs$A3) - 1
    true_labels <- max.col(train_labels) - 1  # decode the one-hot labels
    accuracy <- sum(predictions == true_labels) / length(true_labels)
    cat(sprintf("Epoch %d: Training Accuracy: %.4f\n", epoch, accuracy))
  }
  
  return(params)
}
  8. Model Training
learning_rate <- 0.01
epochs <- 200  # Number of passes through the entire dataset
hidden_layer_sizes <- c(512, 256)
batch_size <- 128
# Train the model
model <- train_model(X, y_one_hot, learning_rate, epochs, hidden_layer_sizes, batch_size)
## Epoch 1: Training Accuracy: 0.8104
## Epoch 2: Training Accuracy: 0.8577
## Epoch 3: Training Accuracy: 0.8786
## Epoch 4: Training Accuracy: 0.8918
## Epoch 5: Training Accuracy: 0.8988
## Epoch 6: Training Accuracy: 0.9056
## Epoch 7: Training Accuracy: 0.9106
## Epoch 8: Training Accuracy: 0.9142
## Epoch 9: Training Accuracy: 0.9181
## Epoch 10: Training Accuracy: 0.9214
## Epoch 11: Training Accuracy: 0.9234
## Epoch 12: Training Accuracy: 0.9263
## Epoch 13: Training Accuracy: 0.9288
## Epoch 14: Training Accuracy: 0.9314
## Epoch 15: Training Accuracy: 0.9334
## Epoch 16: Training Accuracy: 0.9353
## Epoch 17: Training Accuracy: 0.9376
## Epoch 18: Training Accuracy: 0.9390
## Epoch 19: Training Accuracy: 0.9412
## Epoch 20: Training Accuracy: 0.9426
## Epoch 21: Training Accuracy: 0.9439
## Epoch 22: Training Accuracy: 0.9451
## Epoch 23: Training Accuracy: 0.9464
## Epoch 24: Training Accuracy: 0.9472
## Epoch 25: Training Accuracy: 0.9486
## Epoch 26: Training Accuracy: 0.9503
## Epoch 27: Training Accuracy: 0.9514
## Epoch 28: Training Accuracy: 0.9530
## Epoch 29: Training Accuracy: 0.9537
## Epoch 30: Training Accuracy: 0.9547
## Epoch 31: Training Accuracy: 0.9554
## Epoch 32: Training Accuracy: 0.9565
## Epoch 33: Training Accuracy: 0.9574
## Epoch 34: Training Accuracy: 0.9585
## Epoch 35: Training Accuracy: 0.9595
## Epoch 36: Training Accuracy: 0.9603
## Epoch 37: Training Accuracy: 0.9613
## Epoch 38: Training Accuracy: 0.9619
## Epoch 39: Training Accuracy: 0.9626
## Epoch 40: Training Accuracy: 0.9632
## Epoch 41: Training Accuracy: 0.9637
## Epoch 42: Training Accuracy: 0.9644
## Epoch 43: Training Accuracy: 0.9653
## Epoch 44: Training Accuracy: 0.9659
## Epoch 45: Training Accuracy: 0.9668
## Epoch 46: Training Accuracy: 0.9674
## Epoch 47: Training Accuracy: 0.9681
## Epoch 48: Training Accuracy: 0.9686
## Epoch 49: Training Accuracy: 0.9692
## Epoch 50: Training Accuracy: 0.9697
## Epoch 51: Training Accuracy: 0.9701
## Epoch 52: Training Accuracy: 0.9707
## Epoch 53: Training Accuracy: 0.9710
## Epoch 54: Training Accuracy: 0.9717
## Epoch 55: Training Accuracy: 0.9722
## Epoch 56: Training Accuracy: 0.9729
## Epoch 57: Training Accuracy: 0.9731
## Epoch 58: Training Accuracy: 0.9735
## Epoch 59: Training Accuracy: 0.9737
## Epoch 60: Training Accuracy: 0.9740
## Epoch 61: Training Accuracy: 0.9746
## Epoch 62: Training Accuracy: 0.9749
## Epoch 63: Training Accuracy: 0.9753
## Epoch 64: Training Accuracy: 0.9759
## Epoch 65: Training Accuracy: 0.9765
## Epoch 66: Training Accuracy: 0.9770
## Epoch 67: Training Accuracy: 0.9775
## Epoch 68: Training Accuracy: 0.9778
## Epoch 69: Training Accuracy: 0.9781
## Epoch 70: Training Accuracy: 0.9786
## Epoch 71: Training Accuracy: 0.9789
## Epoch 72: Training Accuracy: 0.9794
## Epoch 73: Training Accuracy: 0.9798
## Epoch 74: Training Accuracy: 0.9805
## Epoch 75: Training Accuracy: 0.9807
## Epoch 76: Training Accuracy: 0.9811
## Epoch 77: Training Accuracy: 0.9812
## Epoch 78: Training Accuracy: 0.9818
## Epoch 79: Training Accuracy: 0.9821
## Epoch 80: Training Accuracy: 0.9825
## Epoch 81: Training Accuracy: 0.9827
## Epoch 82: Training Accuracy: 0.9829
## Epoch 83: Training Accuracy: 0.9831
## Epoch 84: Training Accuracy: 0.9835
## Epoch 85: Training Accuracy: 0.9839
## Epoch 86: Training Accuracy: 0.9843
## Epoch 87: Training Accuracy: 0.9846
## Epoch 88: Training Accuracy: 0.9851
## Epoch 89: Training Accuracy: 0.9853
## Epoch 90: Training Accuracy: 0.9858
## Epoch 91: Training Accuracy: 0.9860
## Epoch 92: Training Accuracy: 0.9863
## Epoch 93: Training Accuracy: 0.9865
## Epoch 94: Training Accuracy: 0.9867
## Epoch 95: Training Accuracy: 0.9871
## Epoch 96: Training Accuracy: 0.9873
## Epoch 97: Training Accuracy: 0.9877
## Epoch 98: Training Accuracy: 0.9878
## Epoch 99: Training Accuracy: 0.9881
## Epoch 100: Training Accuracy: 0.9882
## Epoch 101: Training Accuracy: 0.9882
## Epoch 102: Training Accuracy: 0.9883
## Epoch 103: Training Accuracy: 0.9886
## Epoch 104: Training Accuracy: 0.9888
## Epoch 105: Training Accuracy: 0.9890
## Epoch 106: Training Accuracy: 0.9892
## Epoch 107: Training Accuracy: 0.9895
## Epoch 108: Training Accuracy: 0.9896
## Epoch 109: Training Accuracy: 0.9898
## Epoch 110: Training Accuracy: 0.9900
## Epoch 111: Training Accuracy: 0.9901
## Epoch 112: Training Accuracy: 0.9903
## Epoch 113: Training Accuracy: 0.9904
## Epoch 114: Training Accuracy: 0.9905
## Epoch 115: Training Accuracy: 0.9907
## Epoch 116: Training Accuracy: 0.9909
## Epoch 117: Training Accuracy: 0.9909
## Epoch 118: Training Accuracy: 0.9910
## Epoch 119: Training Accuracy: 0.9911
## Epoch 120: Training Accuracy: 0.9913
## Epoch 121: Training Accuracy: 0.9916
## Epoch 122: Training Accuracy: 0.9919
## Epoch 123: Training Accuracy: 0.9920
## Epoch 124: Training Accuracy: 0.9921
## Epoch 125: Training Accuracy: 0.9923
## Epoch 126: Training Accuracy: 0.9926
## Epoch 127: Training Accuracy: 0.9927
## Epoch 128: Training Accuracy: 0.9928
## Epoch 129: Training Accuracy: 0.9930
## Epoch 130: Training Accuracy: 0.9931
## Epoch 131: Training Accuracy: 0.9932
## Epoch 132: Training Accuracy: 0.9934
## Epoch 133: Training Accuracy: 0.9936
## Epoch 134: Training Accuracy: 0.9939
## Epoch 135: Training Accuracy: 0.9940
## Epoch 136: Training Accuracy: 0.9942
## Epoch 137: Training Accuracy: 0.9943
## Epoch 138: Training Accuracy: 0.9946
## Epoch 139: Training Accuracy: 0.9948
## Epoch 140: Training Accuracy: 0.9949
## Epoch 141: Training Accuracy: 0.9952
## Epoch 142: Training Accuracy: 0.9953
## Epoch 143: Training Accuracy: 0.9954
## Epoch 144: Training Accuracy: 0.9954
## Epoch 145: Training Accuracy: 0.9956
## Epoch 146: Training Accuracy: 0.9956
## Epoch 147: Training Accuracy: 0.9958
## Epoch 148: Training Accuracy: 0.9959
## Epoch 149: Training Accuracy: 0.9961
## Epoch 150: Training Accuracy: 0.9961
## Epoch 151: Training Accuracy: 0.9962
## Epoch 152: Training Accuracy: 0.9962
## Epoch 153: Training Accuracy: 0.9963
## Epoch 154: Training Accuracy: 0.9964
## Epoch 155: Training Accuracy: 0.9966
## Epoch 156: Training Accuracy: 0.9966
## Epoch 157: Training Accuracy: 0.9967
## Epoch 158: Training Accuracy: 0.9967
## Epoch 159: Training Accuracy: 0.9967
## Epoch 160: Training Accuracy: 0.9968
## Epoch 161: Training Accuracy: 0.9970
## Epoch 162: Training Accuracy: 0.9970
## Epoch 163: Training Accuracy: 0.9971
## Epoch 164: Training Accuracy: 0.9972
## Epoch 165: Training Accuracy: 0.9972
## Epoch 166: Training Accuracy: 0.9972
## Epoch 167: Training Accuracy: 0.9973
## Epoch 168: Training Accuracy: 0.9974
## Epoch 169: Training Accuracy: 0.9974
## Epoch 170: Training Accuracy: 0.9974
## Epoch 171: Training Accuracy: 0.9974
## Epoch 172: Training Accuracy: 0.9975
## Epoch 173: Training Accuracy: 0.9976
## Epoch 174: Training Accuracy: 0.9977
## Epoch 175: Training Accuracy: 0.9977
## Epoch 176: Training Accuracy: 0.9978
## Epoch 177: Training Accuracy: 0.9979
## Epoch 178: Training Accuracy: 0.9979
## Epoch 179: Training Accuracy: 0.9980
## Epoch 180: Training Accuracy: 0.9980
## Epoch 181: Training Accuracy: 0.9980
## Epoch 182: Training Accuracy: 0.9980
## Epoch 183: Training Accuracy: 0.9980
## Epoch 184: Training Accuracy: 0.9980
## Epoch 185: Training Accuracy: 0.9980
## Epoch 186: Training Accuracy: 0.9980
## Epoch 187: Training Accuracy: 0.9980
## Epoch 188: Training Accuracy: 0.9981
## Epoch 189: Training Accuracy: 0.9981
## Epoch 190: Training Accuracy: 0.9983
## Epoch 191: Training Accuracy: 0.9983
## Epoch 192: Training Accuracy: 0.9983
## Epoch 193: Training Accuracy: 0.9983
## Epoch 194: Training Accuracy: 0.9983
## Epoch 195: Training Accuracy: 0.9984
## Epoch 196: Training Accuracy: 0.9984
## Epoch 197: Training Accuracy: 0.9984
## Epoch 198: Training Accuracy: 0.9985
## Epoch 199: Training Accuracy: 0.9986
## Epoch 200: Training Accuracy: 0.9986