The .R file and the .csv file needed to run the Naive Bayes model code can be downloaded below.
The code for creating and visualizing the results of the Naive Bayes model can be found below. Two quick notes:
- The data must be split into training and testing sets before the Naive Bayes model is created
- The labels need to be saved in a separate variable and then removed from both the training and testing sets before the Naive Bayes model is created in R
## LOAD LIBRARIES
library(dplyr)
library(ggplot2)
library(naivebayes)
library(tidyverse)
library(caret)
library(caretEnsemble)
library(psych)
library(Amelia)
library(mice)
library(GGally)
library(e1071)
library(klaR)
# ------------------------------------------------------------------------------ #
## LOAD DATA
# Read the prepped game-by-game data set from disk.
gbg_nb_full <- read.csv('C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Machine_Learning/Naive_Bayes/prepped_data/gbg_nb_full_r.csv')
# ------------------------------------------------------------------------------ #
## SET VARIABLES TO CORRECT DATA TYPES
# Convert every categorical column (including the "total_result" label)
# to a factor in a single pass instead of one assignment per column.
factor_cols <- c("total_result", "game_type", "weekday",
                 "location", "roof", "surface")
for (col in factor_cols) {
  gbg_nb_full[[col]] <- as.factor(gbg_nb_full[[col]])
}
# Inspect the structure to confirm the conversions took effect.
str(gbg_nb_full)
# ------------------------------------------------------------------------------ #
## SPLIT DATA INTO TRAIN AND TEST
# Will split 80% train and 20% test.
# Check how big the training and testing sets should be after the split.
nrow(gbg_nb_full) * 0.8
nrow(gbg_nb_full) * 0.2
## Set a seed if you want the split to be the same each time you
## run the code. The number (like 1234) does not matter.
#set.seed(1234)
# Find the number of rows corresponding to 80% of the data.
n <- floor(0.8 * nrow(gbg_nb_full))
# Randomly sample indices to be included in our training set (80%).
index <- sample(seq_len(nrow(gbg_nb_full)), size = n)
# Set the training set to be the randomly sampled rows of the data (80%).
train <- gbg_nb_full[index, ]
# Set the testing set to be the remaining rows (20%).
test <- gbg_nb_full[-index, ]
# Check that the sizes of the training and testing sets match expectations.
cat("There are", dim(train)[1], "rows and", dim(train)[2],
    "columns in the training set.")
cat("There are", dim(test)[1], "rows and", dim(test)[2],
    "columns in the testing set.")
# Make sure the testing and training sets have balanced labels.
table(train$total_result)
table(test$total_result)
# Save the labels in their own variables, then drop them from the feature sets
# (required before fitting the Naive Bayes model).
test_labels <- test$total_result
train_labels <- train$total_result
# Use logical indexing rather than -which(names(...) %in% ...): if the column
# were ever missing, which() would return integer(0) and the negative index
# would silently drop EVERY column instead of none.
test <- test[, names(test) != "total_result"]
train <- train[, names(train) != "total_result"]
# ------------------------------------------------------------------------------ #
## NAIVE BAYES MODELING
# Fit the Naive Bayes classifier (e1071) on the label-free training features,
# using the saved training labels and Laplace smoothing of 1.
NB_1_e1071_train <- naiveBayes(x = train, y = train_labels, laplace = 1)
# Predict class labels for the held-out testing features.
NB_1_e1071_pred <- predict(NB_1_e1071_train, test)
# Uncomment to print the fitted conditional probabilities.
#NB_1_e1071_train
# ------------------------------------------------------------------------------- #
## EVALUATE RESULTS
# Compare predictions against the saved testing labels (caret).
cm_1 <- confusionMatrix(NB_1_e1071_pred, test_labels)
cm_1
# Draw a formatted 2x2 confusion-matrix graphic for a caret confusionMatrix
# object, plus a "DETAILS" panel with the main per-class statistics
# (Sensitivity, Specificity, Precision, Recall, F1) and the overall
# Accuracy / Kappa.
#
# Args:
#   cm: a caret::confusionMatrix object for a two-class problem.
#       Class labels are taken from the columns of cm$table, so the
#       function is no longer hard-coded to 'Over'/'Under'.
# Returns: nothing (called for its plotting side effect).
draw_confusion_matrix <- function(cm) {
  # Save the graphics state and restore it on exit so the layout/margin
  # changes below do not leak into plots drawn after this function returns.
  old_par <- par(mar = c(2, 2, 2, 2))
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)
  layout(matrix(c(1, 1, 2)))
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = 'n', yaxt = 'n')
  title('CONFUSION MATRIX', cex.main = 2)
  # Class labels come from the confusion table itself (Reference columns).
  classes <- colnames(cm$table)
  # Create the matrix cells and axis labels.
  rect(150, 430, 240, 370, col = '#3F97D0')
  text(195, 435, classes[1], cex = 1.2)
  rect(250, 430, 340, 370, col = '#F7AD50')
  text(295, 435, classes[2], cex = 1.2)
  text(125, 370, 'Predicted', cex = 1.3, srt = 90, font = 2)
  text(245, 450, 'Actual', cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = '#F7AD50')
  rect(250, 305, 340, 365, col = '#3F97D0')
  text(140, 400, classes[1], cex = 1.2, srt = 90)
  text(140, 335, classes[2], cex = 1.2, srt = 90)
  # Add in the confusion-matrix counts. as.numeric() flattens the 2x2 table
  # in column-major order: res[1] = pred 1 / actual 1, res[2] = pred 2 /
  # actual 1, res[3] = pred 1 / actual 2, res[4] = pred 2 / actual 2.
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex = 1.6, font = 2, col = 'white')
  text(195, 335, res[2], cex = 1.6, font = 2, col = 'white')
  text(295, 400, res[3], cex = 1.6, font = 2, col = 'white')
  text(295, 335, res[4], cex = 1.6, font = 2, col = 'white')
  # Second panel: the per-class statistics (indices into cm$byClass pick
  # Sensitivity, Specificity, Precision, Recall, F1).
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = 'n', yaxt = 'n')
  text(10, 85, names(cm$byClass[1]), cex = 1.2, font = 2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex = 1.2)
  text(30, 85, names(cm$byClass[2]), cex = 1.2, font = 2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex = 1.2)
  text(50, 85, names(cm$byClass[5]), cex = 1.2, font = 2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex = 1.2)
  text(70, 85, names(cm$byClass[6]), cex = 1.2, font = 2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex = 1.2)
  text(90, 85, names(cm$byClass[7]), cex = 1.2, font = 2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex = 1.2)
  # Add in the overall accuracy information (Accuracy, Kappa).
  text(30, 35, names(cm$overall[1]), cex = 1.5, font = 2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex = 1.4)
  text(70, 35, names(cm$overall[2]), cex = 1.5, font = 2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex = 1.4)
}
draw_confusion_matrix(cm_1)
draw_confusion_matrix(cm_1)
# ------------------------------------------------------------------------------ #
## CROSS-VALIDATION
# K-FOLD CROSS-VALIDATION
# Run 10-fold (K = 10) cross-validation of a Naive Bayes model via caret,
# using the same label-free training features and saved training labels.
train_control <- trainControl(method = 'cv', number = 10)
# Perform the cross-validation.
cv_nb <- train(
  x = train,
  y = train_labels,
  method = "nb",
  trControl = train_control
)
# View the resampling results and the aggregated confusion matrix.
cv_nb
cv_nb_cm <- confusionMatrix(cv_nb)
cv_nb_cm