Naive Bayes
The following Python file (Naive_Bayes_Model) and the .csv files (all_news_cv, all_news_tfidf) needed to run the code can be downloaded below.
The Python code to create, train, and evaluate the Naive Bayes model can be found below.
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 11 16:24:21 2023
@author: casey
"""
## LOAD LIBRARIES
# Set seed for reproducibility
import random; random.seed(53)
import pandas as pd
# Import all we need from sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Import visualization
import scikitplot as skplt
import matplotlib.pyplot as plt
# ------------------------------------------------------------------------------------- #
## LOAD DATA
# Each CSV is read into its own DataFrame and its column/dtype summary is
# printed right after loading. The paths are absolute and machine-specific:
# edit them to wherever the downloaded CSV files live.
cv_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_cv.csv'
news_df_cv = pd.read_csv(cv_csv_path)
news_df_cv.info()

tf_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_tfidf.csv'
news_df_tf = pd.read_csv(tf_csv_path)
news_df_tf.info()
# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR COUNT VECTORIZED
# Features are every column except 'Label'; 'Label' is the prediction target.
X_cv = news_df_cv.drop(columns=['Label'])
y_cv = news_df_cv['Label']
# Hold out 20% of the rows for testing. random.seed() only seeds Python's
# stdlib generator, while train_test_split shuffles with NumPy's RNG unless
# random_state is given, so the seed is passed explicitly to make the split
# (and therefore every reported score) repeatable from run to run.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv, y_cv, test_size=0.2, random_state=53
)
# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR TERM FREQUENCY
# Same layout as the count-vectorized frame (data loaded from
# all_news_tfidf.csv): every column except 'Label' is a feature.
X_tf = news_df_tf.drop(columns=['Label'])
y_tf = news_df_tf['Label']
# Hold out 20% of the rows for testing. random.seed() only seeds Python's
# stdlib generator, while train_test_split shuffles with NumPy's RNG unless
# random_state is given, so the seed is passed explicitly to make the split
# (and therefore every reported score) repeatable from run to run.
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tf, y_tf, test_size=0.2, random_state=53
)
# ------------------------------------------------------------------------------------- #
## TRAIN NAIVE BAYES MODEL
# fit() trains an estimator in place and returns that same object. Fitting a
# single MultinomialNB twice would therefore leave nb_cv and nb_tf as two
# names for ONE model trained only on the second data set, and the "count"
# predictions would come from the wrong model. Each feature set gets its own
# estimator instead.
nb_cv = MultinomialNB().fit(X_train_cv, y_train_cv)

# NB_Classifier is kept as a module-level name; as before, it ends up being
# the model fitted on the term-frequency features.
NB_Classifier = MultinomialNB()
nb_tf = NB_Classifier.fit(X_train_tf, y_train_tf)
# ------------------------------------------------------------------------------------- #
## EVALUATE MODEL
# Count-vectorized features: predict the held-out rows, then report the
# fraction of labels predicted correctly.
y_pred_cv = nb_cv.predict(X_test_cv)
nb_cv_score = accuracy_score(y_true=y_test_cv, y_pred=y_pred_cv)
print('NaiveBayes Count Score: ', nb_cv_score)

# Term-frequency features: same procedure on that data set's own test split.
y_pred_tf = nb_tf.predict(X_test_tf)
nb_tf_score = accuracy_score(y_true=y_test_tf, y_pred=y_pred_tf)
print('NaiveBayes Term-Frequency Score: ', nb_tf_score)
# ------------------------------------------------------------------------------------- #
## CONFUSION MATRIX
# For each feature set, build the confusion matrix and the per-class
# classification report from the true test labels and the predictions made
# in the evaluation step, then print both.

# Count-vectorized features
nb_cv_cm = confusion_matrix(y_true=y_test_cv, y_pred=y_pred_cv)
nb_cv_results = classification_report(y_true=y_test_cv, y_pred=y_pred_cv)
print(nb_cv_cm)
print(nb_cv_results)

# Term-frequency features
nb_tf_cm = confusion_matrix(y_true=y_test_tf, y_pred=y_pred_tf)
nb_tf_results = classification_report(y_true=y_test_tf, y_pred=y_pred_tf)
print(nb_tf_cm)
print(nb_tf_results)
# ------------------------------------------------------------------------------------- #
## PLOT CONFUSION MATRIX
# Draw both confusion matrices side by side on one 15x6 figure.
# plot_confusion_matrix takes the true labels first and the predictions
# second; passing them the other way round transposes the matrix and swaps
# the meaning of the "True label" / "Predicted label" axes.
fig = plt.figure(figsize=(15, 6))

# Left panel: term-frequency features
ax1 = fig.add_subplot(121)
skplt.metrics.plot_confusion_matrix(
    y_test_tf,
    y_pred_tf,
    title="Confusion Matrix for Term Frequency Vectorizer",
    cmap="Oranges",
    ax=ax1,
)

# Right panel: count-vectorized features
ax2 = fig.add_subplot(122)
skplt.metrics.plot_confusion_matrix(
    y_test_cv,
    y_pred_cv,
    title="Confusion Matrix for Count Vectorizer",
    cmap="Purples",
    ax=ax2,
)
Support Vector Machines
The following Python file (SVM_Model) and the .csv files (all_news_cv, all_news_tfidf) needed to run the code can be downloaded below.
The Python code to create, train, and evaluate the Support Vector Machines model using a linear kernel can be found below.
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 24 14:36:15 2023
@author: casey
"""
## LOAD LIBRARIES
# Set seed for reproducibility
import random; random.seed(53)
import pandas as pd
# Import all we need from sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Import visualization
import scikitplot as skplt
import matplotlib.pyplot as plt
# ------------------------------------------------------------------------------------- #
## LOAD DATA
# Each CSV is read into its own DataFrame and its column/dtype summary is
# printed right after loading. The paths are absolute and machine-specific:
# edit them to wherever the downloaded CSV files live.
cv_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_cv.csv'
news_df_cv = pd.read_csv(cv_csv_path)
news_df_cv.info()

tf_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_tfidf.csv'
news_df_tf = pd.read_csv(tf_csv_path)
news_df_tf.info()
# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR COUNT VECTORIZED
# Features are every column except 'Label'; 'Label' is the prediction target.
X_cv = news_df_cv.drop(columns=['Label'])
y_cv = news_df_cv['Label']
# Hold out 20% of the rows for testing. random.seed() only seeds Python's
# stdlib generator, while train_test_split shuffles with NumPy's RNG unless
# random_state is given, so the seed is passed explicitly to make the split
# (and therefore every reported score) repeatable from run to run.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv, y_cv, test_size=0.2, random_state=53
)
# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR TERM FREQUENCY
# Same layout as the count-vectorized frame (data loaded from
# all_news_tfidf.csv): every column except 'Label' is a feature.
X_tf = news_df_tf.drop(columns=['Label'])
y_tf = news_df_tf['Label']
# Hold out 20% of the rows for testing. random.seed() only seeds Python's
# stdlib generator, while train_test_split shuffles with NumPy's RNG unless
# random_state is given, so the seed is passed explicitly to make the split
# (and therefore every reported score) repeatable from run to run.
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tf, y_tf, test_size=0.2, random_state=53
)
# ------------------------------------------------------------------------------------- #
## TRAIN SVM MODEL
# change kernel to achieve different accuracies (linear, sigmoid, rbf, poly)
kernel = 'linear'

# fit() trains an estimator in place and returns that same object. Fitting a
# single SVC twice would therefore leave svm_cv and svm_tf as two names for
# ONE model trained only on the second data set, and the "count" predictions
# would come from the wrong model. Each feature set gets its own estimator,
# both built with the same kernel.
svm_cv = svm.SVC(kernel=kernel).fit(X_train_cv, y_train_cv)

# SVM_Classifier is kept as a module-level name; as before, it ends up being
# the model fitted on the term-frequency features.
SVM_Classifier = svm.SVC(kernel=kernel)
svm_tf = SVM_Classifier.fit(X_train_tf, y_train_tf)
# ------------------------------------------------------------------------------------- #
## EVALUATE MODEL
# Count-vectorized features: predict the held-out rows, then report the
# fraction of labels predicted correctly.
y_pred_cv = svm_cv.predict(X_test_cv)
svm_cv_score = accuracy_score(y_true=y_test_cv, y_pred=y_pred_cv)
print('SVM Count Score: ', svm_cv_score)

# Term-frequency features: same procedure on that data set's own test split.
y_pred_tf = svm_tf.predict(X_test_tf)
svm_tf_score = accuracy_score(y_true=y_test_tf, y_pred=y_pred_tf)
print('SVM Term-Frequency Score: ', svm_tf_score)
# ------------------------------------------------------------------------------------- #
## CONFUSION MATRIX
# For each feature set, build the confusion matrix and the per-class
# classification report from the true test labels and the SVM predictions
# made in the evaluation step, then print both.

# Count-vectorized features
svm_cv_cm = confusion_matrix(y_true=y_test_cv, y_pred=y_pred_cv)
svm_cv_results = classification_report(y_true=y_test_cv, y_pred=y_pred_cv)
print(svm_cv_cm)
print(svm_cv_results)

# Term-frequency features
svm_tf_cm = confusion_matrix(y_true=y_test_tf, y_pred=y_pred_tf)
svm_tf_results = classification_report(y_true=y_test_tf, y_pred=y_pred_tf)
print(svm_tf_cm)
print(svm_tf_results)
# ------------------------------------------------------------------------------------- #
## PLOT CONFUSION MATRIX
# Draw both confusion matrices side by side on one 15x6 figure.
# plot_confusion_matrix takes the true labels first and the predictions
# second; passing them the other way round transposes the matrix and swaps
# the meaning of the "True label" / "Predicted label" axes.
fig = plt.figure(figsize=(15, 6))

# Left panel: term-frequency features
ax1 = fig.add_subplot(121)
skplt.metrics.plot_confusion_matrix(
    y_test_tf,
    y_pred_tf,
    title="Confusion Matrix for Term Frequency Vectorizer",
    cmap="Oranges",
    ax=ax1,
)

# Right panel: count-vectorized features
ax2 = fig.add_subplot(122)
skplt.metrics.plot_confusion_matrix(
    y_test_cv,
    y_pred_cv,
    title="Confusion Matrix for Count Vectorizer",
    cmap="Purples",
    ax=ax2,
)