Fake News Detection Modeling

Naive Bayes

The Python script (Naive_Bayes_Model) and the .csv files (all_news_cv, all_news_tfidf) needed to run it can be downloaded below.

The Python code that creates, trains, and evaluates the Naive Bayes model is shown below.

# -*- coding: utf-8 -*-
"""
Created on Sat Mar 11 16:24:21 2023

@author: casey
"""

## LOAD LIBRARIES
# Set seed for reproducibility
import random; random.seed(53)
import pandas as pd

# Import all we need from sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import visualization
import scikitplot as skplt
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------------- #
## LOAD DATA

# Feature table read from all_news_cv.csv (presumably CountVectorizer output --
# confirm against the preprocessing step). It must contain a 'Label' column,
# which is split off as the target further down.
cv_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_cv.csv'
news_df_cv = pd.read_csv(cv_csv_path)
news_df_cv.info()  # prints column dtypes / non-null counts as a quick sanity check

# Feature table read from all_news_tfidf.csv (presumably TfidfVectorizer
# output -- confirm). Same 'Label' column requirement as above.
tfidf_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_tfidf.csv'
news_df_tf = pd.read_csv(tfidf_csv_path)
news_df_tf.info()

# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR COUNT VECTORIZED

# Seed handed to train_test_split. The stdlib `random.seed(...)` call at the
# top of the script does NOT influence scikit-learn, which draws from NumPy's
# random state; without `random_state` every run produces a different split
# and the reported scores are not reproducible.
SPLIT_SEED = 53

# Features are every column except 'Label'; 'Label' is the target.
X_cv = news_df_cv.drop(['Label'], axis=1)
y_cv = news_df_cv['Label']

# Hold out 20% of the rows for testing.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv, y_cv, test_size=0.2, random_state=SPLIT_SEED)

# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR TERM FREQUENCY
# (features loaded from all_news_tfidf.csv)
X_tf = news_df_tf.drop(['Label'], axis=1)
y_tf = news_df_tf['Label']

# Same test fraction and seed as the count-vectorized split above.
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tf, y_tf, test_size=0.2, random_state=SPLIT_SEED)

# ------------------------------------------------------------------------------------- #
## TRAIN NAIVE BAYES MODEL

# Each feature representation gets its OWN estimator instance.
# `fit` returns the estimator itself, so fitting a single MultinomialNB twice
# would leave `nb_cv` and `nb_tf` bound to the same object, trained only on the
# data passed to the last `fit` call -- the count-vectorized test set would
# then be scored with a model trained on the other feature table.
nb_cv = MultinomialNB().fit(X_train_cv, y_train_cv)
nb_tf = MultinomialNB().fit(X_train_tf, y_train_tf)

# Keep the original module-level name available; as before, it refers to the
# estimator that was fitted last.
NB_Classifier = nb_tf

# ------------------------------------------------------------------------------------- #
## EVALUATE MODEL

# Count-vectorized features: predict the held-out rows, then report accuracy
# (the fraction of test rows whose predicted label equals the true label).
y_pred_cv = nb_cv.predict(X_test_cv)
nb_cv_score = accuracy_score(y_true=y_test_cv, y_pred=y_pred_cv)
print('NaiveBayes Count Score: ', nb_cv_score)

# Same procedure for the features loaded from all_news_tfidf.csv.
y_pred_tf = nb_tf.predict(X_test_tf)
nb_tf_score = accuracy_score(y_true=y_test_tf, y_pred=y_pred_tf)
print('NaiveBayes Term-Frequency Score: ', nb_tf_score)

# ------------------------------------------------------------------------------------- #
## CONFUSION MATRIX

# Naive Bayes on count-vectorized features: confusion matrix (rows are true
# labels, columns are predicted labels) followed by the per-class text report.
nb_cv_cm = confusion_matrix(y_true=y_test_cv, y_pred=y_pred_cv)
nb_cv_results = classification_report(y_true=y_test_cv, y_pred=y_pred_cv)
print(nb_cv_cm, nb_cv_results, sep='\n')

# Naive Bayes on the features loaded from all_news_tfidf.csv.
nb_tf_cm = confusion_matrix(y_true=y_test_tf, y_pred=y_pred_tf)
nb_tf_results = classification_report(y_true=y_test_tf, y_pred=y_pred_tf)
print(nb_tf_cm, nb_tf_results, sep='\n')

# ------------------------------------------------------------------------------------- #
## PLOT CONFUSION MATRIX

# Two side-by-side panels in one figure: left = term-frequency features,
# right = count-vectorized features.
fig = plt.figure(figsize=(15, 6))

# NOTE: scikit-plot's signature is plot_confusion_matrix(y_true, y_pred, ...).
# The ground-truth labels must be passed first; passing the predictions first
# transposes the matrix, so the "True label" / "Predicted label" axes would be
# mislabeled and disagree with the confusion_matrix printout.
ax1 = fig.add_subplot(121)
skplt.metrics.plot_confusion_matrix(y_test_tf, y_pred_tf,
                                    title="Confusion Matrix for Term Frequency Vectorizer",
                                    cmap="Oranges",
                                    ax=ax1)
ax2 = fig.add_subplot(122)
skplt.metrics.plot_confusion_matrix(y_test_cv, y_pred_cv,
                                    title="Confusion Matrix for Count Vectorizer",
                                    cmap="Purples",
                                    ax=ax2)

Support Vector Machines

The Python script (SVM_Model) and the .csv files (all_news_cv, all_news_tfidf) needed to run it can be downloaded below.

The Python code that creates, trains, and evaluates the Support Vector Machine model with a linear kernel is shown below.

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 24 14:36:15 2023

@author: casey
"""

## LOAD LIBRARIES
# Set seed for reproducibility
import random; random.seed(53)
import pandas as pd

# Import all we need from sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import visualization
import scikitplot as skplt
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------------- #
## LOAD DATA

# Feature table read from all_news_cv.csv (presumably CountVectorizer output --
# confirm against the preprocessing step). It must contain a 'Label' column,
# which is split off as the target further down.
cv_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_cv.csv'
news_df_cv = pd.read_csv(cv_csv_path)
news_df_cv.info()  # prints column dtypes / non-null counts as a quick sanity check

# Feature table read from all_news_tfidf.csv (presumably TfidfVectorizer
# output -- confirm). Same 'Label' column requirement as above.
tfidf_csv_path = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Data_Mining/Course_Project/data/all_news_tfidf.csv'
news_df_tf = pd.read_csv(tfidf_csv_path)
news_df_tf.info()

# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR COUNT VECTORIZED

# Seed handed to train_test_split. The stdlib `random.seed(...)` call at the
# top of the script does NOT influence scikit-learn, which draws from NumPy's
# random state; without `random_state` every run produces a different split
# and the reported scores are not reproducible.
SPLIT_SEED = 53

# Features are every column except 'Label'; 'Label' is the target.
X_cv = news_df_cv.drop(['Label'], axis=1)
y_cv = news_df_cv['Label']

# Hold out 20% of the rows for testing.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv, y_cv, test_size=0.2, random_state=SPLIT_SEED)

# ------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS FOR TERM FREQUENCY
# (features loaded from all_news_tfidf.csv)
X_tf = news_df_tf.drop(['Label'], axis=1)
y_tf = news_df_tf['Label']

# Same test fraction and seed as the count-vectorized split above.
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tf, y_tf, test_size=0.2, random_state=SPLIT_SEED)

# ------------------------------------------------------------------------------------- #
## TRAIN SVM MODEL

# change kernel to achieve different accuracies (linear, sigmoid, rbf, poly)
SVM_KERNEL = 'linear'

# Each feature representation gets its OWN estimator instance.
# `fit` returns the estimator itself, so fitting a single SVC twice would
# leave `svm_cv` and `svm_tf` bound to the same object, trained only on the
# data passed to the last `fit` call -- the count-vectorized test set would
# then be scored with a model trained on the other feature table.
svm_cv = svm.SVC(kernel=SVM_KERNEL).fit(X_train_cv, y_train_cv)
svm_tf = svm.SVC(kernel=SVM_KERNEL).fit(X_train_tf, y_train_tf)

# Keep the original module-level name available; as before, it refers to the
# estimator that was fitted last.
SVM_Classifier = svm_tf

# ------------------------------------------------------------------------------------- #
## EVALUATE MODEL

# Count-vectorized features: predict the held-out rows, then report accuracy
# (the fraction of test rows whose predicted label equals the true label).
y_pred_cv = svm_cv.predict(X_test_cv)
svm_cv_score = accuracy_score(y_true=y_test_cv, y_pred=y_pred_cv)
print('SVM Count Score: ', svm_cv_score)

# Same procedure for the features loaded from all_news_tfidf.csv.
y_pred_tf = svm_tf.predict(X_test_tf)
svm_tf_score = accuracy_score(y_true=y_test_tf, y_pred=y_pred_tf)
print('SVM Term-Frequency Score: ', svm_tf_score)

# ------------------------------------------------------------------------------------- #
## CONFUSION MATRIX

# SVM on count-vectorized features: confusion matrix (rows are true labels,
# columns are predicted labels) followed by the per-class text report.
svm_cv_cm = confusion_matrix(y_true=y_test_cv, y_pred=y_pred_cv)
svm_cv_results = classification_report(y_true=y_test_cv, y_pred=y_pred_cv)
print(svm_cv_cm, svm_cv_results, sep='\n')

# SVM on the features loaded from all_news_tfidf.csv.
svm_tf_cm = confusion_matrix(y_true=y_test_tf, y_pred=y_pred_tf)
svm_tf_results = classification_report(y_true=y_test_tf, y_pred=y_pred_tf)
print(svm_tf_cm, svm_tf_results, sep='\n')

# ------------------------------------------------------------------------------------- #
## PLOT CONFUSION MATRIX

# Two side-by-side panels in one figure: left = term-frequency features,
# right = count-vectorized features.
fig = plt.figure(figsize=(15, 6))

# NOTE: scikit-plot's signature is plot_confusion_matrix(y_true, y_pred, ...).
# The ground-truth labels must be passed first; passing the predictions first
# transposes the matrix, so the "True label" / "Predicted label" axes would be
# mislabeled and disagree with the confusion_matrix printout.
ax1 = fig.add_subplot(121)
skplt.metrics.plot_confusion_matrix(y_test_tf, y_pred_tf,
                                    title="Confusion Matrix for Term Frequency Vectorizer",
                                    cmap="Oranges",
                                    ax=ax1)
ax2 = fig.add_subplot(122)
skplt.metrics.plot_confusion_matrix(y_test_cv, y_pred_cv,
                                    title="Confusion Matrix for Count Vectorizer",
                                    cmap="Purples",
                                    ax=ax2)