TF-IDF Baseline Classifiers
The code and dataset necessary to perform the TF-IDF modeling can be found below.
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 14 15:17:46 2023
@author: casey
"""
# ---------------------------------------------------------------------------------------------------------------------------- #
## IMPORTS
# Import all we need from sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # naive bayes
from sklearn import svm # support vector machines
from sklearn import linear_model # logistic regression
from sklearn import preprocessing
import xgboost as xgb # xgboost
from sklearn.ensemble import RandomForestClassifier # random forest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np
import pandas as pd
# Import all we need from nltk
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
# ---------------------------------------------------------------------------------------------------------------------------- #
## LOAD DATA
tweets_df = pd.read_csv('C:/Users/casey/OneDrive/Documents/Data_Science/Projects/Movie_Review_Sentiment_Analysis/data/Corona_tweets.csv', encoding='latin1')
# 0: Extremely Negative, 1: Extremely Positive, 2: Negative, 3: Neutral, 4: Positive
le = LabelEncoder()
tweets_df.Sentiment = le.fit_transform(tweets_df.Sentiment)
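# LabelEncoder assigns integer codes in alphabetical order of the class names, giving the mapping shown above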
tweets_df.Sentiment.value_counts()
tweets = tweets_df.OriginalTweet
labels = tweets_df.Sentiment
## Create train and test data
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweets, labels, test_size=0.2, random_state=1)
## Set the input shape
train_input_shape=train_tweets.shape
test_input_shape=test_tweets.shape
# check shapes
print("The input shape for the training reviews is\n", train_input_shape) ## (30000)
print("The input shape for the testing reviews is\n", test_input_shape) ## (5000)
# ---------------------------------------------------------------------------------------------------------------------------- #
## NORMALIZATION FUNCTIONS FOR TEXT PREPROCESSING
## CONVERT TO LOWERCASE
def to_lowercase(text):
text = text.lower()
return text
#df['review'] = df['review'].apply(to_lowercase)
# ------------------------------------------------------------------------------------------------ #
## REMOVE NUMBERS
def remove_numbers(text):
text = re.sub(r'\d+', '', text)
return text
#df['review'] = df['review'].apply(remove_numbers)
# ------------------------------------------------------------------------------------------------ #
## REMOVE PUNCTUATION
def remove_punctuations(text):
return text.translate(str.maketrans('', '', string.punctuation))
#df['review'] = df['review'].apply(remove_punctuations)
# ------------------------------------------------------------------------------------------------ #
## REMOVE SPECIAL CHARACTERS
def remove_special_chars(text):
return re.sub('[^a-zA-Z]', ' ', text)
#df['review'] = df['review'].apply(remove_special_chars)
# ------------------------------------------------------------------------------------------------ #
## REMOVE UNNECESSARY WHITE SPACE
def remove_whitespace(text):
return " ".join(text.split())
#df['review'] = df['review'].apply(remove_whitespace)
# ------------------------------------------------------------------------------------------------ #
## REMOVE STOPWORDS
# create list of your own words to also remove
my_stopwords = ['br', 'b']
def remove_stopwords(text):
new_list = []
words = word_tokenize(text)
stopwrds = stopwords.words('english') + my_stopwords
for word in words:
if word not in stopwrds:
new_list.append(word)
return ' '.join(new_list)
#df['review'] = df['review'].apply(remove_stopwords)
# ------------------------------------------------------------------------------------------------ #
## LEMMATIZATION
# usually preferred over stemming
# considers context (word part of speech)
# caring -> care
#lem_df = df.copy()
# Part of speech tagger function
def pos_tagger(nltk_tag):
if nltk_tag.startswith('J'):
return wordnet.ADJ
elif nltk_tag.startswith('V'):
return wordnet.VERB
elif nltk_tag.startswith('N'):
return wordnet.NOUN
elif nltk_tag.startswith('R'):
return wordnet.ADV
else:
return None
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
pos_tagged = nltk.pos_tag(nltk.word_tokenize(text))
#word_tokens = word_tokenize(text)
wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
lemmatized_review = []
for word, tag in wordnet_tagged:
if tag is None:
# if there is no available tag, append the token as is
lemmatized_review.append(word)
else:
# else use the tag to lemmatize the token
lemmatized_review.append(lemmatizer.lemmatize(word, tag))
lemmatized_review = " ".join(lemmatized_review)
return lemmatized_review
#lem_df['review'] = lem_df['review'].apply(lemmatize_word)
# Twitter Specific Cleaning
def remove_hyperlinks(text):
return re.sub(r'https?:\/\/.*[\r\n]*', ' ', text)
def remove_hashtag_symbol(text):
return re.sub(r'#', ' ', text)
def remove_retweet_text(text):
return re.sub(r'^RT[\s]+', ' ', text)
# ---------------------------------------------------------------------------------------------------------------------------- #
## CUSTOM NORMALIZATION FUNCTION
# choose which preprocessing functions to use
# lemmatization is applied here (stemming is skipped)
def custom_normalization(reviews):
    # apply Twitter-specific cleaning first, while URLs, '#' symbols, and 'RT' markers are still intact
    reviews = reviews.apply(remove_hyperlinks)
    reviews = reviews.apply(remove_hashtag_symbol)
    reviews = reviews.apply(remove_retweet_text)
    reviews = reviews.apply(to_lowercase)
    reviews = reviews.apply(remove_numbers)
    reviews = reviews.apply(remove_punctuations)
    reviews = reviews.apply(remove_special_chars)
    reviews = reviews.apply(remove_stopwords)
    reviews = reviews.apply(lemmatize_word)
    return reviews
# ---------------------------------------------------------------------------------------------------------------------------- #
## PREPROCESS FOR TF-IDF MODELS
# normalize tweets
train_tweets = custom_normalization(train_tweets)
test_tweets = custom_normalization(test_tweets)
all_train_words = []
for tweet in train_tweets:
    for word in tweet.split():
        all_train_words.append(word)
unique_train_words = set(all_train_words) # vocabulary of unique words in the training tweets
# ---------------------------------------------------------------------------------------------------------------------------- #
## BUILD THE MODELS
n_grams = [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]
num_features = [500, 1000, 5000, 10000]
classifiers = [linear_model.LogisticRegression(max_iter=500), xgb.XGBClassifier(), RandomForestClassifier(), MultinomialNB()] # svm is very slow, run one at a time
#classifiers = [xgb.XGBClassifier(), RandomForestClassifier(), MultinomialNB(), svm.SVC(kernel='rbf'), svm.SVC(kernel='sigmoid'), svm.SVC(kernel='linear'), svm.SVC(kernel='poly', degree=2), svm.SVC(kernel='poly', degree=3)]
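# For each classifier, sweep every combination of n-gram range and vocabulary size.
# The TF-IDF vectorizer is fit on the training tweets only and the test tweets are transformed
# with that same vocabulary; the accuracy of each fit is recorded so the best combination can be kept.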
best_params = []
for classifier in classifiers:
fit_n_grams = []
fit_num_features = []
fit_scores = []
fit_predictions = []
print(str(classifier))
for n_gram in n_grams:
for num_feature in num_features:
my_stopwords = []
# instantiate tfidf vectorizer
Tfidf = TfidfVectorizer(
input="content",
lowercase = True,
stop_words = stopwords.words('english') + my_stopwords,
max_features = num_feature,
analyzer = 'word',
ngram_range = n_gram
)
train_dtm = Tfidf.fit_transform(train_tweets)
test_dtm = Tfidf.transform(test_tweets)
Classifier = classifier
Classifier.fit(train_dtm, train_labels)
## EVALUATE MODEL
predictions = Classifier.predict(test_dtm)
#y_proba = Classifier.predict_proba(dev_dtm)
report = classification_report(y_true=test_labels, y_pred=predictions, output_dict=True)
#f1 = report['macro avg']['f1-score']
print(n_gram, num_feature)
fit_n_grams.append(n_gram)
fit_num_features.append(num_feature)
fit_predictions.append(predictions)
# print(confusion_matrix(dev_df['Label'], y_pred))
#print(classification_report(dev_df['Label'], y_pred))
# print('Roc_Auc_Score: ' + str(roc_auc_score(dev_df['Label'], y_proba[:, 1])))
fit_scores.append(report['accuracy'])
index = np.argmax(fit_scores)
best_params.append([fit_scores[index], fit_n_grams[index], fit_num_features[index], str(classifier), fit_predictions[index]])
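# Each best_params entry holds, for one classifier:
# [best accuracy, best n-gram range, best number of features, classifier name, predictions from that best fit]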
# ---------------------------------------------------------------------------------------------------------------------------- #
## GET BEST PARAMS FOR EACH MODEL
for model in best_params:
print(model)
# ---------------------------------------------------------------------------------------------------------------------------- #
## GET MODEL PREDICTIONS ON TEST SET
predictions=best_params[0][4]
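# index 4 of each best_params entry stores the saved test-set predictions;
# entry 0 corresponds to the first classifier in the classifiers list (logistic regression here)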
print(predictions)
print(predictions.shape)
# ---------------------------------------------------------------------------------------------------------------------------- #
## GENERATE CONFUSION MATRIX FOR PREDICTIONS
CM=confusion_matrix(y_pred=predictions, y_true=test_labels)
print(CM)
# ---------------------------------------------------------------------------------------------------------------------------- #
## PLOT A PRETTY CONFUSION MATRIX
import seaborn as sns
import matplotlib.pyplot as plt
# 0: Extremely Negative, 1: Extremely Positive, 2: Negative, 3: Neutral, 4: Positive
class_names = ["Extremely Negative","Extremely Positive","Negative","Neutral","Positive"]
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(CM, annot=True, fmt='g', ax=ax, annot_kws={'size': 18})
# annot=True to annotate cells, fmt='g' to disable scientific notation
# annot_kws sets the font size in the heatmap
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix: Tweet Sentiment Analysis (Logistic Regression)')
ax.xaxis.set_ticklabels(["0: Extremely Negative","1: Extremely Positive","2: Negative","3: Neutral","4: Positive"],rotation=90, fontsize = 18)
ax.yaxis.set_ticklabels(["0: Extremely Negative","1: Extremely Positive","2: Negative","3: Neutral","4: Positive"],rotation=0, fontsize = 18)
Neural Networks
The code and dataset necessary to perform the neural network modeling can be found below.
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 14 10:48:43 2023
@author: casey
"""
# ---------------------------------------------------------------------------------------------------------------------------- #
## IMPORTS
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
#from tensorflow.keras.datasets import mnist
from tensorflow.keras import datasets
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
# ---------------------------------------------------------------------------------------------------------------------------- #
## LOAD DATA
tweets_df = pd.read_csv('C:/Users/casey/OneDrive/Documents/Data_Science/Projects/Movie_Review_Sentiment_Analysis/data/Corona_tweets.csv', encoding='latin1')
#tweets_df.Sentiment.replace(to_replace='Extremely Positive', value='Positive',inplace=True)
#tweets_df.Sentiment.replace(to_replace='Extremely Negative', value='Negative',inplace=True)
# 0: Extremely Negative, 1: Extremely Positive, 2: Negative, 3: Neutral, 4: Positive
le = LabelEncoder()
tweets_df.Sentiment = le.fit_transform(tweets_df.Sentiment)
tweets_df.Sentiment.value_counts()
tweets = tweets_df.OriginalTweet
labels = tweets_df.Sentiment
## Create train and test data
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweets, labels, test_size=4500, random_state=1)
## Create validation data
train_tweets, val_tweets, train_labels, val_labels = train_test_split(train_tweets, train_labels, test_size=4500, random_state=1)
## Set the input shape
train_input_shape=train_tweets.shape
test_input_shape=test_tweets.shape
val_input_shape=val_tweets.shape
# check shapes
print("The input shape for the training reviews is\n", train_input_shape) ## (30000)
print("The input shape for the testing reviews is\n", test_input_shape) ## (5000)
print("The input shape for the validation reviews is\n",val_input_shape) ## (5000)
# ---------------------------------------------------------------------------------------------------------------------------- #
## NORMALIZATION FUNCTIONS FOR TEXT PREPROCESSING
## CONVERT TO LOWERCASE
def to_lowercase(text):
text = text.lower()
return text
#df['review'] = df['review'].apply(to_lowercase)
# ------------------------------------------------------------------------------------------------ #
## REMOVE NUMBERS
def remove_numbers(text):
text = re.sub(r'\d+', '', text)
return text
#df['review'] = df['review'].apply(remove_numbers)
# ------------------------------------------------------------------------------------------------ #
## REMOVE PUNCTUATION
def remove_punctuations(text):
return text.translate(str.maketrans('', '', string.punctuation))
#df['review'] = df['review'].apply(remove_punctuations)
# ------------------------------------------------------------------------------------------------ #
## REMOVE SPECIAL CHARACTERS
def remove_special_chars(text):
return re.sub('[^a-zA-Z]', ' ', text)
#df['review'] = df['review'].apply(remove_special_chars)
# ------------------------------------------------------------------------------------------------ #
## REMOVE UNNECESSARY WHITE SPACE
def remove_whitespace(text):
return " ".join(text.split())
#df['review'] = df['review'].apply(remove_whitespace)
# ------------------------------------------------------------------------------------------------ #
## REMOVE STOPWORDS
# create list of your own words to also remove
my_stopwords = ['br', 'b']
def remove_stopwords(text):
new_list = []
words = word_tokenize(text)
stopwrds = stopwords.words('english') + my_stopwords
for word in words:
if word not in stopwrds:
new_list.append(word)
return ' '.join(new_list)
#df['review'] = df['review'].apply(remove_stopwords)
# ------------------------------------------------------------------------------------------------ #
## LEMMATIZATION
# usually preferred over stemming
# considers context (word part of speech)
# caring -> care
#lem_df = df.copy()
# Part of speech tagger function
def pos_tagger(nltk_tag):
if nltk_tag.startswith('J'):
return wordnet.ADJ
elif nltk_tag.startswith('V'):
return wordnet.VERB
elif nltk_tag.startswith('N'):
return wordnet.NOUN
elif nltk_tag.startswith('R'):
return wordnet.ADV
else:
return None
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
pos_tagged = nltk.pos_tag(nltk.word_tokenize(text))
#word_tokens = word_tokenize(text)
wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
lemmatized_review = []
for word, tag in wordnet_tagged:
if tag is None:
# if there is no available tag, append the token as is
lemmatized_review.append(word)
else:
# else use the tag to lemmatize the token
lemmatized_review.append(lemmatizer.lemmatize(word, tag))
lemmatized_review = " ".join(lemmatized_review)
return lemmatized_review
#lem_df['review'] = lem_df['review'].apply(lemmatize_word)
# Twitter Specific Cleaning
def remove_hyperlinks(text):
return re.sub(r'https?:\/\/.*[\r\n]*', ' ', text)
def remove_hashtag_symbol(text):
return re.sub(r'#', ' ', text)
def remove_retweet_text(text):
return re.sub(r'^RT[\s]+', ' ', text)
# ---------------------------------------------------------------------------------------------------------------------------- #
## CUSTOM NORMALIZATION FUNCTION
# choose which preprocessing functions to use
# not using lemmatization or stemming since using a neural network
def custom_normalization(reviews):
    # apply Twitter-specific cleaning first, while URLs, '#' symbols, and 'RT' markers are still intact
    reviews = reviews.apply(remove_hyperlinks)
    reviews = reviews.apply(remove_hashtag_symbol)
    reviews = reviews.apply(remove_retweet_text)
    reviews = reviews.apply(to_lowercase)
    reviews = reviews.apply(remove_numbers)
    reviews = reviews.apply(remove_punctuations)
    reviews = reviews.apply(remove_special_chars)
    reviews = reviews.apply(remove_stopwords)
    #reviews = reviews.apply(lemmatize_word)
    return reviews
# ---------------------------------------------------------------------------------------------------------------------------- #
## NORMALIZE FOR MODEL
# normalize reviews
train_tweets = custom_normalization(train_tweets)
test_tweets = custom_normalization(test_tweets)
val_tweets = custom_normalization(val_tweets)
# ---------------------------------------------------------------------------------------------------------------------------- #
## TEXT VECTORIZATION LAYER FOR ENCODING
all_train_words = []
for tweet in train_tweets:
for word in tweet.split():
all_train_words.append(word)
unique_train_words = set(all_train_words) # vocabulary of unique words in the training tweets
vocab_size = 5000
max_features = vocab_size # size of vocabulary
max_sequence_length = len(max(tweets, key=len)) # character length of the longest tweet (not used below)
sequence_length = 250 # each tweet is padded or truncated to 250 tokens
vectorize_layer = layers.TextVectorization(
standardize=None,
max_tokens=max_features + 2, # to take into account the [oov] and [pad] tokens
output_mode='int',
output_sequence_length=sequence_length)
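# The TextVectorization layer maps each raw tweet string to a sequence of integer token ids,
# padding or truncating every tweet to output_sequence_length tokens; standardize=None is used
# because the tweets were already cleaned by the normalization functions above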
# ---------------------------------------------------------------------------------------------------------------------------- #
## CONVERT TO TENSORS
train_tweets = tf.convert_to_tensor(train_tweets)
test_tweets = tf.convert_to_tensor(test_tweets)
val_tweets = tf.convert_to_tensor(val_tweets)
train_labels = tf.convert_to_tensor(train_labels)
test_labels = tf.convert_to_tensor(test_labels)
val_labels = tf.convert_to_tensor(val_labels)
# ---------------------------------------------------------------------------------------------------------------------------- #
## GENERATE VOCABULARY USING VECTORIZE LAYER
# Adapt the vocabulary on the training tweets only, so no information from the validation or test sets leaks into the model
vectorize_layer.adapt(train_tweets)
vocab = vectorize_layer.get_vocabulary()
#train_reviews = vectorize_layer(train_reviews)
#test_reviews = vectorize_layer(test_reviews)
#val_reviews = vectorize_layer(val_reviews)
# verify encoding worked, the following should be arrays of integers instead of words and have shape=(sequence_length, )
#train_reviews[0]
#test_reviews[0]
#val_reviews[0]
# ---------------------------------------------------------------------------------------------------------------------------- #
## VERIFY SHAPES
print('The shape of train_tweets is:', train_tweets.shape) # 1-D tensors of raw tweet strings; vectorization happens inside the model
print('The shape of test_tweets is:', test_tweets.shape)
print('The shape of val_tweets is:', val_tweets.shape)
print('The shape of train_labels is:', train_labels.shape)
print('The shape of test_labels is:', test_labels.shape)
print('The shape of val_labels is:', val_labels.shape)
# ---------------------------------------------------------------------------------------------------------------------------- #
## BUILD THE INITIAL ANN MODEL
embedding_dim = 64
ANN_Model = tf.keras.Sequential([
vectorize_layer,
tf.keras.layers.Embedding(input_dim=max_features + 2, output_dim=embedding_dim, input_length=sequence_length),
tf.keras.layers.Flatten(),
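    # the flattened embedding (sequence_length * embedding_dim values per tweet) feeds the dense layers below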
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(32, activation='sigmoid'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(5, activation='softmax'),
])
ANN_Model.summary()
ANN_Model.compile(loss="sparse_categorical_crossentropy",
optimizer='adam',
metrics=["accuracy"])
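# sparse_categorical_crossentropy works directly with the integer sentiment labels (0-4),
# matching the 5-unit softmax output layer above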
##Increase epochs to improve accuracy/training
Hist = ANN_Model.fit(train_tweets, train_labels, epochs=20, validation_data=(val_tweets, val_labels))
# ---------------------------------------------------------------------------------------------------------------------------- #
## BUILD THE FINAL ANN MODEL
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
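# stop training once val_loss has failed to improve for 2 consecutive epochs (patience=2)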
embedding_dim = 16
ANN_Model_Final = tf.keras.Sequential([
vectorize_layer,
tf.keras.layers.Embedding(input_dim=max_features + 2, output_dim=embedding_dim, input_length=sequence_length),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=l2(l2=0.01)),
tf.keras.layers.Dense(5, activation='softmax'),
])
ANN_Model_Final.summary()
ANN_Model_Final.compile(loss="sparse_categorical_crossentropy",
optimizer='adam',
metrics=["accuracy"])
##Increase epochs to improve accuracy/training
Hist = ANN_Model_Final.fit(train_tweets, train_labels, epochs=50, validation_data=(val_tweets, val_labels), callbacks=[callback])
# ---------------------------------------------------------------------------------------------------------------------------- #
## BUILD THE INITIAL LSTM MODEL
embedding_dim = 16
LSTM_Model = tf.keras.Sequential([
vectorize_layer,
tf.keras.layers.Embedding(input_dim=max_features + 2, output_dim=embedding_dim, input_length=sequence_length),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)),
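    # the Bidirectional wrapper runs one 50-unit LSTM forward and one backward over the embedded
    # sequence and concatenates their final outputs (100 features) before the softmax layer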
tf.keras.layers.Dense(5, activation='softmax')
])
LSTM_Model.summary()
LSTM_Model.compile(loss="sparse_categorical_crossentropy",
optimizer='adam',
metrics=["accuracy"])
##Increase epochs to improve accuracy/training
Hist = LSTM_Model.fit(train_tweets, train_labels, epochs=10, validation_data=(val_tweets, val_labels))
# ---------------------------------------------------------------------------------------------------------------------------- #
## BUILD THE FINAL LSTM MODEL
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
embedding_dim = 16
#, kernel_regularizer=l1_l2(l1=0.001, l2=0.001)
LSTM_Model_Final = tf.keras.Sequential([
vectorize_layer,
tf.keras.layers.Embedding(input_dim=max_features + 2, output_dim=embedding_dim, input_length=sequence_length),
#tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
#tf.keras.layers.GlobalAveragePooling1D(),
#tf.keras.layers.Dense(16, activation='relu'),
#tf.keras.layers.Dropout(0.5),
#tf.keras.layers.Dense(5, activation='softmax')
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(5, activation='softmax')
])
LSTM_Model_Final.summary()
LSTM_Model_Final.compile(loss="sparse_categorical_crossentropy",
optimizer='adam',
metrics=["accuracy"])
##Increase epochs to improve accuracy/training
Hist = LSTM_Model_Final.fit(train_tweets, train_labels, epochs=50, validation_data=(val_tweets, val_labels), callbacks=[callback])
# ---------------------------------------------------------------------------------------------------------------------------- #
## ACCURACY AND LOSS PLOTS
# train accuracy and val accuracy
plt.figure()
plt.plot(Hist.history['accuracy'], label='train_accuracy')
plt.plot(Hist.history['val_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.2, 1])
plt.legend(loc='lower right')
plt.show()
# train loss and val loss
plt.figure()
plt.plot(Hist.history['loss'], label='train_loss')
plt.plot(Hist.history['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim([0, 2])
plt.legend(loc='lower right')
plt.show()
# ---------------------------------------------------------------------------------------------------------------------------- #
## ACCURACY ON TEST SET
model_name = LSTM_Model_Final #insert model name here
test_loss, test_acc = model_name.evaluate(test_tweets, test_labels, verbose=2)
print(test_acc)
# ---------------------------------------------------------------------------------------------------------------------------- #
## GET MODEL PREDICTIONS ON TEST SET
predictions=model_name.predict([test_tweets])
print(predictions)
print(predictions.shape)
Max_Values = np.squeeze(np.array(predictions.argmax(axis=1)))
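# predict returns one softmax probability vector per tweet; argmax selects the index of the most probable class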
print(Max_Values)
# ---------------------------------------------------------------------------------------------------------------------------- #
## GENERATE CONFUSION MATRIX FOR PREDICTIONS
CM=confusion_matrix(y_pred=Max_Values, y_true=test_labels)
print(CM)
# ---------------------------------------------------------------------------------------------------------------------------- #
## PLOT A PRETTY CONFUSION MATRIX
import seaborn as sns
import matplotlib.pyplot as plt
# 0: Extremely Negative, 1: Extremely Positive, 2: Negative, 3: Neutral, 4: Positive
class_names = ["Extremely Negative","Extremely Positive","Negative","Neutral","Positive"]
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(CM, annot=True, fmt='g', ax=ax, annot_kws={'size': 18})
# annot=True to annotate cells, fmt='g' to disable scientific notation
# annot_kws sets the font size in the heatmap
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix: Tweet Sentiment Analysis (LSTM)')
ax.xaxis.set_ticklabels(["0: Extremely Negative","1: Extremely Positive","2: Negative","3: Neutral","4: Positive"],rotation=90, fontsize = 18)
ax.yaxis.set_ticklabels(["0: Extremely Negative","1: Extremely Positive","2: Negative","3: Neutral","4: Positive"],rotation=0, fontsize = 18)