The code for the modeling as well as the prepped data can be viewed and downloaded below.
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 23 09:10:35 2023
@author: casey
"""
## LOAD LIBRARIES
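# Set seed for reproducibility
# NOTE: this seeds only Python's built-in `random` module. scikit-learn draws from
# NumPy's random state, so its shuffling/splitting calls need an explicit random_state.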
import random;
random.seed(53)
import pandas as pd
# Import all we need from sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # RandomizedSearchCV coming soon
from sklearn.model_selection import KFold, cross_val_score
# Import Bayesian Optimization
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval # coming soon
# Import visualization
import scikitplot as skplt
import matplotlib.pyplot as plt
import seaborn as sns
# --------------------------------------------------------------------------------------- #
## LOAD DATA
# Location of the prepped CSV (absolute, machine-specific path).
DATA_PATH = 'C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Machine_Learning/Logistic_Regression/prepped_data/gbg_lr_py.csv'
# Read the whole file into a DataFrame; the label column is expected to be last.
lr_df = pd.read_csv(DATA_PATH)
# --------------------------------------------------------------------------------------- #
## CREATE TRAINING AND TESTING SETS
# X will contain all variables except the labels (the labels are the last column 'total_result')
X = lr_df.iloc[:, :-1]
# y will contain the labels (the labels are the last column 'total_result').
# Select with -1 (not -1:) so y is a 1-D Series: a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning on every fit.
y = lr_df.iloc[:, -1]
# split the data vectors randomly into 80% train and 20% test
# X_train contains the quantitative variables for the training set
# X_test contains the quantitative variables for the testing set
# y_train contains the labels for the training set
# y_test contains the labels for the testing set
# random_state pins the shuffle so every run produces the same split; seeding
# Python's `random` module alone does not affect scikit-learn.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=53)
# ------------------------------------------------------------------------------------------------ #
## CREATE DEFAULT LOGISTIC MODEL
# Baseline model: every hyperparameter is left at its scikit-learn default.
# Parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# fit() returns the estimator itself, so construction and training are chained.
LOGR_Classifier = linear_model.LogisticRegression().fit(X_train, y_train)
# --------------------------------------------------------------------------------------- #
## EVALUATE MODEL
# Per-class probabilities (needed for ROC AUC) and hard label predictions on the test set
y_proba = LOGR_Classifier.predict_proba(X_test)
y_pred = LOGR_Classifier.predict(X_test)
# Counts of true vs. predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
# Precision / recall / F1 per class
print(classification_report(y_true=y_test, y_pred=y_pred))
# ROC AUC scored on the second probability column (second class in LOGR_Classifier.classes_)
print(roc_auc_score(y_true=y_test, y_score=y_proba[:, 1]))
# ------------------------------------------------------------------------------------------------ #
## GRIDSEARCHCV HYPERPARAMETER TUNING
# Exhaustively tries every combination in `parameters` with 5-fold cross-validation,
# scoring each candidate by accuracy on the held-out fold.
estimator = linear_model.LogisticRegression()
# The liblinear solver accepts both penalties listed in the grid.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 0.5, 1, 1.5, 5],
    'solver': ['liblinear'],
    'max_iter': [100, 500, 1000],
}
# Use Kfold because even distribution of labels (48.5% Over, 51.5% Under)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    cv=kf,
    verbose=1,
)
grid_search.fit(X_train, y_train)
# gets best params
# A bare `grid_search.best_params_` expression is only echoed in an interactive
# console; print it so the result is also shown when the file runs as a script.
print(grid_search.best_params_)
# best params: C=0.5, penalty='l2', solver='liblinear', max_iter=500
# ------------------------------------------------------------------------------------------------ #
## CREATE TUNED LOGISTIC MODEL
# Hyperparameters are hard-coded to the values recorded as the grid search's best params.
tuned_params = {
    'C': 0.5,
    'penalty': 'l2',
    'solver': 'liblinear',
    'max_iter': 500,
}
LOGR_Classifier = linear_model.LogisticRegression(**tuned_params)
# Train on the full training split
LOGR_Classifier.fit(X_train, y_train)
# ------------------------------------------------------------------------------------------------ #
## EVALUATE TUNED MODEL
# Per-class probabilities (for ROC AUC) and hard label predictions from the tuned classifier
y_proba = LOGR_Classifier.predict_proba(X_test)
y_pred = LOGR_Classifier.predict(X_test)
# Counts of true vs. predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
# Precision / recall / F1 per class
print(classification_report(y_true=y_test, y_pred=y_pred))
# ROC AUC scored on the second probability column, printed with a label
tuned_auc = roc_auc_score(y_true=y_test, y_score=y_proba[:, 1])
print(f'Roc_Auc_Score: {tuned_auc}')
# ------------------------------------------------------------------------------------------------ #
## KFOLD CROSS VALIDATE TUNED LOGISTIC MODEL
# Fresh, unfitted estimator with the tuned hyperparameters
LOGR_Classifier = linear_model.LogisticRegression(
    C=0.5,
    penalty='l2',
    solver='liblinear',
    max_iter=500,
)
# 10 shuffled folds with a fixed seed so the fold assignment is repeatable
kf = KFold(n_splits=10, shuffle=True, random_state=1)
# One accuracy value per fold, computed on the training split only
cv_score = cross_val_score(
    estimator=LOGR_Classifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=kf,
)
# Per-fold accuracy (rounded to 2 decimals), numbered from 1
for fold, score in enumerate(cv_score, start=1):
    print(f'Fold {fold} : {round(score, 2)}')
print(f'The mean accuracy over 10 folds is: {cv_score.mean()}')
# ------------------------------------------------------------------------------------------------ #
## CREATE REDUCED LOGISTIC MODEL (Only important features)
# only keep important features in train and test sets
# (single list so the train and test selections cannot drift apart)
important_features = [
    'wind',
    'avg_home_total_yards',
    'total_line',
    'qb_elo_diff',
    'avg_away_total_yards',
    'surface_dessograss',
]
X_train = X_train[important_features]
X_test = X_test[important_features]
# Look at below documentation for parameters
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
LOGR_Classifier = linear_model.LogisticRegression()
## EVALUATE REDUCED MODEL
# 10 shuffled folds with a fixed seed; accuracy is computed on the training split only
kf = KFold(n_splits=10, shuffle=True, random_state=1)
cv_score = cross_val_score(LOGR_Classifier,
                           X_train, y_train,
                           cv=kf,
                           scoring='accuracy')
for fold, score in enumerate(cv_score, start=1):
    print(f'Fold {fold} : {round(score, 2)}')
print(f'The mean accuracy over 10 folds is: {cv_score.mean()}')
## FIT REDUCED MODEL
# cross_val_score trains clones of the estimator and leaves LOGR_Classifier itself
# unfitted, so it has no coef_ yet. Fit it here so the coefficients can be read
# and plotted further down the script.
LOGR_Classifier.fit(X_train, y_train)
# Refresh the predictions so anything reported for the reduced model (e.g. its
# confusion matrix) reflects this model rather than the earlier full-feature one.
y_pred = LOGR_Classifier.predict(X_test)
y_proba = LOGR_Classifier.predict_proba(X_test)
# ------------------------------------------------------------------------------------------------ #
## VISUALIZATIONS
## GET FEATURE IMPORTANCE
# Pair each training column with its coefficient (first row of coef_) and
# order the pairs from the largest coefficient to the smallest.
feat_dict = dict(
    sorted(
        zip(X_train.columns, LOGR_Classifier.coef_[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
# Two-column table: feature name and its coefficient, in sorted order
feat_df = pd.DataFrame({
    'Feature': list(feat_dict),
    'Importance': list(feat_dict.values()),
})
## PLOT FEATURE IMPORTANCE
values = feat_df['Importance']
idx = feat_df['Feature']
plt.figure(figsize=(10, 8))
# Strictly positive coefficients are drawn green, everything else red
clrs = values.gt(0).map({True: 'green', False: 'red'}).tolist()
# Horizontal bars: one per feature, bar length = coefficient
importance_ax = sns.barplot(x=values, y=idx, palette=clrs)
importance_ax.set(title='Important Features to Predict the Total Result')
plt.show()
## PLOT CONFUSION MATRIX
# Wide figure; the matrix goes in the left cell of a 1-row, 2-column grid
fig = plt.figure(figsize=(15, 6))
ax1 = fig.add_subplot(1, 2, 1)
# NOTE(review): y_pred is whatever was last assigned earlier in the script;
# confirm it holds the reduced model's predictions so the title is accurate.
skplt.metrics.plot_confusion_matrix(
    y_test,
    y_pred,
    title="Confusion Matrix for Reduced Logistic Regression Model",
    cmap='Oranges',
    ax=ax1,
)