# Import Packages
import warnings
warnings.filterwarnings('ignore')
import pickle
import os
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
# Project root; the CSV inputs are read from its `csv_files/` sub-folder.
project_path = '/content/drive/MyDrive/AAIC/SCS-1/sf_crime_classification/'

# Feature tables for the train and test sets, plus the raw training file,
# which is used here only for its 'Category' column.
train_sf_df = pd.read_csv(project_path + 'csv_files/train_data_features.csv')
test_sf_df = pd.read_csv(project_path + 'csv_files/test_data_features.csv')
df = pd.read_csv(project_path + 'csv_files/train.csv')

# Attach the target column to the training features.
# NOTE(review): assumes the rows of train.csv line up with the feature table - confirm.
train_sf_df['category'] = df['Category']

# The test features carry an 'id' column that is not a model input.
test_sf_df = test_sf_df.drop('id', axis=1)

train_sf_df.shape  # recorded output: (878049, 131)
test_sf_df.shape  # recorded output: (884262, 130)

# Separate the target from the feature columns.
data = train_sf_df
y = data['category']
X = data.drop(columns=['category'])
def preprocess_data(X):
    """Standardize a feature matrix with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and then applied to that same data;
    the fitted scaler is not returned.

    Args:
        X: Feature matrix to scale.

    Returns:
        The output of ``StandardScaler.transform`` for ``X``.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X)
# NOTE(review): the scaler is fitted on the full dataset before it is split, so
# the validation/test rows contribute to the scaling statistics. Fitting on the
# training split only would require `preprocess_data` to expose the fitted
# scaler; left as-is here to keep that helper's contract unchanged.
X_norm = preprocess_data(X=X)

# Fixed seed so the train/validation/test partition is the same on every run.
# The classifiers below are loaded from a pickle cache when one exists; with an
# unseeded split, a cached model could be evaluated on rows it was trained on.
# Models cached before this seed was introduced should be deleted and refitted.
SPLIT_SEED = 42

# 80/20 split into (train + validation) / test, then 80/20 again into
# train / validation, both stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y, stratify=y, test_size=0.20, random_state=SPLIT_SEED
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.20, random_state=SPLIT_SEED
)

# Number of distinct categories present in each split.
len(y_train.value_counts()), len(y_valid.value_counts()), len(y_test.value_counts())
# Recorded output: (39, 39, 39)

print("Training data shape : ", X_train.shape)
print("Validation data shape : ", X_valid.shape)
print("Test data shape : ", X_test.shape)
# Recorded output:
#   Training data shape : (561951, 130)
#   Validation data shape : (140488, 130)
#   Test data shape : (175610, 130)
# Crime categories used as tick labels on the confusion-matrix heatmaps.
# Kept in alphabetical order: `confusion_matrix` sorts the class labels when no
# explicit `labels` argument is given, so row/column i corresponds to labels[i].
labels = [
    'ARSON',
    'ASSAULT',
    'BAD CHECKS',
    'BRIBERY',
    'BURGLARY',
    'DISORDERLY CONDUCT',
    'DRIVING UNDER THE INFLUENCE',
    'DRUG/NARCOTIC',
    'DRUNKENNESS',
    'EMBEZZLEMENT',
    'EXTORTION',
    'FAMILY OFFENSES',
    'FORGERY/COUNTERFEITING',
    'FRAUD',
    'GAMBLING',
    'KIDNAPPING',
    'LARCENY/THEFT',
    'LIQUOR LAWS',
    'LOITERING',
    'MISSING PERSON',
    'NON-CRIMINAL',
    'OTHER OFFENSES',
    'PORNOGRAPHY/OBSCENE MAT',
    'PROSTITUTION',
    'RECOVERED VEHICLE',
    'ROBBERY',
    'RUNAWAY',
    'SECONDARY CODES',
    'SEX OFFENSES FORCIBLE',
    'SEX OFFENSES NON FORCIBLE',
    'STOLEN PROPERTY',
    'SUICIDE',
    'SUSPICIOUS OCC',
    'TREA',
    'TRESPASS',
    'VANDALISM',
    'VEHICLE THEFT',
    'WARRANTS',
    'WEAPON LAWS',
]
def plot_heatmap(matrix, title, labels, cmap='YlGnBu'):
    """Render ``matrix`` as an annotated seaborn heatmap and display it.

    Args:
        matrix: 2-D array of values; every cell is annotated with two decimals.
        title: Title placed above the figure.
        labels: Tick labels applied to both the x and the y axis.
        cmap: Colormap passed through to ``sns.heatmap``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    heatmap_options = {
        'annot': True,
        'fmt': '.2f',
        'cmap': cmap,
        'xticklabels': labels,
        'yticklabels': labels,
    }

    plt.figure(figsize=(25, 20))
    sns.heatmap(matrix, **heatmap_options)

    # Rows are the actual classes, columns the predicted ones.
    plt.title(title)
    plt.ylabel('Actual Class')
    plt.xlabel('Predicted Class')

    plt.tight_layout()
    plt.show()
def _safe_divide(numerator, denominator):
    """Return ``numerator / denominator`` element-wise, with 0.0 where the denominator is 0."""
    return np.divide(
        numerator,
        denominator,
        out=np.zeros(numerator.shape, dtype=float),
        where=denominator != 0,
    )


def plot_confusion_matrix(y_actuals, y_preds, labels=labels):
    """Plot the confusion, precision and recall matrices for a set of predictions.

    Args:
        y_actuals: True class labels.
        y_preds: Predicted class labels.
        labels: Tick labels for both heatmap axes; defaults to the module-level
            ``labels`` list.

    Returns:
        None. Three heatmaps are displayed through ``plot_heatmap``.
    """
    # Rows index the actual class, columns the predicted class.
    cmat = confusion_matrix(y_true=y_actuals, y_pred=y_preds)

    # keepdims=True keeps the totals 2-D so each broadcasts along the intended
    # axis. A 1-D row-sum vector would broadcast across columns and divide every
    # cell by the wrong class's total.
    predicted_totals = cmat.sum(axis=0, keepdims=True)  # shape (1, n_classes)
    actual_totals = cmat.sum(axis=1, keepdims=True)  # shape (n_classes, 1)

    # Precision matrix: each column sums to 1 (share of predictions of a class).
    # Classes that were never predicted get 0.0 instead of NaN from 0/0.
    pmat = _safe_divide(cmat, predicted_totals)

    # Recall matrix: each row sums to 1 (share of actual members of a class).
    rmat = _safe_divide(cmat, actual_totals)

    plot_heatmap(matrix=cmat, title='Confusion Matrix', labels=labels)
    plot_heatmap(matrix=pmat, title='Precision Matrix', labels=labels)
    plot_heatmap(matrix=rmat, title='Recall Matrix', labels=labels)
    return None
def dummy_classifier(X_train, y_train, X_valid, y_valid, X_test, y_test, show_plot=True):
    """Fit (or load from cache) a calibrated random baseline and report its log loss.

    The model is a ``DummyClassifier(strategy='uniform')`` wrapped in a sigmoid
    ``CalibratedClassifierCV``. It is cached as ``models/dummy_classifier.pkl``
    under ``project_path``; when that file exists it is loaded instead of
    refitting.

    Args:
        X_train, y_train: Training features and labels (used for fitting).
        X_valid, y_valid: Validation features and labels.
        X_test, y_test: Test features and labels.
        show_plot: When true, also plot the test-set confusion matrices and
            print the classification report.

    Returns:
        Tuple ``(train_log_loss, valid_log_loss, test_log_loss)``.
    """
    model_path = project_path + 'models/'
    model_file = model_path + 'dummy_classifier.pkl'

    if not os.path.isfile(model_file):
        clf = DummyClassifier(strategy='uniform')
        # CalibratedClassifierCV clones and refits the estimator inside its own
        # cross-validation, so fitting `clf` beforehand would be wasted work.
        sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
        sig_clf.fit(X_train, y_train)

        # Create the cache folder on first use and close the file deterministically.
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, 'wb') as model_fh:
            pickle.dump(sig_clf, model_fh)
    else:
        print('Model already exists.')
        # NOTE: unpickling executes code stored in the file; only load model
        # files that this project wrote itself.
        with open(model_file, 'rb') as model_fh:
            sig_clf = pickle.load(model_fh)

    # Evaluate the same model on each split, in a fixed order.
    split_losses = []
    for split_name, features, targets in (
        ('Training', X_train, y_train),
        ('Validation', X_valid, y_valid),
        ('Testing', X_test, y_test),
    ):
        split_loss = log_loss(targets, sig_clf.predict_proba(features))
        print(f"{split_name} Log Loss:", split_loss)
        split_losses.append(split_loss)
    train_log_loss, valid_log_loss, test_log_loss = split_losses

    if show_plot:
        predict_y = sig_clf.predict(X_test)
        plot_confusion_matrix(y_actuals=y_test, y_preds=predict_y)
        print(metrics.classification_report(y_test, predict_y))

    return train_log_loss, valid_log_loss, test_log_loss
# Baseline: calibrated uniform-random classifier.
dummy_train_loss, dummy_valid_loss, dummy_test_loss = dummy_classifier(
    X_train, y_train, X_valid, y_valid, X_test, y_test
)
# Recorded output of the call above (kept as a comment so the file stays valid Python):
#   Model already exists.
#   Training Log Loss: 2.6803311916005748
#   Validation Log Loss: 2.6803405273212917
#   Testing Log Loss: 2.680302884126909
#
#                               precision    recall  f1-score   support
#   ARSON                            0.00      0.00      0.00       303
#   ASSAULT                          0.00      0.00      0.00     15375
#   BAD CHECKS                       0.00      0.00      0.00        81
#   BRIBERY                          0.00      0.00      0.00        58
#   BURGLARY                         0.00      0.00      0.00      7351
#   DISORDERLY CONDUCT               0.00      0.00      0.00       864
#   DRIVING UNDER THE INFLUENCE      0.00      0.00      0.00       454
#   DRUG/NARCOTIC                    0.00      0.00      0.00     10794
#   DRUNKENNESS                      0.00      0.00      0.00       856
#   EMBEZZLEMENT                     0.00      0.00      0.00       233
#   EXTORTION                        0.00      0.00      0.00        51
#   FAMILY OFFENSES                  0.00      0.00      0.00        98
#   FORGERY/COUNTERFEITING           0.00      0.00      0.00      2122
#   FRAUD                            0.00      0.00      0.00      3336
#   GAMBLING                         0.00      0.00      0.00        29
#   KIDNAPPING                       0.00      0.00      0.00       468
#   LARCENY/THEFT                    0.20      1.00      0.33     34980
#   LIQUOR LAWS                      0.00      0.00      0.00       381
#   LOITERING                        0.00      0.00      0.00       245
#   MISSING PERSON                   0.00      0.00      0.00      5198
#   NON-CRIMINAL                     0.00      0.00      0.00     18461
#   OTHER OFFENSES                   0.00      0.00      0.00     25236
#   PORNOGRAPHY/OBSCENE MAT          0.00      0.00      0.00         4
#   PROSTITUTION                     0.00      0.00      0.00      1497
#   RECOVERED VEHICLE                0.00      0.00      0.00       628
#   ROBBERY                          0.00      0.00      0.00      4600
#   RUNAWAY                          0.00      0.00      0.00       389
#   SECONDARY CODES                  0.00      0.00      0.00      1997
#   SEX OFFENSES FORCIBLE            0.00      0.00      0.00       878
#   SEX OFFENSES NON FORCIBLE        0.00      0.00      0.00        29
#   STOLEN PROPERTY                  0.00      0.00      0.00       908
#   SUICIDE                          0.00      0.00      0.00       102
#   SUSPICIOUS OCC                   0.00      0.00      0.00      6283
#   TREA                             0.00      0.00      0.00         1
#   TRESPASS                         0.00      0.00      0.00      1465
#   VANDALISM                        0.00      0.00      0.00      8945
#   VEHICLE THEFT                    0.00      0.00      0.00     10756
#   WARRANTS                         0.00      0.00      0.00      8443
#   WEAPON LAWS                      0.00      0.00      0.00      1711
#   accuracy                                             0.20    175610
#   macro avg                        0.01      0.03      0.01    175610
#   weighted avg                     0.04      0.20      0.07    175610
def log_reg_classifier(X_train, y_train, X_valid, y_valid, X_test, y_test, show_plot=True):
    """Fit (or load from cache) a calibrated logistic regression and report its log loss.

    The model is an L2-penalised ``LogisticRegression`` (``C=30``) wrapped in a
    sigmoid ``CalibratedClassifierCV``. It is cached as
    ``models/log_reg_classifier.pkl`` under ``project_path``; when that file
    exists it is loaded instead of refitting.

    Args:
        X_train, y_train: Training features and labels (used for fitting).
        X_valid, y_valid: Validation features and labels.
        X_test, y_test: Test features and labels.
        show_plot: When true, also plot the test-set confusion matrices and
            print the classification report.

    Returns:
        Tuple ``(train_log_loss, valid_log_loss, test_log_loss)``.
    """
    model_path = project_path + 'models/'
    model_file = model_path + 'log_reg_classifier.pkl'

    if not os.path.isfile(model_file):
        clf = LogisticRegression(penalty='l2', C=30, n_jobs=-1)
        # CalibratedClassifierCV clones and refits the estimator inside its own
        # cross-validation, so fitting `clf` beforehand would only add one more
        # full logistic-regression fit whose result is never used.
        sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
        sig_clf.fit(X_train, y_train)

        # Create the cache folder on first use and close the file deterministically.
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, 'wb') as model_fh:
            pickle.dump(sig_clf, model_fh)
    else:
        print('Model already exists.')
        # NOTE: unpickling executes code stored in the file; only load model
        # files that this project wrote itself.
        with open(model_file, 'rb') as model_fh:
            sig_clf = pickle.load(model_fh)

    # Evaluate the same model on each split, in a fixed order.
    split_losses = []
    for split_name, features, targets in (
        ('Training', X_train, y_train),
        ('Validation', X_valid, y_valid),
        ('Testing', X_test, y_test),
    ):
        split_loss = log_loss(targets, sig_clf.predict_proba(features))
        print(f"{split_name} Log Loss:", split_loss)
        split_losses.append(split_loss)
    train_log_loss, valid_log_loss, test_log_loss = split_losses

    if show_plot:
        predict_y = sig_clf.predict(X_test)
        plot_confusion_matrix(y_actuals=y_test, y_preds=predict_y)
        print(metrics.classification_report(y_test, predict_y))

    return train_log_loss, valid_log_loss, test_log_loss
# Calibrated logistic regression.
lr_train_loss, lr_valid_loss, lr_test_loss = log_reg_classifier(
    X_train, y_train, X_valid, y_valid, X_test, y_test
)
# Recorded output of the call above (kept as a comment so the file stays valid Python):
#   Model already exists.
#   Training Log Loss: 2.500360732705917
#   Validation Log Loss: 2.496850823942772
#   Testing Log Loss: 2.4981216691332824