Learn data science and machine learning by building real-world projects on Jovian

San Francisco Crime Classification - Modelling

Import Packages

import warnings
warnings.filterwarnings('ignore')

import pickle
import os
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from tabulate import tabulate

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier

Data Reading

# Root folder for the CSV inputs (and, further down, the pickled models).
# Presumably a Google Drive mount inside Colab, judging by the path — confirm.
project_path = '/content/drive/MyDrive/AAIC/SCS-1/sf_crime_classification/'
# Feature tables for the training and test sets.
train_sf_df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/train_data_features.csv')
test_sf_df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/test_data_features.csv')
# train.csv is used here only for its 'Category' column, which becomes the target.
df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/train.csv')
# NOTE(review): the assignment aligns on the index, so this assumes the feature
# file and train.csv list their rows in the same order — confirm.
train_sf_df['category'] = df['Category']
# Remove the 'id' column from the test features.
test_sf_df = test_sf_df.drop(columns=['id'])
# Notebook display: (rows, columns) of the training frame, target included.
train_sf_df.shape
(878049, 131)
# Notebook display: (rows, columns) of the test frame after dropping 'id'.
test_sf_df.shape
(884262, 130)

Stratification Split

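  • The data is highly imbalanced, so the train/validation/test splits below are stratified on the category label to keep every class represented in each split.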
# Separate the target column from the feature columns.
# `data` is an alias of train_sf_df (no copy); drop() returns a new frame,
# so train_sf_df itself keeps its 'category' column.
data = train_sf_df
y = data['category']
X = data.drop(columns=['category'])

Data Preprocessing

def preprocess_data(X):
    """Standardise ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and then discarded; only the
    transformed data is returned, so the same scaling cannot be re-applied
    to other data afterwards.

    Args:
        X: Feature table accepted by ``StandardScaler.fit``.

    Returns:
        The transformed features as returned by ``StandardScaler.transform``.
    """
    return StandardScaler().fit(X).transform(X)
# NOTE(review): scaling runs before the train/validation/test split, so the
# scaler's statistics are fitted on rows that later land in the held-out sets.
X_norm = preprocess_data(X=X)

Data Splitting

# Fixed seed so the train/validation/test partition is identical on every run.
# The classifiers in this notebook are cached as pickles and reloaded when the
# file exists; with an unseeded split a cached model would be scored against a
# different partition from the one it was fitted on, leaking its training rows
# into the validation/test sets.
SPLIT_SEED = 42
# 80/20 stratified split into (train + validation) and test ...
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y, stratify=y, test_size=0.20, random_state=SPLIT_SEED
)
# ... then another 80/20 stratified split of the remainder into train and validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.20, random_state=SPLIT_SEED
)
# Notebook display: number of distinct classes present in each split.
len(y_train.value_counts()), len(y_valid.value_counts()), len(y_test.value_counts())
(39, 39, 39)
# Report the dimensions of each split, one line per split.
split_shapes = (
    ("Training data shape : ", X_train),
    ("Validation data shape : ", X_valid),
    ("Test data shape : ", X_test),
)
for caption, split in split_shapes:
    print(caption, split.shape)
Training data shape : (561951, 130) Validation data shape : (140488, 130) Test data shape : (175610, 130)

Labels for Confusion Matrix

# Crime category names used as the tick labels of the confusion, precision and
# recall heatmaps below.
# NOTE(review): the list is in alphabetical order, which presumably matches the
# row/column order of the matrix returned by confusion_matrix — confirm.
labels = [
    'ARSON',
    'ASSAULT',
    'BAD CHECKS',
    'BRIBERY',
    'BURGLARY',
    'DISORDERLY CONDUCT',
    'DRIVING UNDER THE INFLUENCE',
    'DRUG/NARCOTIC',
    'DRUNKENNESS',
    'EMBEZZLEMENT',
    'EXTORTION',
    'FAMILY OFFENSES',
    'FORGERY/COUNTERFEITING',
    'FRAUD',
    'GAMBLING',
    'KIDNAPPING',
    'LARCENY/THEFT',
    'LIQUOR LAWS',
    'LOITERING',
    'MISSING PERSON',
    'NON-CRIMINAL',
    'OTHER OFFENSES',
    'PORNOGRAPHY/OBSCENE MAT',
    'PROSTITUTION',
    'RECOVERED VEHICLE',
    'ROBBERY',
    'RUNAWAY',
    'SECONDARY CODES',
    'SEX OFFENSES FORCIBLE',
    'SEX OFFENSES NON FORCIBLE',
    'STOLEN PROPERTY',
    'SUICIDE',
    'SUSPICIOUS OCC',
    'TREA',
    'TRESPASS',
    'VANDALISM',
    'VEHICLE THEFT',
    'WARRANTS',
    'WEAPON LAWS'
 ]

Confusion, Precision, and Recall Matrices

def plot_heatmap(matrix, title, labels, cmap='YlGnBu'):
    """Draw ``matrix`` as an annotated heatmap on a new 25x20 figure and show it.

    Args:
        matrix: 2-D array of values; each cell is annotated with two decimals.
        title: Text placed above the plot.
        labels: Tick labels applied to both the x and the y axis.
        cmap: Colour map name handed to seaborn.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(25, 20))
    # seaborn draws on the current axes and hands them back, so the axis
    # labels and title can be set on that object directly.
    axes = sns.heatmap(
        matrix,
        annot=True,
        cmap=cmap,
        fmt='.2f',
        xticklabels=labels,
        yticklabels=labels,
    )
    axes.set_xlabel('Predicted Class')
    axes.set_ylabel('Actual Class')
    axes.set_title(title)
    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_actuals, y_preds, labels=labels):
    """Plot the confusion, precision and recall matrices as three heatmaps.

    Rows of the confusion matrix are actual classes and columns are predicted
    classes. The precision matrix normalises every column by its total (all
    predictions of that class); the recall matrix normalises every row by its
    total (all actual members of that class).

    Args:
        y_actuals: True class labels.
        y_preds: Predicted class labels, same length as ``y_actuals``.
        labels: Tick labels for both heatmap axes.

    Returns:
        None. Three figures are displayed.
    """
    cmat = confusion_matrix(y_true=y_actuals, y_pred=y_preds)

    # keepdims keeps the totals 2-D so they broadcast along the intended axis:
    # (1, n) divides each column, (n, 1) divides each row. Without it the row
    # totals would broadcast across columns and the recall matrix would be wrong.
    predicted_totals = cmat.sum(axis=0, keepdims=True)
    actual_totals = cmat.sum(axis=1, keepdims=True)

    # Classes that were never predicted (or never occur) have a zero total;
    # leave their cells at 0 instead of producing NaN from 0/0.
    pmat = np.divide(
        cmat,
        predicted_totals,
        out=np.zeros(cmat.shape, dtype=float),
        where=predicted_totals != 0,
    )
    rmat = np.divide(
        cmat,
        actual_totals,
        out=np.zeros(cmat.shape, dtype=float),
        where=actual_totals != 0,
    )

    plot_heatmap(matrix=cmat, title='Confusion Matrix', labels=labels)
    plot_heatmap(matrix=pmat, title='Precision Matrix', labels=labels)
    plot_heatmap(matrix=rmat, title='Recall Matrix', labels=labels)

    return None

1. Dummy Classifier

def dummy_classifier(X_train, y_train, X_valid, y_valid, X_test, y_test, show_plot=True):
    """Fit (or load from disk) a calibrated uniform-random baseline and evaluate it.

    If ``models/dummy_classifier.pkl`` already exists under ``project_path`` the
    pickled model is loaded instead of being refitted; otherwise a
    ``DummyClassifier(strategy='uniform')`` wrapped in a sigmoid
    ``CalibratedClassifierCV`` is fitted on the training data and pickled.

    Args:
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.
        X_test, y_test: Test features and labels.
        show_plot: When true, plot the confusion/precision/recall heatmaps for
            the test predictions.

    Returns:
        Tuple ``(train_log_loss, valid_log_loss, test_log_loss)``.
    """
    model_file = project_path + 'models/' + 'dummy_classifier.pkl'

    if not os.path.isfile(model_file):
        clf = DummyClassifier(strategy='uniform')
        # NOTE(review): CalibratedClassifierCV is built with its default cv, which
        # per the scikit-learn docs refits clones of the estimator, so this
        # standalone fit may be redundant — confirm before removing.
        clf.fit(X=X_train, y=y_train)

        sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
        sig_clf.fit(X_train, y_train)

        # Context manager guarantees the file handle is closed after writing.
        with open(model_file, 'wb') as model_handle:
            pickle.dump(sig_clf, model_handle)
    else:
        print('Model already exists.')
        with open(model_file, 'rb') as model_handle:
            sig_clf = pickle.load(model_handle)

    y_train_preds = sig_clf.predict_proba(X_train)
    train_log_loss = log_loss(y_train, y_train_preds)
    print("Training Log Loss:", train_log_loss)

    y_valid_preds = sig_clf.predict_proba(X_valid)
    valid_log_loss = log_loss(y_valid, y_valid_preds)
    print("Validation Log Loss:", valid_log_loss)

    y_test_preds = sig_clf.predict_proba(X_test)
    test_log_loss = log_loss(y_test, y_test_preds)
    print("Testing Log Loss:", test_log_loss)

    # Predict unconditionally: the classification report below needs the
    # predictions even when plotting is switched off.
    predict_y = sig_clf.predict(X_test)
    if show_plot:
        plot_confusion_matrix(y_actuals=y_test, y_preds=predict_y)

    report = metrics.classification_report(y_test, predict_y)
    print(report)

    return train_log_loss, valid_log_loss, test_log_loss
# Evaluate the dummy baseline and keep its train/validation/test log losses.
dummy_train_loss, dummy_valid_loss, dummy_test_loss = dummy_classifier(X_train, y_train, X_valid, y_valid, X_test, y_test)
Model already exists. Training Log Loss: 2.6803311916005748 Validation Log Loss: 2.6803405273212917 Testing Log Loss: 2.680302884126909
Notebook Image
Notebook Image
Notebook Image
precision recall f1-score support ARSON 0.00 0.00 0.00 303 ASSAULT 0.00 0.00 0.00 15375 BAD CHECKS 0.00 0.00 0.00 81 BRIBERY 0.00 0.00 0.00 58 BURGLARY 0.00 0.00 0.00 7351 DISORDERLY CONDUCT 0.00 0.00 0.00 864 DRIVING UNDER THE INFLUENCE 0.00 0.00 0.00 454 DRUG/NARCOTIC 0.00 0.00 0.00 10794 DRUNKENNESS 0.00 0.00 0.00 856 EMBEZZLEMENT 0.00 0.00 0.00 233 EXTORTION 0.00 0.00 0.00 51 FAMILY OFFENSES 0.00 0.00 0.00 98 FORGERY/COUNTERFEITING 0.00 0.00 0.00 2122 FRAUD 0.00 0.00 0.00 3336 GAMBLING 0.00 0.00 0.00 29 KIDNAPPING 0.00 0.00 0.00 468 LARCENY/THEFT 0.20 1.00 0.33 34980 LIQUOR LAWS 0.00 0.00 0.00 381 LOITERING 0.00 0.00 0.00 245 MISSING PERSON 0.00 0.00 0.00 5198 NON-CRIMINAL 0.00 0.00 0.00 18461 OTHER OFFENSES 0.00 0.00 0.00 25236 PORNOGRAPHY/OBSCENE MAT 0.00 0.00 0.00 4 PROSTITUTION 0.00 0.00 0.00 1497 RECOVERED VEHICLE 0.00 0.00 0.00 628 ROBBERY 0.00 0.00 0.00 4600 RUNAWAY 0.00 0.00 0.00 389 SECONDARY CODES 0.00 0.00 0.00 1997 SEX OFFENSES FORCIBLE 0.00 0.00 0.00 878 SEX OFFENSES NON FORCIBLE 0.00 0.00 0.00 29 STOLEN PROPERTY 0.00 0.00 0.00 908 SUICIDE 0.00 0.00 0.00 102 SUSPICIOUS OCC 0.00 0.00 0.00 6283 TREA 0.00 0.00 0.00 1 TRESPASS 0.00 0.00 0.00 1465 VANDALISM 0.00 0.00 0.00 8945 VEHICLE THEFT 0.00 0.00 0.00 10756 WARRANTS 0.00 0.00 0.00 8443 WEAPON LAWS 0.00 0.00 0.00 1711 accuracy 0.20 175610 macro avg 0.01 0.03 0.01 175610 weighted avg 0.04 0.20 0.07 175610

2. Logistic Regression

def log_reg_classifier(X_train, y_train, X_valid, y_valid, X_test, y_test, show_plot=True):
    """Fit (or load from disk) a calibrated logistic regression and evaluate it.

    If ``models/log_reg_classifier.pkl`` already exists under ``project_path``
    the pickled model is loaded instead of being refitted; otherwise an
    L2-penalised ``LogisticRegression(C=30)`` wrapped in a sigmoid
    ``CalibratedClassifierCV`` is fitted on the training data and pickled.

    Args:
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.
        X_test, y_test: Test features and labels.
        show_plot: When true, plot the confusion/precision/recall heatmaps for
            the test predictions.

    Returns:
        Tuple ``(train_log_loss, valid_log_loss, test_log_loss)``.
    """
    model_file = project_path + 'models/' + 'log_reg_classifier.pkl'

    if not os.path.isfile(model_file):
        clf = LogisticRegression(penalty='l2', C=30, n_jobs=-1)
        # NOTE(review): CalibratedClassifierCV is built with its default cv, which
        # per the scikit-learn docs refits clones of the estimator, so this
        # standalone fit may be redundant — confirm before removing.
        clf.fit(X_train, y_train)

        sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
        sig_clf.fit(X_train, y_train)

        # Context manager guarantees the file handle is closed after writing.
        with open(model_file, 'wb') as model_handle:
            pickle.dump(sig_clf, model_handle)
    else:
        print('Model already exists.')
        with open(model_file, 'rb') as model_handle:
            sig_clf = pickle.load(model_handle)

    y_train_preds = sig_clf.predict_proba(X_train)
    train_log_loss = log_loss(y_train, y_train_preds)
    print("Training Log Loss:", train_log_loss)

    y_valid_preds = sig_clf.predict_proba(X_valid)
    valid_log_loss = log_loss(y_valid, y_valid_preds)
    print("Validation Log Loss:", valid_log_loss)

    y_test_preds = sig_clf.predict_proba(X_test)
    test_log_loss = log_loss(y_test, y_test_preds)
    print("Testing Log Loss:", test_log_loss)

    # Predict unconditionally: the classification report below needs the
    # predictions even when plotting is switched off.
    predict_y = sig_clf.predict(X_test)
    if show_plot:
        plot_confusion_matrix(y_actuals=y_test, y_preds=predict_y)

    report = metrics.classification_report(y_test, predict_y)
    print(report)

    return train_log_loss, valid_log_loss, test_log_loss
# Evaluate the logistic-regression model and keep its train/validation/test log losses.
lr_train_loss, lr_valid_loss, lr_test_loss = log_reg_classifier(X_train, y_train, X_valid, y_valid, X_test, y_test)
Model already exists. Training Log Loss: 2.500360732705917 Validation Log Loss: 2.496850823942772 Testing Log Loss: 2.4981216691332824
Notebook Image