# Setting the path to my local directory where my dataset is placed
import os
os.chdir('E:\\Finale Project\\Python Project')
print (os.getcwd())
E:\Finale Project\Python Project
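A more portable sketch of the same step uses pathlib, avoiding escaped backslashes (the directory is taken from the cell above):
# Optional sketch: change the working directory with pathlib and a raw-string path
from pathlib import Path
project_dir = Path(r'E:\Finale Project\Python Project')
os.chdir(project_dir)
print(Path.cwd())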
# Importing necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing,metrics
%matplotlib inline
from IPython.display import Image
from matplotlib import rcParams
rcParams['figure.figsize'] = 11,6
# Loading the data from txt file
data = pd.read_csv('XYZCorp_LendingData.txt',sep="\t",low_memory=False)
data.info()
data.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855969 entries, 0 to 855968
Data columns (total 73 columns):
id 855969 non-null int64
member_id 855969 non-null int64
loan_amnt 855969 non-null float64
funded_amnt 855969 non-null float64
funded_amnt_inv 855969 non-null float64
term 855969 non-null object
int_rate 855969 non-null float64
installment 855969 non-null float64
grade 855969 non-null object
sub_grade 855969 non-null object
emp_title 806526 non-null object
emp_length 812908 non-null object
home_ownership 855969 non-null object
annual_inc 855969 non-null float64
verification_status 855969 non-null object
issue_d 855969 non-null object
pymnt_plan 855969 non-null object
desc 121812 non-null object
purpose 855969 non-null object
title 855936 non-null object
zip_code 855969 non-null object
addr_state 855969 non-null object
dti 855969 non-null float64
delinq_2yrs 855969 non-null float64
earliest_cr_line 855969 non-null object
inq_last_6mths 855969 non-null float64
mths_since_last_delinq 416157 non-null float64
mths_since_last_record 131184 non-null float64
open_acc 855969 non-null float64
pub_rec 855969 non-null float64
revol_bal 855969 non-null float64
revol_util 855523 non-null float64
total_acc 855969 non-null float64
initial_list_status 855969 non-null object
out_prncp 855969 non-null float64
out_prncp_inv 855969 non-null float64
total_pymnt 855969 non-null float64
total_pymnt_inv 855969 non-null float64
total_rec_prncp 855969 non-null float64
total_rec_int 855969 non-null float64
total_rec_late_fee 855969 non-null float64
recoveries 855969 non-null float64
collection_recovery_fee 855969 non-null float64
last_pymnt_d 847107 non-null object
last_pymnt_amnt 855969 non-null float64
next_pymnt_d 602998 non-null object
last_credit_pull_d 855919 non-null object
collections_12_mths_ex_med 855913 non-null float64
mths_since_last_major_derog 213139 non-null float64
policy_code 855969 non-null float64
application_type 855969 non-null object
annual_inc_joint 442 non-null float64
dti_joint 440 non-null float64
verification_status_joint 442 non-null object
acc_now_delinq 855969 non-null float64
tot_coll_amt 788656 non-null float64
tot_cur_bal 788656 non-null float64
open_acc_6m 13288 non-null float64
open_il_6m 13288 non-null float64
open_il_12m 13288 non-null float64
open_il_24m 13288 non-null float64
mths_since_rcnt_il 12934 non-null float64
total_bal_il 13288 non-null float64
il_util 11609 non-null float64
open_rv_12m 13288 non-null float64
open_rv_24m 13288 non-null float64
max_bal_bc 13288 non-null float64
all_util 13288 non-null float64
total_rev_hi_lim 788656 non-null float64
inq_fi 13288 non-null float64
total_cu_tl 13288 non-null float64
inq_last_12m 13288 non-null float64
default_ind 855969 non-null int64
dtypes: float64(49), int64(3), object(21)
memory usage: 476.7+ MB
# Plot Loan Status
plt.figure(figsize= (12,6))
plt.ylabel('Loan Status')
plt.xlabel('Count')
data['default_ind'].value_counts().plot(kind = 'barh', grid = True)
plt.show()
# Plot loan status distribution with percentage labels
total = len(data)
plt.figure(figsize = (14,6))
g = sns.countplot(x="default_ind", data=data,
                  color='blue')
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_xlabel("Loan Status Categories", fontsize=12)
g.set_ylabel("Count", fontsize=15)
g.set_title("Loan Status Types Distribution", fontsize=20)
sizes=[]
for p in g.patches:
    height = p.get_height()
    sizes.append(height)
    g.text(p.get_x()+p.get_width()/2.,
           height + 3,
           '{:1.2f}%'.format(height/total*100),
           ha="center", fontsize=12)
g.set_ylim(0, max(sizes) * 1.10)
plt.show()
data.default_ind.value_counts()
0 809502
1 46467
Name: default_ind, dtype: int64
# Data Dimension
data.shape
(855969, 73)
# Drop these features for now
data.drop([ 'id',
'member_id',
'emp_title',
'title',
# 'url',
'zip_code',
'verification_status',
'home_ownership',
'issue_d',
'earliest_cr_line',
'last_pymnt_d',
'next_pymnt_d',
'desc',
# 'pymnt_plan',
# 'initial_list_status',
# 'addr_state',
'last_credit_pull_d',
], axis=1, inplace=True)
data.drop(['verification_status_joint'], axis=1, inplace=True)
# Show records number
data.count().sort_values()
dti_joint 440
annual_inc_joint 442
il_util 11609
mths_since_rcnt_il 12934
open_acc_6m 13288
inq_last_12m 13288
open_il_6m 13288
open_il_12m 13288
total_bal_il 13288
open_il_24m 13288
open_rv_12m 13288
open_rv_24m 13288
max_bal_bc 13288
all_util 13288
inq_fi 13288
total_cu_tl 13288
mths_since_last_record 131184
mths_since_last_major_derog 213139
mths_since_last_delinq 416157
total_rev_hi_lim 788656
tot_cur_bal 788656
tot_coll_amt 788656
emp_length 812908
revol_util 855523
collections_12_mths_ex_med 855913
last_pymnt_amnt 855969
loan_amnt 855969
application_type 855969
acc_now_delinq 855969
policy_code 855969
collection_recovery_fee 855969
total_rec_int 855969
total_rec_late_fee 855969
funded_amnt 855969
funded_amnt_inv 855969
term 855969
int_rate 855969
installment 855969
grade 855969
sub_grade 855969
annual_inc 855969
pymnt_plan 855969
purpose 855969
addr_state 855969
recoveries 855969
dti 855969
inq_last_6mths 855969
open_acc 855969
pub_rec 855969
revol_bal 855969
total_acc 855969
initial_list_status 855969
out_prncp 855969
out_prncp_inv 855969
total_pymnt 855969
total_pymnt_inv 855969
total_rec_prncp 855969
delinq_2yrs 855969
default_ind 855969
dtype: int64
# Drop columns with less than 25% non-null data
lack_of_data_cols = data.columns[data.count() < len(data) * 0.25]
data.drop(lack_of_data_cols, axis=1, inplace=True)
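An equivalent, more idiomatic alternative (not run here) is DataFrame.dropna with a thresh argument, which keeps only columns holding at least that many non-null values:
# Sketch: drop columns with less than 25% non-null values in a single call
data.dropna(axis=1, thresh=int(0.25 * len(data)), inplace=True)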
# After Deletion
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855969 entries, 0 to 855968
Data columns (total 41 columns):
loan_amnt 855969 non-null float64
funded_amnt 855969 non-null float64
funded_amnt_inv 855969 non-null float64
term 855969 non-null object
int_rate 855969 non-null float64
installment 855969 non-null float64
grade 855969 non-null object
sub_grade 855969 non-null object
emp_length 812908 non-null object
annual_inc 855969 non-null float64
pymnt_plan 855969 non-null object
purpose 855969 non-null object
addr_state 855969 non-null object
dti 855969 non-null float64
delinq_2yrs 855969 non-null float64
inq_last_6mths 855969 non-null float64
mths_since_last_delinq 416157 non-null float64
open_acc 855969 non-null float64
pub_rec 855969 non-null float64
revol_bal 855969 non-null float64
revol_util 855523 non-null float64
total_acc 855969 non-null float64
initial_list_status 855969 non-null object
out_prncp 855969 non-null float64
out_prncp_inv 855969 non-null float64
total_pymnt 855969 non-null float64
total_pymnt_inv 855969 non-null float64
total_rec_prncp 855969 non-null float64
total_rec_int 855969 non-null float64
total_rec_late_fee 855969 non-null float64
recoveries 855969 non-null float64
collection_recovery_fee 855969 non-null float64
last_pymnt_amnt 855969 non-null float64
collections_12_mths_ex_med 855913 non-null float64
policy_code 855969 non-null float64
application_type 855969 non-null object
acc_now_delinq 855969 non-null float64
tot_coll_amt 788656 non-null float64
tot_cur_bal 788656 non-null float64
total_rev_hi_lim 788656 non-null float64
default_ind 855969 non-null int64
dtypes: float64(31), int64(1), object(9)
memory usage: 267.8+ MB
print (data.mths_since_last_delinq.min(), data.mths_since_last_delinq.max())
print(data.mths_since_last_delinq.mean())
print(data.mths_since_last_delinq.median())
0.0 188.0
34.14994341078007
31.0
data.mths_since_last_delinq = data.mths_since_last_delinq.fillna(data.mths_since_last_delinq.median())
data.dropna(inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 747260 entries, 39694 to 855968
Data columns (total 41 columns):
loan_amnt 747260 non-null float64
funded_amnt 747260 non-null float64
funded_amnt_inv 747260 non-null float64
term 747260 non-null object
int_rate 747260 non-null float64
installment 747260 non-null float64
grade 747260 non-null object
sub_grade 747260 non-null object
emp_length 747260 non-null object
annual_inc 747260 non-null float64
pymnt_plan 747260 non-null object
purpose 747260 non-null object
addr_state 747260 non-null object
dti 747260 non-null float64
delinq_2yrs 747260 non-null float64
inq_last_6mths 747260 non-null float64
mths_since_last_delinq 747260 non-null float64
open_acc 747260 non-null float64
pub_rec 747260 non-null float64
revol_bal 747260 non-null float64
revol_util 747260 non-null float64
total_acc 747260 non-null float64
initial_list_status 747260 non-null object
out_prncp 747260 non-null float64
out_prncp_inv 747260 non-null float64
total_pymnt 747260 non-null float64
total_pymnt_inv 747260 non-null float64
total_rec_prncp 747260 non-null float64
total_rec_int 747260 non-null float64
total_rec_late_fee 747260 non-null float64
recoveries 747260 non-null float64
collection_recovery_fee 747260 non-null float64
last_pymnt_amnt 747260 non-null float64
collections_12_mths_ex_med 747260 non-null float64
policy_code 747260 non-null float64
application_type 747260 non-null object
acc_now_delinq 747260 non-null float64
tot_coll_amt 747260 non-null float64
tot_cur_bal 747260 non-null float64
total_rev_hi_lim 747260 non-null float64
default_ind 747260 non-null int64
dtypes: float64(31), int64(1), object(9)
memory usage: 239.4+ MB
# Calculate the percentage of good loans (default_ind == 0)
good_loan = len(data[(data.default_ind == 0)])
print ('Good/Bad Loan Ratio: %.2f%%' % (good_loan/len(data)*100))
Good/Bad Loan Ratio: 95.42%
# Create a good/bad loan indicator feature
data['good_loan'] = np.where((data.default_ind == 0) , 1, 0)
# Hot encode some categorical features
columns = ['term', 'grade', 'sub_grade', 'emp_length', 'purpose', 'application_type','addr_state',
'pymnt_plan', 'initial_list_status']
for col in columns:
tmp_df = pd.get_dummies(data[col], prefix=col)
data = pd.concat((data, tmp_df), axis=1)
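As an aside, pd.get_dummies can encode, concatenate, and drop the originals in one call; a sketch of the equivalent (the data_encoded name is illustrative, and this would make the drop cell below unnecessary):
# One-step alternative: encode the listed columns and drop the originals automatically
data_encoded = pd.get_dummies(data, columns=columns, prefix=columns)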
# drop attributes that we hot-encoded
data.drop([#'loan_status',
'term',
'grade',
'sub_grade',
'emp_length',
'addr_state',
'initial_list_status',
'pymnt_plan',
'purpose',
'application_type'], axis=1, inplace=True)
# drop attributes that we hot-encoded
data.drop(['default_ind'], axis=1, inplace=True)
# Rename some features to concur w/ some algorithms
data = data.rename(columns= {'emp_length_< 1 year':'emp_length_lt_1 year',
'emp_length_n/a':'emp_length_na'})
# Due to resource limitations, we limit the data to the first 10,000 records.
data = data[:10000]
data.head()
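Taking the first 10,000 rows of what may be a time-ordered file can bias the class mix; a stratified random sample is an alternative. A minimal sketch using train_test_split (the data_sampled name is illustrative; it assumes it is applied to the full frame before the slice above):
# Hypothetical alternative to the head-slice: a class-stratified random sample of 10,000 rows
from sklearn.model_selection import train_test_split
data_sampled, _ = train_test_split(data, train_size=10000,
                                   stratify=data['good_loan'], random_state=44)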
# Split Train/Test data
from sklearn.model_selection import train_test_split
y = data['good_loan']
X = data.loc[:, data.columns != 'good_loan']
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=44)
!pip install sklearn
Collecting sklearn
Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Requirement already satisfied: scikit-learn in c:\programdata\anaconda3\lib\site-packages (from sklearn) (0.20.1)
Requirement already satisfied: numpy>=1.8.2 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn->sklearn) (1.15.4)
Requirement already satisfied: scipy>=0.13.3 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn->sklearn) (1.1.0)
Building wheels for collected packages: sklearn
Running setup.py bdist_wheel for sklearn: started
Running setup.py bdist_wheel for sklearn: finished with status 'done'
Stored in directory: C:\Users\P RAJ\AppData\Local\pip\Cache\wheels\76\03\bb\589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0
# Bring in evaluator
import sklearn.metrics as mt
from sklearn.model_selection import cross_val_score
# Scale the features
from sklearn.preprocessing import StandardScaler, RobustScaler
#std_scaler = StandardScaler()
rob_scaler = RobustScaler()
#X_train_S = std_scaler.fit_transform(X_train)
#X_test_S = std_scaler.transform(X_test)
# Use a robust scaler to reduce the influence of outliers
X_train_R = rob_scaler.fit_transform(X_train)
X_test_R = rob_scaler.transform(X_test)
from sklearn.svm import SVC
# Weight classes inversely to their frequency in the training data
y_0 = len(y_train[y_train == 0])/len(y_train)
y_1 = 1 - y_0
svm_clf = SVC(class_weight={0:y_1, 1:y_0})
svm_clf.fit(X_train_R, y_train)
svm_predictions = svm_clf.predict(X_test_R) # Save prediction
#print(svm_clf.score(X_test_R, y_test))
scores = cross_val_score(svm_clf, X_test_R, y_test, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))
print(mt.classification_report(y_test, svm_predictions))
print(mt.confusion_matrix(y_test, svm_predictions))
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
"avoid this warning.", FutureWarning)
[0.8553616 0.8725 0.8825 0.88 0.86215539]
Accuracy: 0.87 (+/- 0.02)
precision recall f1-score support
0 0.42 0.79 0.55 164
1 0.98 0.90 0.94 1836
micro avg 0.90 0.90 0.90 2000
macro avg 0.70 0.85 0.75 2000
weighted avg 0.93 0.90 0.91 2000
[[ 130 34]
[ 176 1660]]
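The manual y_0/y_1 weights above approximate what scikit-learn derives itself when class_weight='balanced' is passed; a sketch of that variant:
# Sketch: let SVC derive class weights from the training labels instead of manual y_0/y_1
svm_bal = SVC(gamma='auto', class_weight='balanced')
svm_bal.fit(X_train_R, y_train)
print(mt.classification_report(y_test, svm_bal.predict(X_test_R)))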
!pip install joblib
Requirement already satisfied: joblib in c:\programdata\anaconda3\lib\site-packages (0.13.2)
# SVM - SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(k_neighbors=10, random_state=44, kind = 'svm')
X_res_train, y_res_train = sm.fit_sample(X_train_R, y_train)
svm_sm_clf = SVC()
svm_sm_clf.fit(X_res_train, y_res_train)
svm_sm_predictions = svm_sm_clf.predict(X_test_R)
#print(svm_sm_clf.score(X_test_R, y_test))
scores = cross_val_score(svm_sm_clf, X_test_R, y_test, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))
print(mt.classification_report(y_test, svm_sm_predictions))
print(mt.confusion_matrix(y_test, svm_sm_predictions))
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
"avoid this warning.", FutureWarning)
[0.93266833 0.925 0.9275 0.9225 0.93233083]
Accuracy: 0.93 (+/- 0.01)
precision recall f1-score support
0 0.42 0.79 0.55 164
1 0.98 0.90 0.94 1836
micro avg 0.90 0.90 0.90 2000
macro avg 0.70 0.85 0.75 2000
weighted avg 0.93 0.90 0.91 2000
[[ 130 34]
[ 176 1660]]
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 20)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
#print(rf.score(X_test, y_test))
scores = cross_val_score(rf, X_test, y_test, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))
print(mt.classification_report(y_test, rf_predictions))
print(mt.confusion_matrix(y_test, rf_predictions))
[0.97506234 0.98 0.9825 0.98 0.97994987]
Accuracy: 0.98 (+/- 0.00)
precision recall f1-score support
0 1.00 0.92 0.96 164
1 0.99 1.00 1.00 1836
micro avg 0.99 0.99 0.99 2000
macro avg 1.00 0.96 0.98 2000
weighted avg 0.99 0.99 0.99 2000
[[ 151 13]
[ 0 1836]]
!pip install xgboost
Collecting xgboost
Downloading https://files.pythonhosted.org/packages/5e/49/b95c037b717b4ceadc76b6e164603471225c27052d1611d5a2e832757945/xgboost-0.90-py2.py3-none-win_amd64.whl (18.3MB)
Requirement already satisfied: numpy in c:\programdata\anaconda3\lib\site-packages (from xgboost) (1.15.4)
Requirement already satisfied: scipy in c:\programdata\anaconda3\lib\site-packages (from xgboost) (1.1.0)
Installing collected packages: xgboost
Successfully installed xgboost-0.90
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)
#print(xgb.score(X_test, y_test))
scores = cross_val_score(xgb, X_test, y_test, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))
print(mt.classification_report(y_test, xgb_predictions))
print(mt.confusion_matrix(y_test, xgb_predictions))
[0.9925187 0.99 0.9925 0.99 1. ]
Accuracy: 0.99 (+/- 0.01)
precision recall f1-score support
0 1.00 0.96 0.98 164
1 1.00 1.00 1.00 1836
micro avg 1.00 1.00 1.00 2000
macro avg 1.00 0.98 0.99 2000
weighted avg 1.00 1.00 1.00 2000
[[ 157 7]
[ 0 1836]]
# Take a backup of the loaded dataset
dataset_backup=data.copy()
dataset_backup.shape
(855969, 73)
# Drop columns with less than 35% non-null data
missingdata = data.columns[data.count() < len(data) * 0.35]
data.drop(missingdata, axis=1, inplace=True)
data.shape
data.columns
len(data.columns)
53
# Checking and dropping columns with just one unique value
unique = data.nunique()
unique = unique[unique.values == 1]
data.drop(labels = list(unique.index), axis =1, inplace=True)
# Checking for duplicates
data.duplicated().value_counts()
False 855969
dtype: int64
len(data.columns)
52
data['annual_inc'].describe()
count 8.559690e+05
mean 7.507119e+04
std 6.426447e+04
min 0.000000e+00
25% 4.500000e+04
50% 6.500000e+04
75% 9.000000e+04
max 9.500000e+06
Name: annual_inc, dtype: float64
# Check data types and outliers for the numeric columns
data['annual_inc']= data['annual_inc'].astype(float)
data['annual_inc'].describe()
data_fin=data.drop(data[data.annual_inc>1e+05].index)
# Bucket continuous variables into range categories
data_fin['int_rate']=data_fin['int_rate'].astype(str)
data_fin['int_rate']= data_fin['int_rate'].map(lambda x: x.rstrip('%'))
data_fin['int_rate']= data_fin['int_rate'].astype(float)
data_fin['int_rate'].describe()
buck = [0, 5, 10, 15, 20,25, 35]
lab = ['0-5', '5-10', '10-15', '15-20', '20-25','>25']
data_fin['int_rate_range'] = pd.cut(data_fin['int_rate'], buck, labels=lab)
data_fin['loan_amnt'].describe() #0-40k
buck = [0, 5000, 10000, 15000, 20000, 25000,40000]
lab = ['0-5000', '5000-10000', '10000-15000', '15000-20000', '20000-25000','25000 and above']
data_fin['loan_amnt_range'] = pd.cut(data_fin['loan_amnt'], buck, labels=lab)
data_fin['annual_inc'].describe() #range 1 to 1 mill
buck = [0, 25000, 50000, 75000, 100000,1000000]
lab = ['0-25000', '25000-50000', '50000-75000', '75000-100000', '100000 and above']
data_fin['annual_inc_range'] = pd.cut(data_fin['annual_inc'], buck, labels=lab)
Some basic plots to understand variable distributions: univariate and bivariate relationships.
sns.distplot(data_fin['loan_amnt'])
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
<matplotlib.axes._subplots.AxesSubplot at 0x14153ed3c8>
sns.distplot(data_fin['int_rate'])
<matplotlib.axes._subplots.AxesSubplot at 0x141f1ef1d0>
sns.distplot(data_fin['annual_inc'])
<matplotlib.axes._subplots.AxesSubplot at 0x141f2309e8>
sns.countplot(data_fin['default_ind'])
data_fin['default_ind'].value_counts()
0 662344
1 41227
Name: default_ind, dtype: int64
Looking at this bar plot, we can see that this is a class-imbalance problem.
data_fin['default_ind'].value_counts()/len(data_fin)
0 0.941403
1 0.058597
Name: default_ind, dtype: float64
sns.countplot(data_fin['default_ind'])
sns.countplot(data_fin['purpose'],hue=data_fin['default_ind'])
sns.countplot(data_fin['purpose'],hue=data_fin['loan_amnt_range'])
<matplotlib.axes._subplots.AxesSubplot at 0x141fada898>
Home ownership variable: the ANY and NONE levels do not signify anything, so those rows can be dropped.
data_fin.drop(data_fin[data_fin['home_ownership']== 'ANY'].index, inplace=True)
data_fin.drop(data_fin[data_fin['home_ownership']== 'NONE'].index, inplace=True)
sns.countplot(data_fin['home_ownership'],hue=data_fin['default_ind'])
<matplotlib.axes._subplots.AxesSubplot at 0x14153cd208>
#employment length
sns.countplot(data_fin['emp_length'],hue=data_fin['default_ind'])
sns.countplot(data_fin['emp_length'],hue=data_fin['loan_amnt_range'])
<matplotlib.axes._subplots.AxesSubplot at 0x14153d2ac8>
#geography
sns.countplot(data_fin['addr_state'],hue=data_fin['default_ind'])
sns.countplot(data_fin['addr_state'],hue=data_fin['loan_amnt_range'])
<matplotlib.axes._subplots.AxesSubplot at 0x1422b070f0>
# monthly trend
data_fin['issue_yr']=pd.DatetimeIndex(data_fin['issue_d']).year
data_fin['issue_mon']=pd.DatetimeIndex(data_fin['issue_d']).month
sns.countplot(data_fin['issue_mon'],hue=data_fin['default_ind'])
<matplotlib.axes._subplots.AxesSubplot at 0x1422f85278>
Checking correlation between a few key business variables
cor_loan=data_fin[['loan_amnt','annual_inc', 'default_ind', 'int_rate', 'dti',
'tot_cur_bal', 'funded_amnt']]
f, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(cor_loan.corr(),
xticklabels=cor_loan.columns.values,
yticklabels=cor_loan.columns.values,annot= True)
plt.show()
Variable selection: drop retrospective variables that would not have been known at the time of loan issuance (they would leak future information and skew the prediction), along with some other unnecessary features.
data_fin.drop(['id', 'member_id','zip_code' ,'emp_title' ,'funded_amnt', 'funded_amnt_inv', 'total_pymnt',
'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'out_prncp','out_prncp_inv',
'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d', 'collections_12_mths_ex_med',
'recoveries', 'collection_recovery_fee','title','revol_util'], axis=1, inplace=True)
# Dropping some more unnecessary features
data_fin.drop(['next_pymnt_d', 'tot_coll_amt' , 'tot_cur_bal' ,'total_rev_hi_lim'], axis=1, inplace=True)
len(data_fin.columns)
33
For categorical variables, one-hot encoding is used wherever a clear ordering of levels could not be established. For 'emp_length', an ordinal mapping is applied instead:
data_fin['emp_length'].value_counts()
emp_range= {'< 1 year':0.5, '1 year':1, '2 years': 2, '3 years':3,
'4 years':4, '5 years':5,'6 years':6,'7 years':7,
'8 years':8,'9 years':9, '10+ years':10}
data_fin['emplen'] = data_fin["emp_length"].map(emp_range)
data_fin['emplen'].isnull().sum()
data_fin['emplen'].value_counts()
10.0 222511
2.0 62940
0.5 56206
3.0 55727
1.0 45844
5.0 44690
4.0 41923
7.0 35908
8.0 34946
6.0 34367
9.0 27534
Name: emplen, dtype: int64
Categorical: 'term', 'grade', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'initial_list_status', 'application_type'
Continuous: 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs','open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim'
Count:
Missing value imputation
#Checking missing values in all the columns - how many null in each column
data_fin.isna()
data_fin.isnull().sum()
loan_amnt 0
term 0
int_rate 0
installment 0
grade 0
sub_grade 0
emp_length 40931
home_ownership 0
annual_inc 0
verification_status 0
issue_d 0
pymnt_plan 0
purpose 0
addr_state 0
dti 0
delinq_2yrs 0
earliest_cr_line 0
inq_last_6mths 0
mths_since_last_delinq 369541
open_acc 0
pub_rec 0
revol_bal 0
total_acc 0
initial_list_status 0
total_rec_late_fee 0
application_type 0
acc_now_delinq 0
default_ind 0
int_rate_range 0
loan_amnt_range 0
annual_inc_range 2
issue_yr 0
issue_mon 0
emplen 40931
dtype: int64
nullseries=pd.isnull(data_fin).sum()
nullseries[nullseries>0]
data_fin['emplen'] = data_fin['emplen'].replace(np.nan, 10)
data_fin.drop(['emp_length'],axis=1,inplace=True)
data_fin['mths_since_last_delinq'] = data_fin['mths_since_last_delinq'].fillna(data_fin['mths_since_last_delinq'].median()) #mean and median v similar
#very few missing values for all of them
#data_fin['dti'] = data_fin['dti'].fillna(data_fin['dti'].mean())
#data_fin['inq_last_6mths'] = data_fin['inq_last_6mths'].fillna(data_fin['inq_last_6mths'].mean())
#data_fin['collections_12_mths_ex_med'] = data_fin['collections_12_mths_ex_med'].fillna(data_fin['collections_12_mths_ex_med'].mean())
# A lot of NAs in integer-valued columns, so median replacement is used
# NONE
data_fin.isna()
data_fin.isnull().sum()
loan_amnt 0
term 0
int_rate 0
installment 0
grade 0
sub_grade 0
home_ownership 0
annual_inc 0
verification_status 0
issue_d 0
pymnt_plan 0
purpose 0
addr_state 0
dti 0
delinq_2yrs 0
earliest_cr_line 0
inq_last_6mths 0
mths_since_last_delinq 0
open_acc 0
pub_rec 0
revol_bal 0
total_acc 0
initial_list_status 0
total_rec_late_fee 0
application_type 0
acc_now_delinq 0
default_ind 0
int_rate_range 0
loan_amnt_range 0
annual_inc_range 2
issue_yr 0
issue_mon 0
emplen 0
dtype: int64
Feature engineering: encode the remaining categorical variables. Where an ordering can be established (verification_status, home_ownership, sub_grade), an ordinal map is used; purpose, addr_state, application_type and term have no natural order, so one-hot encoding is needed for them.
verification_map={'Source Verified':3, 'Verified':2, 'Not Verified':1}
data_fin['verification_status']=data_fin['verification_status'].map(verification_map)
ownership_map={'MORTGAGE':1, 'RENT':2, 'OWN':3, 'OTHER':4}
data_fin['home_ownership']=data_fin['home_ownership'].map(ownership_map)
subgrade_map={'A1':1,'A2':2, 'A3':3, 'A4':4, 'A5':5, 'B1':6, 'B2':7, 'B3':8, 'B4':9, 'B5':10,
'C1':11, 'C2':12, 'C3':13, 'C4':14, 'C5':15, 'D1':16, 'D2':17, 'D3':18,
'D4':19, 'D5':20, 'E1':21, 'E2':22, 'E3':23, 'E4':24, 'E5':25, 'F1':26,
'F2':27, 'F3':28, 'F4':29, 'F5':30, 'G1':31, 'G2':32, 'G3':33, 'G4':34, 'G5':35}
data_fin['sub_grade']=data_fin['sub_grade'].map(subgrade_map)
data_fin= data_fin[data_fin['purpose'] != 'educational']
data_fin= data_fin[data_fin['purpose'] !='wedding']
data_fin= data_fin[data_fin['purpose'] !='other']
enc1= pd.get_dummies(data_fin['purpose'])
data_fin=pd.concat((data_fin,enc1), axis=1)
data_fin.drop(['purpose'],axis=1,inplace=True)
enc2= pd.get_dummies(data_fin['addr_state'])
data_fin=pd.concat((data_fin,enc2), axis=1)
data_fin.drop(['addr_state'],axis=1,inplace=True)
enc4= pd.get_dummies(data_fin['application_type'])
data_fin=pd.concat((data_fin,enc4), axis=1)
data_fin.drop(['application_type'],axis=1,inplace=True)
enc5= pd.get_dummies(data_fin['term'])
data_fin=pd.concat((data_fin,enc5), axis=1)
data_fin.drop(['term'],axis=1,inplace=True)
data_fin.columns
Index(['loan_amnt', 'int_rate', 'installment', 'grade', 'sub_grade',
'home_ownership', 'annual_inc', 'verification_status', 'issue_d',
'pymnt_plan', 'dti', 'delinq_2yrs', 'earliest_cr_line',
'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec',
'revol_bal', 'total_acc', 'initial_list_status', 'total_rec_late_fee',
'acc_now_delinq', 'default_ind', 'int_rate_range', 'loan_amnt_range',
'annual_inc_range', 'issue_yr', 'issue_mon', 'emplen', 'car',
'credit_card', 'debt_consolidation', 'home_improvement', 'house',
'major_purchase', 'medical', 'moving', 'renewable_energy',
'small_business', 'vacation', 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT',
'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA',
'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH',
'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN',
'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'INDIVIDUAL', 'JOINT',
' 36 months', ' 60 months'],
dtype='object')
data_fin.drop(['earliest_cr_line','loan_amnt_range', 'annual_inc_range','int_rate_range','grade',
'issue_yr','issue_mon','pymnt_plan','initial_list_status', 'total_rec_late_fee'],axis=1, inplace=True)
data_fin.isna()
data_fin.isnull().sum()
loan_amnt 0
int_rate 0
installment 0
sub_grade 0
home_ownership 0
annual_inc 0
verification_status 0
issue_d 0
dti 0
delinq_2yrs 0
inq_last_6mths 0
mths_since_last_delinq 0
open_acc 0
pub_rec 0
revol_bal 0
total_acc 0
acc_now_delinq 0
default_ind 0
emplen 0
car 0
credit_card 0
debt_consolidation 0
home_improvement 0
house 0
major_purchase 0
medical 0
moving 0
renewable_energy 0
small_business 0
vacation 0
..
MS 0
MT 0
NC 0
ND 0
NE 0
NH 0
NJ 0
NM 0
NV 0
NY 0
OH 0
OK 0
OR 0
PA 0
RI 0
SC 0
SD 0
TN 0
TX 0
UT 0
VA 0
VT 0
WA 0
WI 0
WV 0
WY 0
INDIVIDUAL 0
JOINT 0
36 months 0
60 months 0
Length: 85, dtype: int64
len(data_fin.columns)
85
# emp_length
emp_length_count=data_fin['emplen'].value_counts()
print(emp_length_count)
bp=sns.barplot(emp_length_count.index,emp_length_count.values)
bp.set_xticklabels(bp.get_xticklabels(), rotation=45)
10.0 249986
2.0 59458
0.5 52904
3.0 52737
1.0 43262
5.0 42206
4.0 39597
7.0 34155
8.0 33180
6.0 32585
9.0 26226
Name: emplen, dtype: int64
[Text(0, 0, '0.5'),
Text(0, 0, '1.0'),
Text(0, 0, '2.0'),
Text(0, 0, '3.0'),
Text(0, 0, '4.0'),
Text(0, 0, '5.0'),
Text(0, 0, '6.0'),
Text(0, 0, '7.0'),
Text(0, 0, '8.0'),
Text(0, 0, '9.0'),
Text(0, 0, '10.0')]
# home_ownership
home_ownership_count=data_fin['home_ownership'].value_counts()
print(home_ownership_count)
sns.barplot(home_ownership_count.index,home_ownership_count.values)
1 313062
2 285331
3 67788
4 115
Name: home_ownership, dtype: int64
<matplotlib.axes._subplots.AxesSubplot at 0x1420608ac8>
# acc_now_delinq
acc_now_delinq_count=data_fin['acc_now_delinq'].value_counts()
print(acc_now_delinq_count)
sns.barplot(acc_now_delinq_count.index,acc_now_delinq_count.values)
0.0 663559
1.0 2570
2.0 141
3.0 20
4.0 4
5.0 1
14.0 1
Name: acc_now_delinq, dtype: int64
<matplotlib.axes._subplots.AxesSubplot at 0x1420665cc0>
# Since a single value dominates this feature, we drop it
data_fin.drop(['acc_now_delinq'],axis=1,inplace=True)
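acc_now_delinq was spotted by eye; the same check can be automated for any near-constant column. A small sketch (the 99% threshold is an assumption):
# Find columns where a single value accounts for more than 99% of rows
dominant_share = data_fin.apply(lambda col: col.value_counts(normalize=True, dropna=False).iloc[0])
near_constant = dominant_share[dominant_share > 0.99].index.tolist()
print(near_constant)
# data_fin.drop(columns=near_constant, inplace=True)  # uncomment to drop them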
# annual income
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['annual_inc']);
plt.subplot(122)
data_fin['annual_inc'].plot.box(figsize=(16,5))
plt.show()
# dti
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['dti']);
plt.subplot(122)
data_fin['dti'].plot.box(figsize=(16,5))
plt.show()
# Outliers Treatment
#Find mean of the column "dti"
dti_mean = int(data_fin['dti'].mean())
#Find 75th percentile of the column "dti"
IQR_dti_P75 = data_fin['dti'].quantile(q=0.75)
#Find 25th percentile of the column "dti"
IQR_dti_P25 = data_fin['dti'].quantile(q=0.25)
#Find IQR of the column "dti"
IQR_dti = IQR_dti_P75-IQR_dti_P25
#Fix boundaries to detect outliers in column "dti"
IQR_LL = int(IQR_dti_P25 - 1.5*IQR_dti)
IQR_UL = int(IQR_dti_P75 + 1.5*IQR_dti)
#Treat upper-end outliers with the mean
data_fin.loc[data_fin['dti']>IQR_UL , 'dti'] = dti_mean
#Treat lower-end outliers with the mean
data_fin.loc[data_fin['dti']<IQR_LL , 'dti'] = dti_mean
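The same IQR capping is repeated below for delinq_2yrs, open_acc, pub_rec, revol_bal and total_acc; a small helper, sketched here with a hypothetical name, avoids the copy-paste:
def cap_outliers_iqr(df, col):
    # Replace values outside the 1.5*IQR fences of `col` with the column mean (hypothetical helper)
    p25, p75 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = p75 - p25
    lower, upper = p25 - 1.5 * iqr, p75 + 1.5 * iqr
    col_mean = int(df[col].mean())
    df.loc[(df[col] < lower) | (df[col] > upper), col] = col_mean
    return df
# Example usage, mirroring the per-column cells below:
# for c in ['delinq_2yrs', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc']:
#     data_fin = cap_outliers_iqr(data_fin, c)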
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['delinq_2yrs']);
plt.subplot(122)
data_fin['delinq_2yrs'].plot.box(figsize=(16,5))
plt.show()
# Outliers Treatment
#Find mean of the column "delinq_2yrs"
delinq_2yrs_mean = int(data_fin['delinq_2yrs'].mean())
#Find 75th percentile of the column "delinq_2yrs"
IQR_delinq_2yrs_P75 = data_fin['delinq_2yrs'].quantile(q=0.75)
#Find 25th percentile of the column "delinq_2yrs"
IQR_delinq_2yrs_P25 = data_fin['delinq_2yrs'].quantile(q=0.25)
#Find IQR of the column "delinq_2yrs"
IQR_delinq_2yrs = IQR_delinq_2yrs_P75-IQR_delinq_2yrs_P25
#Fix boundaries to detect outliers in column "delinq_2yrs"
IQR_LL = int(IQR_delinq_2yrs_P25 - 1.5*IQR_delinq_2yrs)
IQR_UL = int(IQR_delinq_2yrs_P75 + 1.5*IQR_delinq_2yrs)
#Treat upper-end outliers with the mean (delinq_2yrs_mean, not dti_mean)
data_fin.loc[data_fin['delinq_2yrs']>IQR_UL , 'delinq_2yrs'] = delinq_2yrs_mean
#Treat lower-end outliers with the mean
data_fin.loc[data_fin['delinq_2yrs']<IQR_LL , 'delinq_2yrs'] = delinq_2yrs_mean
# open_acc
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['open_acc']);
plt.subplot(122)
data_fin['open_acc'].plot.box(figsize=(16,5))
plt.show()
# Outliers Treatment
#Find mean of the column "open_acc"
open_acc_mean = int(data_fin['open_acc'].mean())
#Find 75th percentile of the column "open_acc"
IQR_open_acc_P75 = data_fin['open_acc'].quantile(q=0.75)
#Find 25th percentile of the column "open_acc"
IQR_open_acc_P25 = data_fin['open_acc'].quantile(q=0.25)
#Find IQR of the column "open_acc"
IQR_open_acc = IQR_open_acc_P75-IQR_open_acc_P25
#Fix boundaries to detect outliers in column "open_acc"
IQR_LL = int(IQR_open_acc_P25 - 1.5*IQR_open_acc)
IQR_UL = int(IQR_open_acc_P75 + 1.5*IQR_open_acc)
#Treat upper-end outliers with the mean
data_fin.loc[data_fin['open_acc']>IQR_UL , 'open_acc'] = open_acc_mean
#Treat lower-end outliers with the mean
data_fin.loc[data_fin['open_acc']<IQR_LL , 'open_acc'] = open_acc_mean
# pub_rec
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['pub_rec']);
plt.subplot(122)
data_fin['pub_rec'].plot.box(figsize=(16,5))
plt.show()
# Outliers Treatment
#Find mean of the column "pub_rec"
pub_rec_mean = int(data_fin['pub_rec'].mean())
#Find 75th percentile of the column "pub_rec"
IQR_pub_rec_P75 = data_fin['pub_rec'].quantile(q=0.75)
#Find 25th percentile of the column "pub_rec"
IQR_pub_rec_P25 = data_fin['pub_rec'].quantile(q=0.25)
#Find IQR of the column "pub_rec"
IQR_pub_rec = IQR_pub_rec_P75-IQR_pub_rec_P25
#Fix boundaries to detect outliers in column "pub_rec"
IQR_LL = int(IQR_pub_rec_P25 - 1.5*IQR_pub_rec)
IQR_UL = int(IQR_pub_rec_P75 + 1.5*IQR_pub_rec)
#Treat upper-end outliers with the mean
data_fin.loc[data_fin['pub_rec']>IQR_UL , 'pub_rec'] = pub_rec_mean
#Treat lower-end outliers with the mean
data_fin.loc[data_fin['pub_rec']<IQR_LL , 'pub_rec'] = pub_rec_mean
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['revol_bal']);
plt.subplot(122)
data_fin['revol_bal'].plot.box(figsize=(16,5))
plt.show()
# Outliers Treatment
#Find mean of the column "revol_bal"
revol_bal_mean = int(data_fin['revol_bal'].mean())
#Find 75th percentile of the column "revol_bal"
IQR_revol_bal_P75 = data_fin['revol_bal'].quantile(q=0.75)
#Find 25th percentile of the column "revol_bal"
IQR_revol_bal_P25 = data_fin['revol_bal'].quantile(q=0.25)
#Find IQR of the column "revol_bal"
IQR_revol_bal = IQR_revol_bal_P75-IQR_revol_bal_P25
#Fix boundaries to detect outliers in column "revol_bal"
IQR_LL = int(IQR_revol_bal_P25 - 1.5*IQR_revol_bal)
IQR_UL = int(IQR_revol_bal_P75 + 1.5*IQR_revol_bal)
#Treat upper-end outliers with the mean
data_fin.loc[data_fin['revol_bal']>IQR_UL , 'revol_bal'] = revol_bal_mean
#Treat lower-end outliers with the mean
data_fin.loc[data_fin['revol_bal']<IQR_LL , 'revol_bal'] = revol_bal_mean
# total_acc
plt.figure(1)
plt.subplot(121)
sns.distplot(data_fin['total_acc']);
plt.subplot(122)
data_fin['total_acc'].plot.box(figsize=(16,5))
plt.show()
# Outliers Treatment
#Find mean of the column "total_acc"
total_acc_mean = int(data_fin['total_acc'].mean())
#Find 75th percentile of the column "total_acc"
IQR_total_acc_P75 = data_fin['total_acc'].quantile(q=0.75)
#Find 25th percentile of the column "total_acc"
IQR_total_acc_P25 = data_fin['total_acc'].quantile(q=0.25)
#Find IQR of the column "total_acc"
IQR_total_acc = IQR_total_acc_P75-IQR_total_acc_P25
#Fix boundaries to detect outliers in column "total_acc"
IQR_LL = int(IQR_total_acc_P25 - 1.5*IQR_total_acc)
IQR_UL = int(IQR_total_acc_P75 + 1.5*IQR_total_acc)
#Treat upper-end outliers with the mean
data_fin.loc[data_fin['total_acc']>IQR_UL , 'total_acc'] = total_acc_mean
#Treat lower-end outliers with the mean
data_fin.loc[data_fin['total_acc']<IQR_LL , 'total_acc'] = total_acc_mean
data_fin.columns
Index(['loan_amnt', 'int_rate', 'installment', 'sub_grade', 'home_ownership',
'annual_inc', 'verification_status', 'issue_d', 'dti', 'delinq_2yrs',
'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec',
'revol_bal', 'total_acc', 'default_ind', 'emplen', 'car', 'credit_card',
'debt_consolidation', 'home_improvement', 'house', 'major_purchase',
'medical', 'moving', 'renewable_energy', 'small_business', 'vacation',
'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA',
'WI', 'WV', 'WY', 'INDIVIDUAL', 'JOINT', ' 36 months', ' 60 months'],
dtype='object')
# Select numeric and count variables for correlation analysis
data_numeric=data_fin.loc[:,['loan_amnt','int_rate','sub_grade', 'home_ownership','verification_status','inq_last_6mths',
'annual_inc', 'dti', 'delinq_2yrs','open_acc','mths_since_last_delinq',
'total_acc', 'emplen', 'revol_bal',
]]
#Calculate correlation among the numeric variables
corr_matrix = data_numeric.corr()
#plot correlation matrix
plt.figure(figsize=(20,12))
sns.heatmap(corr_matrix,
cmap='coolwarm',
annot=True);
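Beyond eyeballing the heatmap, the strongly correlated pairs can be listed directly; a short sketch (the 0.8 cutoff is an assumption):
# List feature pairs whose absolute correlation exceeds the chosen threshold
upper_tri = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
high_corr_pairs = upper_tri.stack().sort_values(ascending=False)
print(high_corr_pairs[high_corr_pairs > 0.8])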
Sampling procedures:
1. SMOTE: can increase recall at the cost of precision.
2. Undersampling: if there is little data overall, the minority class leaves you with very little to train on.
3. ADASYN: focuses on samples that are difficult to classify, using a nearest-neighbour criterion (a sketch follows the resampling cell below).
data_fin['issue_d'] = pd.to_datetime(data_fin['issue_d'])
data_fin = data_fin.set_index(data_fin['issue_d'])
data_fin = data_fin.sort_index()
train = data_fin['June 2007':'May 2015']
test = data_fin['June 2015':'Dec 2015']
print('Train Dataset:',train.shape)
print('Test Dataset:',test.shape)
Train Dataset: (469032, 84)
Test Dataset: (197264, 84)
train =train.drop('issue_d' , axis=1)
test =test.drop('issue_d', axis=1)
from collections import Counter
#from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
X_train = train.drop('default_ind', axis=1)
y_train = train['default_ind']
X_test = test.drop('default_ind', axis=1)
y_test = test['default_ind']
#OVERSAMPLING-SMOTE
sm= SMOTE(random_state=42)
X_sm, y_sm = sm.fit_sample(X_train, y_train)
# #OVERSAMPLING-RANDOM
# ros= RandomOverSampler(random_state=555)
# X_over, y_over= ros.fit_sample(X_train, y_train)
# #Undersampling
# rus = RandomUnderSampler(return_indices=True, random_state=555)
# X_resampled, y_resampled, idx_resampled= rus.fit_sample(X_train, y_train)
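ADASYN is listed among the options above but only SMOTE is applied here; a minimal sketch of the ADASYN variant (not run in this notebook):
# Sketch: ADASYN oversampling as an alternative to the SMOTE call above
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_sample(X_train, y_train)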
data_fin.info()
data_fin.describe()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 666296 entries, 2007-06-01 to 2015-12-01
Data columns (total 84 columns):
loan_amnt 666296 non-null float64
int_rate 666296 non-null float64
installment 666296 non-null float64
sub_grade 666296 non-null int64
home_ownership 666296 non-null int64
annual_inc 666296 non-null float64
verification_status 666296 non-null int64
issue_d 666296 non-null datetime64[ns]
dti 666296 non-null float64
delinq_2yrs 666296 non-null float64
inq_last_6mths 666296 non-null float64
mths_since_last_delinq 666296 non-null float64
open_acc 666296 non-null float64
pub_rec 666296 non-null float64
revol_bal 666296 non-null float64
total_acc 666296 non-null float64
default_ind 666296 non-null int64
emplen 666296 non-null float64
car 666296 non-null uint8
credit_card 666296 non-null uint8
debt_consolidation 666296 non-null uint8
home_improvement 666296 non-null uint8
house 666296 non-null uint8
major_purchase 666296 non-null uint8
medical 666296 non-null uint8
moving 666296 non-null uint8
renewable_energy 666296 non-null uint8
small_business 666296 non-null uint8
vacation 666296 non-null uint8
AK 666296 non-null uint8
AL 666296 non-null uint8
AR 666296 non-null uint8
AZ 666296 non-null uint8
CA 666296 non-null uint8
CO 666296 non-null uint8
CT 666296 non-null uint8
DC 666296 non-null uint8
DE 666296 non-null uint8
FL 666296 non-null uint8
GA 666296 non-null uint8
HI 666296 non-null uint8
IA 666296 non-null uint8
ID 666296 non-null uint8
IL 666296 non-null uint8
IN 666296 non-null uint8
KS 666296 non-null uint8
KY 666296 non-null uint8
LA 666296 non-null uint8
MA 666296 non-null uint8
MD 666296 non-null uint8
ME 666296 non-null uint8
MI 666296 non-null uint8
MN 666296 non-null uint8
MO 666296 non-null uint8
MS 666296 non-null uint8
MT 666296 non-null uint8
NC 666296 non-null uint8
ND 666296 non-null uint8
NE 666296 non-null uint8
NH 666296 non-null uint8
NJ 666296 non-null uint8
NM 666296 non-null uint8
NV 666296 non-null uint8
NY 666296 non-null uint8
OH 666296 non-null uint8
OK 666296 non-null uint8
OR 666296 non-null uint8
PA 666296 non-null uint8
RI 666296 non-null uint8
SC 666296 non-null uint8
SD 666296 non-null uint8
TN 666296 non-null uint8
TX 666296 non-null uint8
UT 666296 non-null uint8
VA 666296 non-null uint8
VT 666296 non-null uint8
WA 666296 non-null uint8
WI 666296 non-null uint8
WV 666296 non-null uint8
WY 666296 non-null uint8
INDIVIDUAL 666296 non-null uint8
JOINT 666296 non-null uint8
36 months 666296 non-null uint8
60 months 666296 non-null uint8
dtypes: datetime64[ns](1), float64(13), int64(4), uint8(66)
memory usage: 158.5 MB
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_std= scaler.fit_transform(X_sm)
X_std_test= scaler.transform(X_test)   # transform only, to avoid refitting the scaler on test data
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:625: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
return self.partial_fit(X, y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:462: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
return self.fit(X, **fit_params).transform(X)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score,classification_report
#1. Logistic Regression
#SMOTE
lr_sm = LogisticRegression()
lr_sm.fit(X_sm, y_sm)
lr_sm.score(X_sm, y_sm)
y_pred_sm= lr_sm.predict(X_test)
accuracy_score(y_test, y_pred_sm)
roc_auc_score(y_test, y_pred_sm)
classification_report(y_test, y_pred_sm)
f1_score(y_test, y_pred_sm)
lr_sm.coef_.shape
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
(1, 82)
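The same four metric calls are repeated for every model below, and in a notebook cell only the last expression is displayed, so most of these scores are silently discarded; a small helper (the evaluate_model name is hypothetical) makes the comparison explicit:
def evaluate_model(name, model, X_eval, y_eval):
    # Print the main evaluation metrics for a fitted classifier (hypothetical helper)
    y_pred = model.predict(X_eval)
    print(name)
    print('accuracy:', accuracy_score(y_eval, y_pred))
    print('ROC AUC :', roc_auc_score(y_eval, y_pred))
    print('F1      :', f1_score(y_eval, y_pred))
    print(classification_report(y_eval, y_pred))
# Example usage with the logistic regression fitted above:
# evaluate_model('Logistic Regression (SMOTE)', lr_sm, X_test, y_test)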
#2 Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='gini')
dt.fit(X_sm, y_sm)
dt.score(X_sm, y_sm)
y_pred_sm= dt.predict(X_test)
accuracy_score(y_test, y_pred_sm)
roc_auc_score(y_test, y_pred_sm)
classification_report(y_test, y_pred_sm)
f1_score(y_test, y_pred_sm)
0.004060487258471017
#3 Random Forests
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=80, max_features= 'log2')
rf.fit(X_sm, y_sm)
#rf.score(X_sm, y_sm)
y_rf= rf.predict(X_test)
accuracy_score(y_test, y_rf)
roc_auc_score(y_test, y_rf)
classification_report(y_test, y_rf)
f1_score(y_test, y_rf)
0.007662835249042146
#tuning RF
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [20, 80],
    'max_features': [None, 'log2', 'sqrt']
}
CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
CV_rf.fit(X_sm, y_sm)
print(CV_rf.best_params_)
#variable importance
importances = rf.feature_importances_
std = np.std([est.feature_importances_ for est in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()
# Gradient Boosted Trees
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=80, learning_rate=1,
random_state=42)
gb.fit(X_sm, y_sm)
#rf.score(X_sm, y_sm)
y_gb= gb.predict(X_test)
#accuracy_score(y_test, y_rf)
roc_auc_score(y_test, y_gb)
classification_report(y_test, y_gb)
f1_score(y_test, y_gb)
# Support Vector Machines
from sklearn import svm
from sklearn.svm import SVC
model_svm = svm.SVC(random_state=42, tol=100,class_weight='balanced')
model_svm.fit(X_std, y_sm)
y_svm= model_svm.predict(X_std_test)
accuracy_score(y_test, y_svm)
roc_auc_score(y_test, y_svm)
classification_report(y_test, y_svm)
f1_score(y_test, y_svm)