Learn data science and machine learning by building real-world projects on Jovian
In [2]:
# Filtering out the warnings

import warnings

warnings.filterwarnings('ignore')
In [3]:
# Importing the required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import jovian
In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Credit EDA Case Study

Task 1: Reading the Data

  • Subtask 1.1: Read the Application Data.

In [5]:
# Reading the application data using read_csv function
application= pd.read_csv("application_data.csv")
In [6]:
#Taking a peak at the data
application.head()
Out[6]:
  • Subtask 1.2: Inspect the Dataframe

Inspect the dataframe for dimensions, null-values, and summary of different numeric columns.

In [7]:
# Getting the dimensions of the data
application.shape
Out[7]:
(307511, 122)
In [8]:
#Checking the nnull values and data types of the columns of the application file.
application.info(verbose=True,null_counts=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 122 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 OWN_CAR_AGE 104582 non-null float64 22 FLAG_MOBIL 307511 non-null int64 23 FLAG_EMP_PHONE 307511 non-null int64 24 FLAG_WORK_PHONE 307511 non-null int64 25 FLAG_CONT_MOBILE 307511 non-null int64 26 FLAG_PHONE 307511 non-null int64 27 FLAG_EMAIL 307511 non-null int64 28 OCCUPATION_TYPE 211120 non-null object 29 CNT_FAM_MEMBERS 307509 non-null float64 30 REGION_RATING_CLIENT 307511 non-null int64 31 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 32 WEEKDAY_APPR_PROCESS_START 307511 non-null object 33 HOUR_APPR_PROCESS_START 307511 non-null int64 34 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 35 REG_REGION_NOT_WORK_REGION 307511 non-null int64 36 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 37 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 38 REG_CITY_NOT_WORK_CITY 307511 non-null int64 39 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 40 ORGANIZATION_TYPE 307511 non-null object 41 EXT_SOURCE_1 134133 non-null float64 42 EXT_SOURCE_2 306851 non-null float64 43 EXT_SOURCE_3 246546 non-null float64 44 APARTMENTS_AVG 151450 non-null float64 45 BASEMENTAREA_AVG 127568 non-null float64 46 YEARS_BEGINEXPLUATATION_AVG 157504 non-null float64 47 YEARS_BUILD_AVG 103023 non-null float64 48 COMMONAREA_AVG 92646 non-null float64 49 ELEVATORS_AVG 143620 non-null float64 50 ENTRANCES_AVG 152683 non-null float64 51 FLOORSMAX_AVG 154491 non-null float64 52 FLOORSMIN_AVG 98869 non-null float64 53 LANDAREA_AVG 124921 non-null float64 54 LIVINGAPARTMENTS_AVG 97312 non-null float64 55 LIVINGAREA_AVG 153161 non-null float64 56 NONLIVINGAPARTMENTS_AVG 93997 non-null float64 57 NONLIVINGAREA_AVG 137829 non-null float64 58 APARTMENTS_MODE 151450 non-null float64 59 BASEMENTAREA_MODE 127568 non-null float64 60 YEARS_BEGINEXPLUATATION_MODE 157504 non-null float64 61 YEARS_BUILD_MODE 103023 non-null float64 62 COMMONAREA_MODE 92646 non-null float64 63 ELEVATORS_MODE 143620 non-null float64 64 ENTRANCES_MODE 152683 non-null float64 65 FLOORSMAX_MODE 154491 non-null float64 66 FLOORSMIN_MODE 98869 non-null float64 67 LANDAREA_MODE 124921 non-null float64 68 LIVINGAPARTMENTS_MODE 97312 non-null float64 69 LIVINGAREA_MODE 153161 non-null float64 70 NONLIVINGAPARTMENTS_MODE 93997 non-null float64 71 NONLIVINGAREA_MODE 137829 non-null float64 72 APARTMENTS_MEDI 151450 non-null float64 73 BASEMENTAREA_MEDI 127568 non-null float64 74 YEARS_BEGINEXPLUATATION_MEDI 157504 non-null float64 75 YEARS_BUILD_MEDI 103023 non-null float64 76 COMMONAREA_MEDI 92646 non-null float64 77 ELEVATORS_MEDI 143620 non-null float64 78 ENTRANCES_MEDI 152683 non-null float64 79 FLOORSMAX_MEDI 154491 non-null float64 80 FLOORSMIN_MEDI 98869 non-null float64 81 LANDAREA_MEDI 124921 non-null float64 82 LIVINGAPARTMENTS_MEDI 97312 non-null float64 83 LIVINGAREA_MEDI 153161 non-null float64 84 NONLIVINGAPARTMENTS_MEDI 93997 non-null float64 85 NONLIVINGAREA_MEDI 137829 non-null float64 86 FONDKAPREMONT_MODE 97216 non-null object 87 HOUSETYPE_MODE 153214 non-null object 88 TOTALAREA_MODE 159080 non-null float64 89 WALLSMATERIAL_MODE 151170 non-null object 90 EMERGENCYSTATE_MODE 161756 non-null object 91 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 92 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 93 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 94 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 95 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 96 FLAG_DOCUMENT_2 307511 non-null int64 97 FLAG_DOCUMENT_3 307511 non-null int64 98 FLAG_DOCUMENT_4 307511 non-null int64 99 FLAG_DOCUMENT_5 307511 non-null int64 100 FLAG_DOCUMENT_6 307511 non-null int64 101 FLAG_DOCUMENT_7 307511 non-null int64 102 FLAG_DOCUMENT_8 307511 non-null int64 103 FLAG_DOCUMENT_9 307511 non-null int64 104 FLAG_DOCUMENT_10 307511 non-null int64 105 FLAG_DOCUMENT_11 307511 non-null int64 106 FLAG_DOCUMENT_12 307511 non-null int64 107 FLAG_DOCUMENT_13 307511 non-null int64 108 FLAG_DOCUMENT_14 307511 non-null int64 109 FLAG_DOCUMENT_15 307511 non-null int64 110 FLAG_DOCUMENT_16 307511 non-null int64 111 FLAG_DOCUMENT_17 307511 non-null int64 112 FLAG_DOCUMENT_18 307511 non-null int64 113 FLAG_DOCUMENT_19 307511 non-null int64 114 FLAG_DOCUMENT_20 307511 non-null int64 115 FLAG_DOCUMENT_21 307511 non-null int64 116 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 117 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 118 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 119 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 120 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 121 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 dtypes: float64(65), int64(41), object(16) memory usage: 286.2+ MB
In [9]:
#Using describe() on application file.
application.describe()
Out[9]:

Task 2: Data Quality Checks

  • Subtask 2.1: Check for missing values

In [10]:
# To count the percentage of missing values in each column and sorting them in descending order.
application.isna().sum().sort_values(ascending=False)*100/len(application)
Out[10]:
COMMONAREA_MEDI                 69.872297
COMMONAREA_AVG                  69.872297
COMMONAREA_MODE                 69.872297
NONLIVINGAPARTMENTS_MODE        69.432963
NONLIVINGAPARTMENTS_MEDI        69.432963
NONLIVINGAPARTMENTS_AVG         69.432963
FONDKAPREMONT_MODE              68.386172
LIVINGAPARTMENTS_MEDI           68.354953
LIVINGAPARTMENTS_MODE           68.354953
LIVINGAPARTMENTS_AVG            68.354953
FLOORSMIN_MEDI                  67.848630
FLOORSMIN_MODE                  67.848630
FLOORSMIN_AVG                   67.848630
YEARS_BUILD_MEDI                66.497784
YEARS_BUILD_AVG                 66.497784
YEARS_BUILD_MODE                66.497784
OWN_CAR_AGE                     65.990810
LANDAREA_MODE                   59.376738
LANDAREA_AVG                    59.376738
LANDAREA_MEDI                   59.376738
BASEMENTAREA_MEDI               58.515956
BASEMENTAREA_AVG                58.515956
BASEMENTAREA_MODE               58.515956
EXT_SOURCE_1                    56.381073
NONLIVINGAREA_MEDI              55.179164
NONLIVINGAREA_AVG               55.179164
NONLIVINGAREA_MODE              55.179164
ELEVATORS_MODE                  53.295980
ELEVATORS_AVG                   53.295980
ELEVATORS_MEDI                  53.295980
WALLSMATERIAL_MODE              50.840783
APARTMENTS_MODE                 50.749729
APARTMENTS_AVG                  50.749729
APARTMENTS_MEDI                 50.749729
ENTRANCES_MEDI                  50.348768
ENTRANCES_MODE                  50.348768
ENTRANCES_AVG                   50.348768
LIVINGAREA_MEDI                 50.193326
LIVINGAREA_MODE                 50.193326
LIVINGAREA_AVG                  50.193326
HOUSETYPE_MODE                  50.176091
FLOORSMAX_MODE                  49.760822
FLOORSMAX_MEDI                  49.760822
FLOORSMAX_AVG                   49.760822
YEARS_BEGINEXPLUATATION_MEDI    48.781019
YEARS_BEGINEXPLUATATION_AVG     48.781019
YEARS_BEGINEXPLUATATION_MODE    48.781019
TOTALAREA_MODE                  48.268517
EMERGENCYSTATE_MODE             47.398304
OCCUPATION_TYPE                 31.345545
EXT_SOURCE_3                    19.825307
AMT_REQ_CREDIT_BUREAU_QRT       13.501631
AMT_REQ_CREDIT_BUREAU_YEAR      13.501631
AMT_REQ_CREDIT_BUREAU_WEEK      13.501631
AMT_REQ_CREDIT_BUREAU_MON       13.501631
AMT_REQ_CREDIT_BUREAU_DAY       13.501631
AMT_REQ_CREDIT_BUREAU_HOUR      13.501631
NAME_TYPE_SUITE                  0.420148
OBS_30_CNT_SOCIAL_CIRCLE         0.332021
OBS_60_CNT_SOCIAL_CIRCLE         0.332021
DEF_60_CNT_SOCIAL_CIRCLE         0.332021
DEF_30_CNT_SOCIAL_CIRCLE         0.332021
EXT_SOURCE_2                     0.214626
AMT_GOODS_PRICE                  0.090403
AMT_ANNUITY                      0.003902
CNT_FAM_MEMBERS                  0.000650
DAYS_LAST_PHONE_CHANGE           0.000325
AMT_CREDIT                       0.000000
FLAG_OWN_CAR                     0.000000
FLAG_EMAIL                       0.000000
TARGET                           0.000000
FLAG_PHONE                       0.000000
FLAG_CONT_MOBILE                 0.000000
FLAG_WORK_PHONE                  0.000000
FLAG_EMP_PHONE                   0.000000
FLAG_MOBIL                       0.000000
NAME_CONTRACT_TYPE               0.000000
CODE_GENDER                      0.000000
FLAG_OWN_REALTY                  0.000000
AMT_INCOME_TOTAL                 0.000000
DAYS_ID_PUBLISH                  0.000000
DAYS_REGISTRATION                0.000000
DAYS_EMPLOYED                    0.000000
DAYS_BIRTH                       0.000000
REGION_POPULATION_RELATIVE       0.000000
REGION_RATING_CLIENT             0.000000
NAME_FAMILY_STATUS               0.000000
NAME_EDUCATION_TYPE              0.000000
NAME_INCOME_TYPE                 0.000000
CNT_CHILDREN                     0.000000
NAME_HOUSING_TYPE                0.000000
REG_REGION_NOT_LIVE_REGION       0.000000
REGION_RATING_CLIENT_W_CITY      0.000000
WEEKDAY_APPR_PROCESS_START       0.000000
FLAG_DOCUMENT_2                  0.000000
FLAG_DOCUMENT_3                  0.000000
FLAG_DOCUMENT_4                  0.000000
FLAG_DOCUMENT_5                  0.000000
FLAG_DOCUMENT_6                  0.000000
FLAG_DOCUMENT_7                  0.000000
FLAG_DOCUMENT_8                  0.000000
FLAG_DOCUMENT_9                  0.000000
FLAG_DOCUMENT_10                 0.000000
FLAG_DOCUMENT_11                 0.000000
FLAG_DOCUMENT_12                 0.000000
FLAG_DOCUMENT_13                 0.000000
FLAG_DOCUMENT_14                 0.000000
FLAG_DOCUMENT_15                 0.000000
FLAG_DOCUMENT_16                 0.000000
FLAG_DOCUMENT_17                 0.000000
FLAG_DOCUMENT_18                 0.000000
FLAG_DOCUMENT_19                 0.000000
FLAG_DOCUMENT_20                 0.000000
FLAG_DOCUMENT_21                 0.000000
ORGANIZATION_TYPE                0.000000
LIVE_CITY_NOT_WORK_CITY          0.000000
REG_CITY_NOT_WORK_CITY           0.000000
REG_CITY_NOT_LIVE_CITY           0.000000
LIVE_REGION_NOT_WORK_REGION      0.000000
REG_REGION_NOT_WORK_REGION       0.000000
HOUR_APPR_PROCESS_START          0.000000
SK_ID_CURR                       0.000000
dtype: float64

One can clearly see that there are so many columns with Null values more than 50% in it.

In order to deal with these columns, we opt to delete them/ not work with them. Hence we decide to drop columns which have null values more than 47%

In [11]:
# Dropping the columns which have nul values more than 47%
for column in application.columns:
    if application[column].isna().sum()*100/len(application) >47 :
        application.drop(column,axis=1,inplace=True)

  • Subtask 2.2: Imputing the missing the data

In [12]:
# Checking for null values again.
application.isna().sum().sort_values(ascending=False)*100/len(application)
Out[12]:
OCCUPATION_TYPE                31.345545
EXT_SOURCE_3                   19.825307
AMT_REQ_CREDIT_BUREAU_YEAR     13.501631
AMT_REQ_CREDIT_BUREAU_MON      13.501631
AMT_REQ_CREDIT_BUREAU_WEEK     13.501631
AMT_REQ_CREDIT_BUREAU_DAY      13.501631
AMT_REQ_CREDIT_BUREAU_HOUR     13.501631
AMT_REQ_CREDIT_BUREAU_QRT      13.501631
NAME_TYPE_SUITE                 0.420148
OBS_30_CNT_SOCIAL_CIRCLE        0.332021
DEF_30_CNT_SOCIAL_CIRCLE        0.332021
OBS_60_CNT_SOCIAL_CIRCLE        0.332021
DEF_60_CNT_SOCIAL_CIRCLE        0.332021
EXT_SOURCE_2                    0.214626
AMT_GOODS_PRICE                 0.090403
AMT_ANNUITY                     0.003902
CNT_FAM_MEMBERS                 0.000650
DAYS_LAST_PHONE_CHANGE          0.000325
NAME_INCOME_TYPE                0.000000
FLAG_OWN_REALTY                 0.000000
TARGET                          0.000000
FLAG_EMAIL                      0.000000
FLAG_PHONE                      0.000000
FLAG_CONT_MOBILE                0.000000
NAME_CONTRACT_TYPE              0.000000
FLAG_WORK_PHONE                 0.000000
FLAG_EMP_PHONE                  0.000000
FLAG_MOBIL                      0.000000
CODE_GENDER                     0.000000
FLAG_OWN_CAR                    0.000000
CNT_CHILDREN                    0.000000
NAME_EDUCATION_TYPE             0.000000
AMT_INCOME_TOTAL                0.000000
AMT_CREDIT                      0.000000
DAYS_ID_PUBLISH                 0.000000
DAYS_REGISTRATION               0.000000
DAYS_EMPLOYED                   0.000000
DAYS_BIRTH                      0.000000
REGION_RATING_CLIENT            0.000000
NAME_HOUSING_TYPE               0.000000
NAME_FAMILY_STATUS              0.000000
REGION_POPULATION_RELATIVE      0.000000
REG_CITY_NOT_LIVE_CITY          0.000000
REGION_RATING_CLIENT_W_CITY     0.000000
WEEKDAY_APPR_PROCESS_START      0.000000
FLAG_DOCUMENT_21                0.000000
FLAG_DOCUMENT_20                0.000000
FLAG_DOCUMENT_19                0.000000
FLAG_DOCUMENT_18                0.000000
FLAG_DOCUMENT_17                0.000000
FLAG_DOCUMENT_16                0.000000
FLAG_DOCUMENT_15                0.000000
FLAG_DOCUMENT_14                0.000000
FLAG_DOCUMENT_13                0.000000
FLAG_DOCUMENT_12                0.000000
FLAG_DOCUMENT_11                0.000000
FLAG_DOCUMENT_10                0.000000
FLAG_DOCUMENT_9                 0.000000
FLAG_DOCUMENT_8                 0.000000
FLAG_DOCUMENT_7                 0.000000
FLAG_DOCUMENT_6                 0.000000
FLAG_DOCUMENT_5                 0.000000
FLAG_DOCUMENT_4                 0.000000
FLAG_DOCUMENT_3                 0.000000
FLAG_DOCUMENT_2                 0.000000
ORGANIZATION_TYPE               0.000000
LIVE_CITY_NOT_WORK_CITY         0.000000
REG_CITY_NOT_WORK_CITY          0.000000
LIVE_REGION_NOT_WORK_REGION     0.000000
REG_REGION_NOT_WORK_REGION      0.000000
REG_REGION_NOT_LIVE_REGION      0.000000
HOUR_APPR_PROCESS_START         0.000000
SK_ID_CURR                      0.000000
dtype: float64

We can see that there are still few columns which have null values. We'll deal it by imputing data, as by dropping these columns we might end up losing the data or information.

Let's find out how and which columns we can impute.

In [13]:
# Finding columns with less than 13% and more than 0% null values.
for column in application.columns:
    if application[column].isna().sum()*100/len(application) < 15 and  application[column].isna().sum()*100/len(application) > 0:
        print(column)
AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE CNT_FAM_MEMBERS EXT_SOURCE_2 OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR

Handling "AMT_ANNUITY" Column

In [177]:
# Applying box plot to see the variation in the data
plt.figure(figsize=[5,5])
sns.boxplot(application['AMT_ANNUITY'])
plt.show()
Notebook Image
In [15]:
# Using describe() on "AMT_ANNUITY" to get it's quantiles and mean.
application['AMT_ANNUITY'].describe()
Out[15]:
count    307499.000000
mean      27108.573909
std       14493.737315
min        1615.500000
25%       16524.000000
50%       24903.000000
75%       34596.000000
max      258025.500000
Name: AMT_ANNUITY, dtype: float64

Conclusion:We can replace the missing values in this data with median.

Handling "NAME_TYPE_SUITE" Column

In [16]:
# Appying value counts to identify fequency of the values in the column.
application['NAME_TYPE_SUITE'].value_counts(normalize=True)
Out[16]:
Unaccompanied      0.811596
Family             0.131112
Spouse, partner    0.037130
Children           0.010669
Other_B            0.005780
Other_A            0.002828
Group of people    0.000885
Name: NAME_TYPE_SUITE, dtype: float64

ConclusonWe can replace the missing value in "NAME_TYPE_SUITE" with "Unaccompanied" as it is the mode with 81% of the column filled with it.

Handling "CNT_FAM_MEMBERS" Column

In [17]:
# Appying value counts to identify fequency of the values in the column.
application['CNT_FAM_MEMBERS'].value_counts(normalize=True)
Out[17]:
2.0     0.514967
1.0     0.220634
3.0     0.171055
4.0     0.080313
5.0     0.011310
6.0     0.001327
7.0     0.000263
8.0     0.000065
9.0     0.000020
10.0    0.000010
14.0    0.000007
16.0    0.000007
12.0    0.000007
20.0    0.000007
11.0    0.000003
13.0    0.000003
15.0    0.000003
Name: CNT_FAM_MEMBERS, dtype: float64
In [18]:
# Applyinng describe() to see the mean and median of the data.
application['CNT_FAM_MEMBERS'].describe()
Out[18]:
count    307509.000000
mean          2.152665
std           0.910682
min           1.000000
25%           2.000000
50%           2.000000
75%           3.000000
max          20.000000
Name: CNT_FAM_MEMBERS, dtype: float64

Conclusion:The null values of this column can be filled by 2 as it is the median, approx mean as well as 50% of data in this from a family of 2.

Handling "AMT_REQ_CREDIT_BUREAU" different columns

In [19]:
# Applying value counts to get friquencies of calues of 'AMT_REQ_CREDIT_BUREAU_HOUR'
application['AMT_REQ_CREDIT_BUREAU_HOUR'].value_counts(normalize=True)
Out[19]:
0.0    0.993887
1.0    0.005865
2.0    0.000211
3.0    0.000034
4.0    0.000004
Name: AMT_REQ_CREDIT_BUREAU_HOUR, dtype: float64

Conclusion:0 has the highest number of entries with 99% of the data.

In [20]:
# Applying value counts to get friquencies of calues of 'AMT_REQ_CREDIT_BUREAU_Day'.
application['AMT_REQ_CREDIT_BUREAU_DAY'].value_counts(normalize=True).head()
Out[20]:
0.0    0.994402
1.0    0.004857
2.0    0.000399
3.0    0.000169
4.0    0.000098
Name: AMT_REQ_CREDIT_BUREAU_DAY, dtype: float64

Conclusion:0 has the highest number of entries with 99% of the data

In [21]:
# Applying value counts to get friquencies of calues of 'AMT_REQ_CREDIT_BUREAU_WEEK'.
application['AMT_REQ_CREDIT_BUREAU_WEEK'].value_counts(normalize=True).head()
Out[21]:
0.0    0.967909
1.0    0.030858
2.0    0.000748
3.0    0.000218
4.0    0.000128
Name: AMT_REQ_CREDIT_BUREAU_WEEK, dtype: float64

Conclusion:0 has the highest number of entries with 96% of the data

In [22]:
# Applying value counts to get friquencies of calues of 'AMT_REQ_CREDIT_BUREAU_MON'.
application['AMT_REQ_CREDIT_BUREAU_MON'].value_counts(normalize=True).head()
Out[22]:
0.0    0.835488
1.0    0.124617
2.0    0.020249
3.0    0.007485
4.0    0.004045
Name: AMT_REQ_CREDIT_BUREAU_MON, dtype: float64

Conclusion:0 has the highest number of entries with 83% of the data

In [23]:
# Applying value counts to get friquencies of calues of 'AMT_REQ_CREDIT_BUREAU_QRT'.
application['AMT_REQ_CREDIT_BUREAU_QRT'].value_counts(normalize=True).head()
Out[23]:
0.0    0.809863
1.0    0.127305
2.0    0.054182
3.0    0.006455
4.0    0.001790
Name: AMT_REQ_CREDIT_BUREAU_QRT, dtype: float64

Conclusion:0 has the highest number of entries with 80% of the data

In [24]:
# Applying value counts to get friquencies of calues of 'AMT_REQ_CREDIT_BUREAU_YEAR'.
application['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts(normalize=True).head()
Out[24]:
0.0    0.269937
1.0    0.238372
2.0    0.188697
3.0    0.126425
4.0    0.077875
Name: AMT_REQ_CREDIT_BUREAU_YEAR, dtype: float64

Conclusion:0 has the highest number of entries with 26% of the data,followed by 1 with 23% of the data.

In [25]:
#Using Describe function on AMT_REQ_CREDIT_BUREAU columns to get their quantile and mean
application[['AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']].describe()
Out[25]:

Report on how to fill the missing values of the following columnns

  • AMT_ANNUITY: It is recommended to fill the null values of this column with the median value of the column.
  • NAME_TYPE_SUIUTE: It is suggested to fill the null value of this column with "Unaccompanied" as 81% data is filled with this value.
  • CNT_FAM_MEMBERS: The null value of this coumn can be filled by 2 as it's the approx mean, median and the mode of the column
  • AMT_REQ_CREDIT_BUREAU_HOUR: The null value of this column can be filled by 0 as 99% data in this column has 0 values as well as mean is almost 0
  • AMT_REQ_CREDIT_BUREAU_DAY: The null value of this column can be filled by 0 as 99% data in this column has 0 values as well as mean is almost 0
  • AMT_REQ_CREDIT_BUREAU_WEEK: The null value of this column can be filled by 0 as 96% data in this column has 0 values as well as mean is almost 0
  • AMT_REQ_CREDIT_BUREAU_MON: The null value of this column can be filled by 0 as 83% data in this column has 0 values as well as mean is almost 0
  • AMT_REQ_CREDIT_BUREAU_QRT: The null value of this column can be filled by 0 as 80% data in this column has 0 values as well as mean is almost 0
  • AMT_REQ_CREDIT_BUREAU_YEAR: The null value of this column can be filled by 1 as it is the median value and second highest in the value counts.

  • Subtask 2.3: Treating columns with wrong Data Type or Data

  • ORGANIZATION_TYPE column has null values which are written in the form of "XNA" which is a string value and not Null(although XNA meas null). Hence we are converting those values to null so that they can be eliminated at the time of Analysis
  • COD_GENDER column as well has "XNA" values which are null but are considered as string. Hence we'll get rid of them.
In [26]:
#Using apply() and labda function to change the XNA values on "ORGANIZATION_TYPE" column.
application['ORGANIZATION_TYPE']=application['ORGANIZATION_TYPE'].apply(lambda x:None if x=="XNA" else x)
In [27]:
#Using apply() and labda function to change the XNA values on "CODE_GENDER" column.
application['CODE_GENDER']=application['CODE_GENDER'].apply(lambda x:None if x=="XNA" else x)
In [28]:
#Checking the data types of each column.
application.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 73 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307507 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 211120 non-null object 28 CNT_FAM_MEMBERS 307509 non-null float64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 252137 non-null object 40 EXT_SOURCE_2 306851 non-null float64 41 EXT_SOURCE_3 246546 non-null float64 42 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 43 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 44 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 45 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 46 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 47 FLAG_DOCUMENT_2 307511 non-null int64 48 FLAG_DOCUMENT_3 307511 non-null int64 49 FLAG_DOCUMENT_4 307511 non-null int64 50 FLAG_DOCUMENT_5 307511 non-null int64 51 FLAG_DOCUMENT_6 307511 non-null int64 52 FLAG_DOCUMENT_7 307511 non-null int64 53 FLAG_DOCUMENT_8 307511 non-null int64 54 FLAG_DOCUMENT_9 307511 non-null int64 55 FLAG_DOCUMENT_10 307511 non-null int64 56 FLAG_DOCUMENT_11 307511 non-null int64 57 FLAG_DOCUMENT_12 307511 non-null int64 58 FLAG_DOCUMENT_13 307511 non-null int64 59 FLAG_DOCUMENT_14 307511 non-null int64 60 FLAG_DOCUMENT_15 307511 non-null int64 61 FLAG_DOCUMENT_16 307511 non-null int64 62 FLAG_DOCUMENT_17 307511 non-null int64 63 FLAG_DOCUMENT_18 307511 non-null int64 64 FLAG_DOCUMENT_19 307511 non-null int64 65 FLAG_DOCUMENT_20 307511 non-null int64 66 FLAG_DOCUMENT_21 307511 non-null int64 67 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 68 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 69 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 70 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 71 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 72 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 dtypes: float64(20), int64(41), object(12) memory usage: 171.3+ MB

DAYS_REGISTRATION,CNT_FAM_MEMBERS,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR

In [156]:
#Changing the DataTypes of the columns which are wrong.
application['AMT_REQ_CREDIT_BUREAU_HOUR'].value_counts()
Out[156]:
0.0    264366
1.0      1560
2.0        56
3.0         9
4.0         1
Name: AMT_REQ_CREDIT_BUREAU_HOUR, dtype: int64
In [160]:
application['CNT_FAM_MEMBERS'].value_counts()
Out[160]:
2.0     158357
1.0      67847
3.0      52601
4.0      24697
5.0       3478
6.0        408
7.0         81
8.0         20
9.0          6
10.0         3
14.0         2
16.0         2
12.0         2
20.0         2
11.0         1
13.0         1
15.0         1
Name: CNT_FAM_MEMBERS, dtype: int64

Conclusion:

  • As Days cannot be a float value, we are converting it into integer.
  • As Count of family members also not be in float, converting that column into integer
  • AMT_REQ_CREDIT_BUREAU columns can not be in float as the count of inquiries made is a whole number and not a fraction. Hence we will convert those columns
In [165]:
#Converting the columns using astype()
application['CNT_FAM_MEMBERS']=application['CNT_FAM_MEMBERS'].dropna().astype(int)
application['DAYS_REGISTRATION']=application['DAYS_REGISTRATION'].astype(int)
application['AMT_REQ_CREDIT_BUREAU_HOUR']=application['AMT_REQ_CREDIT_BUREAU_HOUR'].dropna().astype(int)
application['AMT_REQ_CREDIT_BUREAU_DAY']=application['AMT_REQ_CREDIT_BUREAU_DAY'].dropna().astype(int)
application['AMT_REQ_CREDIT_BUREAU_WEEK']=application['AMT_REQ_CREDIT_BUREAU_WEEK'].dropna().astype(int)
application['AMT_REQ_CREDIT_BUREAU_MON']=application['AMT_REQ_CREDIT_BUREAU_MON'].dropna().astype(int)
application['AMT_REQ_CREDIT_BUREAU_QRT']=application['AMT_REQ_CREDIT_BUREAU_QRT'].dropna().astype(int)
application['AMT_REQ_CREDIT_BUREAU_YEAR']=application['AMT_REQ_CREDIT_BUREAU_YEAR'].dropna().astype(int)

  • Subtask 2.4: Outlier Detection

Checking outliers for column:AMT_INCOME_TOTAL

In [29]:
#Applying box plot to test for outliers in AMT_INCOME_TOTAL
plt.figure(figsize=[10,5])
plt.subplot(1,2,1)
sns.boxplot(application['AMT_INCOME_TOTAL'])
plt.subplot(1,2,2)
sns.distplot(application['AMT_INCOME_TOTAL'])
plt.show()
Notebook Image
In [30]:
#Checking the % of people with AMT_INCOME>400000
len(application[application['AMT_INCOME_TOTAL']>400000])/len(application)
Out[30]:
0.02622020025299908
In [31]:
#Applying describe() on AMT_INCOME
application['AMT_INCOME_TOTAL'].describe()
Out[31]:
count    3.075110e+05
mean     1.687979e+05
std      2.371231e+05
min      2.565000e+04
25%      1.125000e+05
50%      1.471500e+05
75%      2.025000e+05
max      1.170000e+08
Name: AMT_INCOME_TOTAL, dtype: float64

Conclusion:We can clearly see that there are so many outliers in this column. The IQR clculated is 90000. If we would want to fix them , we can go ahead by dropping the values more than 4,00,000 rupees. As only 2% of the data is more than that value.

Checking outliers for column:AMT_CREDIT

In [32]:
#Using boxplot() and distplot() to identify outliers in AMT_CREDIT,
plt.figure(figsize=[10,5])
plt.subplot(1,2,1)
sns.boxplot(application['AMT_CREDIT'])
plt.subplot(1,2,2)
sns.distplot(application['AMT_CREDIT'])
plt.show()
Notebook Image
In [33]:
#USing describe() on AMT_CREDIT.
application['AMT_CREDIT'].describe()
Out[33]:
count    3.075110e+05
mean     5.990260e+05
std      4.024908e+05
min      4.500000e+04
25%      2.700000e+05
50%      5.135310e+05
75%      8.086500e+05
max      4.050000e+06
Name: AMT_CREDIT, dtype: float64
In [34]:
#To check the % of people with AMT_CREDIT<1600000.
len(application[application['AMT_CREDIT']<1600000])/len(application)
Out[34]:
0.9782186653485566

Conclusion:As there are few outliers in the data, we can go ahead and deal it by dropping those rows. Rows with Values more than 16,00,000 rupees can dropped.

Checking outliers for column:AMT_ANNUITY

In [35]:
#Applying boxplot() and distplot() to  identify outliers.
plt.figure(figsize=[10,5])
plt.subplot(1,2,1)
sns.boxplot(application['AMT_ANNUITY'])
plt.subplot(1,2,2)
sns.distplot(application['AMT_ANNUITY'])
plt.show()
Notebook Image
In [36]:
#Using describe() to see the quantiles and mean of AMT_ANNUITY.
application['AMT_ANNUITY'].describe()
Out[36]:
count    307499.000000
mean      27108.573909
std       14493.737315
min        1615.500000
25%       16524.000000
50%       24903.000000
75%       34596.000000
max      258025.500000
Name: AMT_ANNUITY, dtype: float64
In [37]:
#Calculating the % of people with ANN_ANNUITY>60000.
len(application[application['AMT_ANNUITY']>60000])/len(application)
Out[37]:
0.02644458247022058

Conclusion:In AMT_ANNUITY column we can observe that there are few outliers. Values more than 60,000 rupees are outliers. Since only 2% of the data is more than that, we can go ahead in dropping them.

Checking outliers for column:REGION_POPULATION_RELATIVE

In [38]:
#Applying Boxplot() and distplot() on REGION_POPULATION_RELATIVE to check for outliers
plt.figure(figsize=[10,5])
plt.subplot(1,2,1)
sns.boxplot(application['REGION_POPULATION_RELATIVE'])
plt.subplot(1,2,2)
sns.distplot(application['REGION_POPULATION_RELATIVE'])
plt.show()
Notebook Image
In [39]:
#Using describe() on REGION_POPULATION_RELATIVE to check for quantiles and mean.
application['REGION_POPULATION_RELATIVE'].describe()
Out[39]:
count    307511.000000
mean          0.020868
std           0.013831
min           0.000290
25%           0.010006
50%           0.018850
75%           0.028663
max           0.072508
Name: REGION_POPULATION_RELATIVE, dtype: float64
In [40]:
# Finding the % of people with REGION_POPUlATION_RELATIVE]>0.06.
len(application[application['REGION_POPULATION_RELATIVE']>0.06])/len(application)
Out[40]:
0.02735511900387303

Conclusion:REGION_POPULATION_RELATIVE when obsereved, we founnd many outliers. The values of this outliers are more than 0.6. Hence it is best to drop the rows with values of this column more than 0.6

Checking outliers for column:CNT_CHILDREN

In [41]:
#Applying boxplot() and distplot() to check for outliers in "CNT_CHILDREN"
plt.figure(figsize=[10,5])
plt.subplot(1,2,1)
sns.boxplot(application['CNT_CHILDREN'])
plt.subplot(1,2,2)
sns.distplot(application['CNT_CHILDREN'])
plt.show()
Notebook Image
In [42]:
#Using describe() to check for quantiles and mean of CNT_CHILDREN
application['CNT_CHILDREN'].describe()
Out[42]:
count    307511.000000
mean          0.417052
std           0.722121
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          19.000000
Name: CNT_CHILDREN, dtype: float64
In [43]:
#Calculating % of people with CNT_CHILDREN>3.
len(application[application['CNT_CHILDREN']>3])/len(application)
Out[43]:
0.00180481348634683

Conclusion:CNT_CHILDREN columns also have outliers. Values more than 3 are just 0.1%. Hence we should opt for deleting those rows or bin it into one value.

  • Subtask 2.4: Binning of Continous Data

In [44]:
application.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 73 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307507 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 211120 non-null object 28 CNT_FAM_MEMBERS 307509 non-null float64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 252137 non-null object 40 EXT_SOURCE_2 306851 non-null float64 41 EXT_SOURCE_3 246546 non-null float64 42 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 43 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 44 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 45 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 46 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 47 FLAG_DOCUMENT_2 307511 non-null int64 48 FLAG_DOCUMENT_3 307511 non-null int64 49 FLAG_DOCUMENT_4 307511 non-null int64 50 FLAG_DOCUMENT_5 307511 non-null int64 51 FLAG_DOCUMENT_6 307511 non-null int64 52 FLAG_DOCUMENT_7 307511 non-null int64 53 FLAG_DOCUMENT_8 307511 non-null int64 54 FLAG_DOCUMENT_9 307511 non-null int64 55 FLAG_DOCUMENT_10 307511 non-null int64 56 FLAG_DOCUMENT_11 307511 non-null int64 57 FLAG_DOCUMENT_12 307511 non-null int64 58 FLAG_DOCUMENT_13 307511 non-null int64 59 FLAG_DOCUMENT_14 307511 non-null int64 60 FLAG_DOCUMENT_15 307511 non-null int64 61 FLAG_DOCUMENT_16 307511 non-null int64 62 FLAG_DOCUMENT_17 307511 non-null int64 63 FLAG_DOCUMENT_18 307511 non-null int64 64 FLAG_DOCUMENT_19 307511 non-null int64 65 FLAG_DOCUMENT_20 307511 non-null int64 66 FLAG_DOCUMENT_21 307511 non-null int64 67 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 68 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 69 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 70 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 71 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 72 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 dtypes: float64(20), int64(41), object(12) memory usage: 171.3+ MB
In [203]:
#Binning Age of the data
age=application['DAYS_BIRTH'].abs()/365
application["age_group"]=pd.cut(age,[0,30,40,50,60,9999],labels=["<30","30-40","40-50","50-60","60+"])
In [204]:
#Applying value_counts() on age column.
application.age_group.value_counts()
Out[204]:
30-40    82308
40-50    76541
50-60    68062
<30      45021
60+      35579
Name: age_group, dtype: int64
In [209]:
#Binning of period of since registerations of the customers.
regi=application['DAYS_REGISTRATION'].abs()/365
application['CUSTOMER_REG_CATEGORY']=pd.cut(regi,[0,2,5,10,20,999],labels=["Extremely New Customer","New Customer","Medium Customer","Old Customer","Extremely Old Customer"])
In [210]:
application.CUSTOMER_REG_CATEGORY.value_counts()
Out[210]:
Old Customer              100657
Extremely Old Customer     80639
Medium Customer            54614
New Customer               39130
Extremely New Customer     32391
Name: CUSTOMER_REG_CATEGORY, dtype: int64

Task 3: Analysis

In [45]:
#Checking the imbalance percent of the target variable.
application['TARGET'].value_counts(normalize=True)
Out[45]:
0    0.919271
1    0.080729
Name: TARGET, dtype: float64
In [46]:
# Plotting a horizontal bar graph to check for the imbalance in "TARGET" variable.
application['TARGET'].value_counts().plot.barh()
plt.grid(b=False)
plt.show()
Notebook Image

The target variable is defined as:

  • 1: If the client faces difficulty in paying any installment of the loan. (Only 10% of people in the data are not able to repay the loan on time)
  • 0: If the client is able to pay all the installments. (90% of the people in the data are able pay the loan on time)

One can clearly see that our data is extremely imbalance.

Dividing The DATA INTO TWO SETS.

  • T1: in this dataframe, target variable wil be 1
  • T0: in this dataframe, target variable wil be 0
In [47]:
#Createing dataframe with target=0
T0=application[application["TARGET"]==0]
T0.head()
Out[47]:
In [48]:
#Createing dataframe with target=1
T1=application[application["TARGET"]==1]
T1.head()
Out[48]:

Subtask 1. Univariate Analysis

Analysing NAME_CONTRACT_TYPE

In [50]:
#Applying Value Counts on 'NAME_CONTRACT_TYPE' for T0 dataframe.
T0['NAME_CONTRACT_TYPE'].value_counts(normalize=True)
Out[50]:
Cash loans         0.9021
Revolving loans    0.0979
Name: NAME_CONTRACT_TYPE, dtype: float64
In [51]:
#Applying Value Counts on 'NAME_CONTRACT_TYPE' for T1 dataframe.
T1['NAME_CONTRACT_TYPE'].value_counts(normalize=True)
Out[51]:
Cash loans         0.935388
Revolving loans    0.064612
Name: NAME_CONTRACT_TYPE, dtype: float64

As we already know that there is huge difference in size of both the dataframes, hence in order to compare the results of the frequency we though it it better to normalize them, as it'll show the relative count.

In [52]:
#T0['NAME_CONTRACT_TYPE'].value_counts(normalize=True).plot.barh()
index = ['Cash loans','Revolving loans']
df = pd.DataFrame({'T0': T0['NAME_CONTRACT_TYPE'].value_counts(normalize=True),'T1': T1['NAME_CONTRACT_TYPE'].value_counts(normalize=True)}, index=index)
ax = df.plot.barh()
Notebook Image

Inference:

  1. 93% of non-Defaulters could repay Cash loan and 7% of them could repay Revolvng loan
  2. We can see that 90% of defaulted people defaulted in Cash loan and 10% defaulted in Revolving loan

Analysis on NAME_EDUCATION_TYPE

In [53]:
#Applying value_counts() on NAME_EDUCATION_TYPE of T0 dataframe
T0['NAME_EDUCATION_TYPE'].value_counts(normalize=True)
Out[53]:
Secondary / secondary special    0.703491
Higher education                 0.250646
Incomplete higher                0.033270
Lower secondary                  0.012024
Academic degree                  0.000570
Name: NAME_EDUCATION_TYPE, dtype: float64
In [54]:
#Applying value_counts() on NAME_EDUCATION_TYPE of T1 dataframe
T1['NAME_EDUCATION_TYPE'].value_counts(normalize=True)
Out[54]:
Secondary / secondary special    0.786465
Higher education                 0.161490
Incomplete higher                0.035126
Lower secondary                  0.016798
Academic degree                  0.000121
Name: NAME_EDUCATION_TYPE, dtype: float64
In [55]:
#Plotting barh on value_counts()
index = ['Secondary / secondary special','Higher education','Incomplete higher','Lower secondary','Academic degree']
df = pd.DataFrame({'T0': T0['NAME_EDUCATION_TYPE'].value_counts(normalize=True),'T1': T1['NAME_EDUCATION_TYPE'].value_counts(normalize=True)}, index=index)
ax = df.plot.barh()
Notebook Image

Inferences:

  1. One can see that the proportion of people failing to pay the loan is lower than the proportion of people who are able to with higher education.
  2. The 78% of defaulter are with Secondary education, 25% are of higher education and 3% are with incomplete education

Analysis on NAME_INCOME_TYPE

In [56]:
#Applying value_counts() on NAME_INCOME_TYPE for T0 dataframe
T0['NAME_INCOME_TYPE'].value_counts(normalize=True)
Out[56]:
Working                 0.507807
Commercial associate    0.234384
Pensioner               0.185294
State servant           0.072356
Student                 0.000064
Unemployed              0.000050
Businessman             0.000035
Maternity leave         0.000011
Name: NAME_INCOME_TYPE, dtype: float64
In [179]:
#Applying value_counts on NAME_INCOME_TYPE for T1 dataframe
T1['NAME_INCOME_TYPE'].value_counts(normalize=True)
Out[179]:
Working                 0.613253
Commercial associate    0.215911
Pensioner               0.120121
State servant           0.050312
Unemployed              0.000322
Maternity leave         0.000081
Name: NAME_INCOME_TYPE, dtype: float64
In [180]:
#Plotting bar chart to understand Income type of the customers6
index = ['Working','Commercial Associate','Pensioner','State servant','Student','Unemployeed','Businessman','Maternity leave']
df = pd.DataFrame({'T0': T0['NAME_INCOME_TYPE'].value_counts(normalize=True),'T1': T1['NAME_INCOME_TYPE'].value_counts(normalize=True)}, index=index)
ax = df.plot.bar()
Notebook Image

Inference:

  1. 61% of defaulters are from working category,12% are from prnsioner category ,21% are from commercial associate.
  2. 50% of non defaulters are from Working category, 18% from pensioner category 23% are from commercial associate category.

Analysis on NAME_FAMILY_STATUS

In [59]:
#Applying value_counts() on NAME_FAMILY_STATUS on T0 DataFrame
T0['NAME_FAMILY_STATUS'].value_counts(normalize=True)
Out[59]:
Married                 0.642345
Single / not married    0.144991
Civil marriage          0.094854
Separated               0.064206
Widow                   0.053597
Unknown                 0.000007
Name: NAME_FAMILY_STATUS, dtype: float64
In [60]:
#Applying value_counts() on NAME_FAMILY_STATUS  on T1 Dataframe
T1['NAME_FAMILY_STATUS'].value_counts(normalize=True)
Out[60]:
Married                 0.598187
Single / not married    0.179537
Civil marriage          0.119275
Separated               0.065257
Widow                   0.037744
Name: NAME_FAMILY_STATUS, dtype: float64
In [61]:
#Plotting bar charts to understand the frequency. 
plt.figure(figsize=[10,5])
plt.subplot(1,2,1)
T0['NAME_FAMILY_STATUS'].value_counts(normalize=True).plot.bar()
plt.title("Family status of people who repayed the loan")
plt.subplot(1,2,2)
T1['NAME_FAMILY_STATUS'].value_counts(normalize=True).plot.bar()
plt.title("Family status of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences:One can see that the pattern of the frequncy for FAMILY_STATUS is similar for defaulters as well as non-defaulter

Analysis on CODE_GENDER

In [181]:
#Applying value_counts() on CODE_GENDER for T0 dataframe
T0['CODE_GENDER'].dropna().value_counts(normalize=True)
Out[181]:
F    0.666042
M    0.333958
Name: CODE_GENDER, dtype: float64
In [63]:
#Applying value_counts on CODE_GENDER for T1 dataframe
T1['CODE_GENDER'].value_counts(normalize=True)
Out[63]:
F    0.570796
M    0.429204
Name: CODE_GENDER, dtype: float64
In [64]:
#Applying Pie chart to view the distribution.
plt.figure(figsize=[10,10])
plt.subplot(1,2,1)
T0['CODE_GENDER'].dropna().value_counts(normalize=True).plot.pie()
plt.title("Genders of people who repayed the loan")
plt.subplot(1,2,2)
T1['CODE_GENDER'].value_counts(normalize=True).plot.pie()
plt.title("Genders of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences: The % of males is less in deafulters than in non defaulters. And % of females is mor in non-defaulters tha defaulters.

Analysis on OCCUPATION_TYPE

In [65]:
#Applying value_counts() on OCCUPATION_TYPE on T0 dataframe.
T0['OCCUPATION_TYPE'].dropna().value_counts(normalize=True)
Out[65]:
Laborers                 0.256256
Sales staff              0.150644
Core staff               0.134141
Managers                 0.104080
Drivers                  0.085661
High skill tech staff    0.055454
Accountants              0.048496
Medicine staff           0.041361
Security staff           0.031152
Cooking staff            0.027652
Cleaning staff           0.021841
Private service staff    0.012863
Low-skill Laborers       0.009004
Secretaries              0.006299
Waiters/barmen staff     0.006211
Realty agents            0.003593
HR staff                 0.002737
IT staff                 0.002555
Name: OCCUPATION_TYPE, dtype: float64
In [66]:
#Applying value_counts() on OCCUPATION_TYPE on T1 dataframe.
T1['OCCUPATION_TYPE'].value_counts(normalize=True)
Out[66]:
Laborers                 0.314768
Sales staff              0.166712
Drivers                  0.113603
Core staff               0.093708
Managers                 0.071602
Security staff           0.038928
High skill tech staff    0.037796
Cooking staff            0.033483
Medicine staff           0.030841
Accountants              0.025557
Cleaning staff           0.024101
Low-skill Laborers       0.019356
Private service staff    0.009435
Waiters/barmen staff     0.008195
Secretaries              0.004960
Realty agents            0.003181
HR staff                 0.001941
IT staff                 0.001833
Name: OCCUPATION_TYPE, dtype: float64
In [67]:
#Drawing barchart to visualize the dstribution. 
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
T0['OCCUPATION_TYPE'].dropna().value_counts(normalize=True).plot.bar()
plt.title("Occupation of people who repayed the loan")
plt.subplot(1,2,2)
T1['OCCUPATION_TYPE'].value_counts(normalize=True).plot.bar()
plt.title("Occupation of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences: The % distribution for both defaulters and non-defaulters is same in Occuation_type column.

Analysis on ORGANIZATION_TYPE

In [68]:
#Applying value_counts() on ORGANISATION_TYPE for T0 Dataframe.
T0['ORGANIZATION_TYPE'].dropna().value_counts(normalize=True)
Out[68]:
Business Entity Type 3    0.267774
Self-employed             0.149821
Other                     0.066903
Medicine                  0.045401
Government                0.042023
Business Entity Type 2    0.041915
School                    0.036331
Trade: type 7             0.030790
Kindergarten              0.027772
Construction              0.025775
Business Entity Type 1    0.023869
Transport: type 4         0.021263
Industry: type 9          0.013647
Trade: type 3             0.013595
Industry: type 3          0.012722
Security                  0.012692
Housing                   0.011824
Military                  0.010851
Industry: type 11         0.010725
Bank                      0.010321
Police                    0.009657
Agriculture               0.009540
Transport: type 2         0.008823
Postal                    0.008576
Security Ministries       0.008155
Trade: type 2             0.007673
Restaurant                0.006943
Services                  0.006387
University                0.005480
Industry: type 7          0.005219
Transport: type 3         0.004342
Industry: type 1          0.004012
Hotel                     0.003925
Electricity               0.003851
Industry: type 4          0.003422
Trade: type 6             0.002614
Insurance                 0.002445
Industry: type 5          0.002423
Telecom                   0.002314
Emergency                 0.002258
Industry: type 2          0.001845
Advertising               0.001711
Culture                   0.001554
Industry: type 12         0.001541
Realtor                   0.001537
Trade: type 1             0.001376
Mobile                    0.001251
Legal Services            0.001220
Cleaning                  0.001003
Transport: type 1         0.000834
Industry: type 6          0.000452
Industry: type 10         0.000443
Religion                  0.000347
Trade: type 4             0.000269
Industry: type 13         0.000252
Trade: type 5             0.000200
Industry: type 8          0.000091
Name: ORGANIZATION_TYPE, dtype: float64
In [183]:
#Applying value_counts() on ORGANISATION_TYPE for T0 Dataframe.
T1['ORGANIZATION_TYPE'].dropna().value_counts(normalize=True)
Out[183]:
Business Entity Type 3    0.289581
Self-employed             0.178979
Other                     0.058392
Business Entity Type 2    0.041218
Construction              0.035951
Trade: type 7             0.033891
Medicine                  0.033753
Government                0.033249
School                    0.024090
Transport: type 4         0.022945
Business Entity Type 1    0.022304
Kindergarten              0.022166
Trade: type 3             0.016533
Industry: type 3          0.015938
Security                  0.014839
Agriculture               0.011770
Housing                   0.010763
Industry: type 11         0.010717
Industry: type 9          0.010305
Restaurant                0.009709
Transport: type 3         0.008564
Postal                    0.008335
Transport: type 2         0.007877
Military                  0.006183
Trade: type 2             0.006091
Bank                      0.005954
Police                    0.005358
Industry: type 1          0.005267
Industry: type 7          0.004809
Services                  0.004763
Security Ministries       0.004397
Industry: type 4          0.004076
University                0.002977
Electricity               0.002885
Hotel                     0.002839
Telecom                   0.002015
Realtor                   0.001924
Industry: type 5          0.001878
Emergency                 0.001832
Advertising               0.001603
Insurance                 0.001557
Industry: type 2          0.001511
Trade: type 1             0.001420
Trade: type 6             0.001328
Cleaning                  0.001328
Mobile                    0.001328
Legal Services            0.001099
Culture                   0.000962
Industry: type 12         0.000641
Industry: type 13         0.000412
Transport: type 1         0.000412
Industry: type 6          0.000366
Industry: type 10         0.000321
Religion                  0.000229
Industry: type 8          0.000137
Trade: type 5             0.000137
Trade: type 4             0.000092
Name: ORGANIZATION_TYPE, dtype: float64
In [70]:
#Drawing a bar chart to view the distributon.
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
T0['ORGANIZATION_TYPE'].dropna().value_counts(normalize=True).plot.bar()
plt.title("Occupation of people who repayed the loan")
plt.subplot(1,2,2)
T1['ORGANIZATION_TYPE'].dropna().value_counts(normalize=True).plot.bar()
plt.title("Occupation of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences: There doesn't seem to be a significant difference in the distrbution of occupation for defaulters and non-defaulter.

Analysis on DAYS_BIRTH

In [71]:
#making a List for age of people in Datafram T1
ageT1=T1['DAYS_BIRTH'].abs()/365
In [72]:
#making a List for age of people in Datafram T0
ageT0=T0['DAYS_BIRTH'].abs()/365
In [73]:
#Drawing histograph to see the distribution of age in T1 and T0.
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
ageT0.plot.hist(bins=20)
plt.title("Age Distribution of people who repayed the loan")
plt.subplot(1,2,2)
ageT1.plot.hist(bins=20)
plt.title("Age Distribution of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences:

  1. The % of people in the age group 30-35 is more in defaulters than in non-defaulters.
  2. The % of people in defaulters decreases with age after 30 years of age|

Analysis on CNT_FAM_MEMBERS

In [74]:
#Applying value_counts() on CNT_FAM_MEMBERS on T0 Dataframe
T0['CNT_FAM_MEMBERS'].value_counts()
Out[74]:
2.0     146348
1.0      62172
3.0      47993
4.0      22561
5.0       3151
6.0        353
7.0         75
8.0         14
9.0          6
12.0         2
20.0         2
10.0         2
16.0         2
14.0         2
15.0         1
Name: CNT_FAM_MEMBERS, dtype: int64
In [75]:
#Applying value_counts() on CNT_FAM_MEMBERS on T1 Dataframe
T1['CNT_FAM_MEMBERS'].value_counts()
Out[75]:
2.0     12009
1.0      5675
3.0      4608
4.0      2136
5.0       327
6.0        55
7.0         6
8.0         6
11.0        1
10.0        1
13.0        1
Name: CNT_FAM_MEMBERS, dtype: int64
In [186]:
# Drawing distplot() to see the distribution of age for defalters and non defaluters.
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
sns.distplot(T0['CNT_FAM_MEMBERS'])
plt.title("Count of family members of people who repayed the loan")
plt.subplot(1,2,2)
sns.distplot(T1['CNT_FAM_MEMBERS'])
plt.title("Count of family members of people who didn't repay the loan")
plt.show()
Notebook Image

Inference : The distribution of count of family members seems similar for both the dataframes.

Subtask 2. Bivariate Analysis

In [77]:
#Using info() to get the names of columns
application.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 73 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307507 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 211120 non-null object 28 CNT_FAM_MEMBERS 307509 non-null float64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 252137 non-null object 40 EXT_SOURCE_2 306851 non-null float64 41 EXT_SOURCE_3 246546 non-null float64 42 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 43 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 44 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 45 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 46 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 47 FLAG_DOCUMENT_2 307511 non-null int64 48 FLAG_DOCUMENT_3 307511 non-null int64 49 FLAG_DOCUMENT_4 307511 non-null int64 50 FLAG_DOCUMENT_5 307511 non-null int64 51 FLAG_DOCUMENT_6 307511 non-null int64 52 FLAG_DOCUMENT_7 307511 non-null int64 53 FLAG_DOCUMENT_8 307511 non-null int64 54 FLAG_DOCUMENT_9 307511 non-null int64 55 FLAG_DOCUMENT_10 307511 non-null int64 56 FLAG_DOCUMENT_11 307511 non-null int64 57 FLAG_DOCUMENT_12 307511 non-null int64 58 FLAG_DOCUMENT_13 307511 non-null int64 59 FLAG_DOCUMENT_14 307511 non-null int64 60 FLAG_DOCUMENT_15 307511 non-null int64 61 FLAG_DOCUMENT_16 307511 non-null int64 62 FLAG_DOCUMENT_17 307511 non-null int64 63 FLAG_DOCUMENT_18 307511 non-null int64 64 FLAG_DOCUMENT_19 307511 non-null int64 65 FLAG_DOCUMENT_20 307511 non-null int64 66 FLAG_DOCUMENT_21 307511 non-null int64 67 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 68 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 69 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 70 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 71 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 72 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 dtypes: float64(20), int64(41), object(12) memory usage: 171.3+ MB

Below are the columns which we feel are useful for our analysis and understanding. We are considering it solely through our understanding of the problem and our instincts.

"SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER","CNT_CHLDREN","AMT_INCOME_TOTAL","AMT_ANNUITY","AMT_CREDIT","NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE","NAME_FAMMILY","DAYS_BIRTH","DAYS_REGISTRATION","OCCUPATION_TYPE","CNT_FAM_MEMBERS", "REGION_RATING_CLIENT","ORGANIZATION_TYPE","OBS_30_CNT_SOCIAL_CIRCLE","DEF_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE","DEF_60_CNT_SOCIAL_CIRCLE"

AMT_CREDIT vs AMT_INCOME_TOTAL(CONTINOUS vs CONTINOUS)

In [79]:
#Applying scatter plot on AMT_CREDIT and AMT_INCOME_TOTAL for application data.
plt.scatter(application['AMT_CREDIT'] ,application['AMT_INCOME_TOTAL'])
plt.show()
Notebook Image

As there are many outliers in this column we aren't getting a proper graph for how the data is distributed. Drawing another graph to view the distribution of the data after dropping na values

In [80]:
# Creating a temporary file with AMT_CREDIT<1600000 after dropping outliers
temp=application[application['AMT_CREDIT']<1600000]
In [81]:
#Applying scatter plot on AMT_CREDIT and AMT_INCOME_TOTAL for application data.
plt.scatter(temp['AMT_CREDIT'] ,temp['AMT_INCOME_TOTAL'])
plt.show()
Notebook Image

Inferences: There isn't a good correlation between Credit and Income even after dropping the outlier.

AMT_CREDIT vs CNT_CHLDREN(CONTINOUS vs CONTINOUS)

In [187]:
#Applying Scatter plot on CNT_CHILDREN and  AMT_CREDIT on T1 and T0 dataframe
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
plt.scatter(T0['CNT_CHILDREN'] ,T0['AMT_CREDIT'])
plt.title("Count of Children of people vs Credit of people who repayed the loan")
plt.subplot(1,2,2)
plt.scatter(T1['CNT_CHILDREN'] ,T1['AMT_CREDIT'])
plt.title("Count of Children of people vs Credit of people who didn't repay the loan")
plt.show()
Notebook Image
In [83]:
#Applying Scatter plot on CNT_CHILDREN and  AMT_CREDIT on application dataframe
plt.scatter(application['CNT_CHILDREN'] ,application['AMT_CREDIT'])
plt.show()
Notebook Image

Inferences:

  1. We can Infere from the graph that peope with higher numerber of children do not generally tend to go for the loan.
  2. Also there isn't much difference in the pattern of having children and the credit they have for both defaulters and non- defaulters

AMT_CREDIT vs AMT_ANNUITY(CONTINOUS vs CONTINOUS)

In [84]:
#Applying scatter plot on AMT_ANNUITY and AMT_CREDIT on T1 and T0 dataframes.
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
plt.scatter(T0['AMT_ANNUITY'] ,T0['AMT_CREDIT'])
plt.title("Annuity vs Credit of people who repayed the loan")
plt.subplot(1,2,2)
plt.scatter(T1['AMT_ANNUITY'] ,T1['AMT_CREDIT'])
plt.title("Annuity vs Credit of people who didn't repay the loan")
plt.show()
Notebook Image
In [85]:
#Applying scatter plot on AMT_ANNUITY and AMT_CREDIT on application dataframes.
plt.scatter(application['AMT_ANNUITY'] ,application['AMT_CREDIT'])
plt.show()
Notebook Image

Inferences:We can see that Annuity is proportional to Credit. Higher the credit on the person, higher is the annuity. This was seen for both people who defaulted and people who didn't

AMT_CREDIT vs AMT_AGE(CONTINOUS vs CONTINOUS)

In [189]:
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
plt.scatter(ageT0 ,T0['AMT_CREDIT'])
plt.title("Age vs Credit of people who repayed the loan")
plt.subplot(1,2,2)
plt.scatter(ageT1 ,T1['AMT_CREDIT'])
plt.title("Age vs Credit of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences: Age of the people and Credit of the people are not correlated for both the dataframes

NAME_CONTRACT_TYPE vs NAME_EDUCATION_TYPE(CATEGORICAL vs CATEGORICAL)

In [99]:
# Making a crosstable to view the frequency of people according to education and contract for people who successfuly repayed the loan.
pd.crosstab(index=T0["NAME_CONTRACT_TYPE"],columns=T0["NAME_EDUCATION_TYPE"],normalize="columns")
Out[99]:
In [101]:
# Making a crosstable to view the frequency of people according to education and contract for people who failed to repay the loan.
pd.crosstab(index=T1["NAME_CONTRACT_TYPE"],columns=T1["NAME_EDUCATION_TYPE"],normalize="columns")
Out[101]:

Inferences:

  1. Of all the people who could repay the loan and with accedemic degree, 88% repayed Cash loan and 12% repayed Revolving loan.

  2. Of all the people who could not repay the loan and with accedemic degree, 100% failed in repaying Cash loan and 0% failed in repaying Revolving loan.

  3. Of all the people who could repay the loan and with incomplete higher education, 87% repayed repayed cash loans and 13% repayed Revolving,

  4. Of all the people who could not repay the loan and with incomplete higher education, 92% failed in repaying Cash loan and 8% failed in repaying Revolving loan.

NAME_CONTRACT_TYPE vs NAME_INCOME_TYPE(CATEGOICAL vs CATEGORICAL)

In [104]:
# Making a crosstable to view the frequency of people according to income type and contract for people who successfuly repayed the loan.
pd.crosstab(index=T0["NAME_CONTRACT_TYPE"],columns=T0["NAME_INCOME_TYPE"],normalize="columns")
Out[104]:
In [103]:
# Making a crosstable to view the frequency of people according to income type and contract for people who failed to repay the loan.
pd.crosstab(index=T1["NAME_CONTRACT_TYPE"],columns=T1["NAME_INCOME_TYPE"],normalize="columns")
Out[103]:

Inference:

  1. Of those who are not a defaulter and are commercial associate, 88% repayed cash loan and 12% repayed Revolving loans
  2. Of those who are defaulter and are commercial associate,93% failed in repaying cash loans and 7% failed in repaying revolving loans

Note: All the students and buisnessman who were analysed didn't default. Athought he number of people analysed is very low. Hence These categories shouldn't be consdered untill we have equal representation of this class of Income type.

In [111]:
#Printing the count of stundets and businessman analsed overall.
print(len(application[application['NAME_INCOME_TYPE']=="Student"]))
print(len(application[application['NAME_INCOME_TYPE']=="Businessman"]))
18 10

NAME_CONTRACT_TYPE vs NAME_FAMILY_STATUS(CATEGORICAL vs CATEGORICAL)

In [114]:
# Making a crosstable to view the frequency of people according to Family status and contract for people who successfuly repayed the loan.
pd.crosstab(index=T0["NAME_CONTRACT_TYPE"],columns=T0["NAME_FAMILY_STATUS"],normalize="columns")
Out[114]:
In [112]:
# Making a crosstable to view the frequency of people according to Family status and contract for people who successfuly repayed the loan.
pd.crosstab(index=T0["NAME_CONTRACT_TYPE"],columns=T0["NAME_FAMILY_STATUS"],normalize=True)
Out[112]:
In [110]:
# Making a crosstable to view the frequency of people according to Family status and contract for people who failed to repay the loan.
pd.crosstab(index=T1["NAME_CONTRACT_TYPE"],columns=T1["NAME_FAMILY_STATUS"],normalize="columns")
Out[110]:

Inferences:

  1. Of those who are non-Defaulters and are divorced, 90% repayed cash loan successfully, 10% repayed revolving loan.
  2. Of those who are non-Defaulters, 59% of the people are Married and were succussful in repaying the Cash loan.
  3. Of those who are defaulters and have divorced, 94% of them failed to repay Cash Loans and 6% failed to repay Revolving loans.

NAME_CONTRACT_TYPE vs OCCUPATION_TYPE(CATEGORICAL vs CATEGORICAL)

In [116]:
# Making a crosstable to view the frequency of people according to Family status and contract for people who successfuly repayed the loan.
pd.crosstab(index=T0["NAME_CONTRACT_TYPE"],columns=T0["OCCUPATION_TYPE"],normalize="columns")
Out[116]:
In [119]:
# Making a crosstable to view the frequency of people according to Family status and contract for people who successfuly repayed the loan.
pd.crosstab(index=T1["NAME_CONTRACT_TYPE"],columns=T1["OCCUPATION_TYPE"],normalize="columns")
Out[119]:

Inferences:

  1. Of all the non-defaulters,Reality agents 90% payed Cash loans and 10% repayed revolving loan
  2. Of all the defaulters,Reality agents 93% didn't pay Cash loans and 10% didn't repay revolving loan

NAME_INCOME_TYPE vs AMT_CREDIT(CATEGORICAL vs CONTINOUS)

In [190]:
# Applying groupby on NAME_INCOME_TYPE and AMT_CREDIT for T0 Dataframe.
T0.groupby("NAME_INCOME_TYPE")["AMT_CREDIT"].mean()
Out[190]:
NAME_INCOME_TYPE
Businessman             1.228500e+06
Commercial associate    6.749793e+05
Maternity leave         6.300000e+05
Pensioner               5.416641e+05
State servant           6.731780e+05
Student                 5.107875e+05
Unemployed              8.431071e+05
Working                 5.814100e+05
Name: AMT_CREDIT, dtype: float64
In [195]:
8.431071e+05-626625.000000
Out[195]:
216482.09999999998
In [120]:
# Applying groupby on NAME_INCOME_TYPE and AMT_CREDIT for T1 Dataframe.
T1.groupby("NAME_INCOME_TYPE")["AMT_CREDIT"].mean()
Out[120]:
NAME_INCOME_TYPE
Commercial associate    607287.880410
Maternity leave         929250.000000
Pensioner               558038.827968
State servant           614816.484788
Unemployed              626625.000000
Working                 535532.043878
Name: AMT_CREDIT, dtype: float64

Inferences:

  1. The mean of Credit taken by Commercial Associate is more for non defaulters than defaulters.
  2. The mean of Credit taken by maternity leave category is more for defaulters than non-defaulters
  3. The mean of Credit taken by Unemployeed non-defaulters is more than defaulters.

NAME_INCOME_TYPE vs AMT_CREDIT(CATEGORICAL vs CONTINOUS)

In [198]:
#Plotting box plot for NAME_INCOME_TYPE vs CREDIT.
plt.figure(figsize=[10,7])
sns.boxplot(data=T1 , x="NAME_INCOME_TYPE",y="AMT_CREDIT")
plt.show()
Notebook Image
In [199]:
plt.figure(figsize=[20,7])
sns.boxplot(data=T0 , x="NAME_INCOME_TYPE",y="AMT_CREDIT")
plt.show()
Notebook Image

Inferences:

  1. For non-Defaulters, the median of Buissmen's AMT_CREDIT is higher than the rest of the income type.
  2. for Defaulters,the median of maternity leave category's AMT_CREDIT is higher.

NAME_CONTRACT_TYPE vs AMT_CREDIT(CATEGORICAL vs CONTINOUS)

In [129]:
#Using groupby() on NAME_CONTRACT_TYPE and AMT_CREDIT for T0 dataframe.
T0.groupby("NAME_CONTRACT_TYPE")["AMT_CREDIT"].mean()
Out[129]:
NAME_CONTRACT_TYPE
Cash loans         632461.030097
Revolving loans    327939.024390
Name: AMT_CREDIT, dtype: float64
In [128]:
#Using groupby() on NAME_CONTRACT_TYPE and AMT_CREDIT for T1 dataframe.
T1.groupby("NAME_CONTRACT_TYPE")["AMT_CREDIT"].mean()
Out[128]:
NAME_CONTRACT_TYPE
Cash loans         578598.766181
Revolving loans    256365.336658
Name: AMT_CREDIT, dtype: float64
In [131]:
#Plotting Contract vs Credit bar graph.
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
T0.groupby("NAME_CONTRACT_TYPE")["AMT_CREDIT"].mean().plot.bar()
plt.title("Contract vs Credit of people who repayed the loan")
plt.subplot(1,2,2)
T1.groupby("NAME_CONTRACT_TYPE")["AMT_CREDIT"].mean().plot.bar()
plt.title("Contract vs Credit of people who didn't repay the loan")
plt.show()
Notebook Image

Inference:

  1. There isn't much difference in the Contract vs Credit for both the dataframes

NAME_CONTRACT_TYPE vs AMT_INCOME(CATEGORICAL vs CONTINOUS)

In [201]:
#Using groupby() on NAME_CONTRACT_TYPE and AMT_TOTAL_INCOME for T0 dataframe.
T0.groupby("NAME_CONTRACT_TYPE")["AMT_INCOME_TOTAL"].mean()
Out[201]:
NAME_CONTRACT_TYPE
Cash loans         169225.742548
Revolving loans    167713.790911
Name: AMT_INCOME_TOTAL, dtype: float64
In [134]:
#Using groupby() on NAME_CONTRACT_TYPE and AMT_INCOME_TOTAL for T1 dataframe.
T1.groupby("NAME_CONTRACT_TYPE")["AMT_INCOME_TOTAL"].mean()
Out[134]:
NAME_CONTRACT_TYPE
Cash loans         167353.820593
Revolving loans    140392.080112
Name: AMT_INCOME_TOTAL, dtype: float64
In [135]:
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
T0.groupby("NAME_CONTRACT_TYPE")["AMT_INCOME_TOTAL"].mean().plot.bar()
plt.title("Contract vs INCOME of people who repayed the loan")
plt.subplot(1,2,2)
T1.groupby("NAME_CONTRACT_TYPE")["AMT_INCOME_TOTAL"].mean().plot.bar()
plt.title("Contract vs Credit of people who didn't repay the loan")
plt.show()
Notebook Image

Inferences:

  1. People who have defaulted and took Revolving loans have lesser income than the people who didn't default and have Revoving loans

Checking Correlation Coefficient of Columns with Target Variable in T1 and T2 Dataframe

We will be checking correlation coefficient of Target variabe with only those columns which we feel are necessary and important for our analysis.

In [146]:
#Creating a teporary dataframe from T0 with favourable columns
temp1=T0[["SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER","CNT_CHILDREN","AMT_INCOME_TOTAL","AMT_ANNUITY","AMT_CREDIT","NAME_INCOME_TYPE","NAME_EDUCATION_TYPE","NAME_FAMILY_STATUS","DAYS_BIRTH","DAYS_REGISTRATION","OCCUPATION_TYPE","CNT_FAM_MEMBERS","REGION_RATING_CLIENT","ORGANIZATION_TYPE","OBS_30_CNT_SOCIAL_CIRCLE","DEF_30_CNT_SOCIAL_CIRCLE","OBS_60_CNT_SOCIAL_CIRCLE","DEF_60_CNT_SOCIAL_CIRCLE"]]
In [148]:
#Creating a correlation heatmap for temp1.
plt.figure(figsize=[20,20])
sns.heatmap(temp1.corr(),annot=True,cmap="RdYlGn")
plt.show()