!pip install jovian -q
import jovian
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Import the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# Read the csv file using 'read_csv'
df_src = pd.read_csv("application_data.csv")
df_src.head()
Inspect the structure - Describe and Shape
#df.describe() # summary statistics (mean, sum, etc.) of a few columns such as the ID fields have no practical significance, so describe() is skipped for now; most other columns are still meaningful
df_src.shape # rows and columns of original dataframe
(307511, 122)
df = df_src.copy() # keep a safe copy of the original dataframe
average_income=round(df['AMT_INCOME_TOTAL'].mean(),2)
average_income
168797.92
percent_payment_diff=round((df['TARGET']==1).sum()/len(df['TARGET'])*100,2)
percent_payment_diff
8.07
So the initial analysis, even before cleaning the dataset, indicates there are around 307,511 loan applications received, and the average income of the loan applicants is Rs. 168,797, i.e. about Rs. 1.68L. For the TARGET column, 1 indicates the client had payment difficulties in the past and 0 all other cases; here we find that 8.07% of the applicants had a past history of payment difficulty.
Here we also get a hint that most late payments may not be due to monetary/economic issues, but to other reasons (a quick check follows below).
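A quick, hedged check of this claim using only columns already in the dataframe (a sketch, not part of the original flow): compare the average income across the two TARGET groups.
# Sketch: average income of applicants with payment difficulties (TARGET=1) vs all others (TARGET=0)
df.groupby('TARGET')['AMT_INCOME_TOTAL'].mean().round(2)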
#Column-wise count and percentage of missing values
#This helps us in removing unnecessary columns.
total = df.isnull().sum().sort_values(ascending=False)
perc = ((df.isnull().sum()/df.isnull().count())*100).sort_values(ascending=False)
pd.concat([total,round(perc,2)],axis=1,keys=['Total Null Values','Percentage of Null Values'])
#table of null entries in each column
#Converting the Pandas series to Dataframe
perc2 = pd.DataFrame(perc).transpose()
#Listing all the column names of the DF.
list1 = list(perc2.columns.values)
#For initial drop of columns, we can keep a percentage limit.
#Here many columns have a huge percentage of missing values, so 47% is kept as the threshold.
list_to_drop = []
for i in list1:
    if perc[i] > 47:
        list_to_drop.append(i)
list_to_drop
['COMMONAREA_MEDI',
'COMMONAREA_AVG',
'COMMONAREA_MODE',
'NONLIVINGAPARTMENTS_MODE',
'NONLIVINGAPARTMENTS_MEDI',
'NONLIVINGAPARTMENTS_AVG',
'FONDKAPREMONT_MODE',
'LIVINGAPARTMENTS_MEDI',
'LIVINGAPARTMENTS_MODE',
'LIVINGAPARTMENTS_AVG',
'FLOORSMIN_MEDI',
'FLOORSMIN_MODE',
'FLOORSMIN_AVG',
'YEARS_BUILD_MEDI',
'YEARS_BUILD_AVG',
'YEARS_BUILD_MODE',
'OWN_CAR_AGE',
'LANDAREA_MODE',
'LANDAREA_AVG',
'LANDAREA_MEDI',
'BASEMENTAREA_MEDI',
'BASEMENTAREA_AVG',
'BASEMENTAREA_MODE',
'EXT_SOURCE_1',
'NONLIVINGAREA_MEDI',
'NONLIVINGAREA_AVG',
'NONLIVINGAREA_MODE',
'ELEVATORS_MODE',
'ELEVATORS_AVG',
'ELEVATORS_MEDI',
'WALLSMATERIAL_MODE',
'APARTMENTS_MODE',
'APARTMENTS_AVG',
'APARTMENTS_MEDI',
'ENTRANCES_MEDI',
'ENTRANCES_MODE',
'ENTRANCES_AVG',
'LIVINGAREA_MEDI',
'LIVINGAREA_MODE',
'LIVINGAREA_AVG',
'HOUSETYPE_MODE',
'FLOORSMAX_MODE',
'FLOORSMAX_MEDI',
'FLOORSMAX_AVG',
'YEARS_BEGINEXPLUATATION_MEDI',
'YEARS_BEGINEXPLUATATION_AVG',
'YEARS_BEGINEXPLUATATION_MODE',
'TOTALAREA_MODE',
'EMERGENCYSTATE_MODE']
len(list_to_drop) #Hence 49 columns to drop since the percentage values are high.
49
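As an aside, the same threshold-based list can be built without the explicit loop by boolean indexing on the perc Series computed above (a minimal, equivalent sketch):
# Sketch: same 47% threshold expressed with boolean indexing instead of a loop
list_to_drop_alt = perc[perc > 47].index.tolist() # illustrative alternative, not used below
len(list_to_drop_alt) # should again give 49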
df = df.drop(list_to_drop,axis=1)
df.head()
#Df shape after initial data cleaning.
df.shape
(307511, 73)
# Get the column-wise null count using 'isnull()' along with the 'sum()' function
df.isnull().sum().sort_values(ascending=False)
OCCUPATION_TYPE 96391
EXT_SOURCE_3 60965
AMT_REQ_CREDIT_BUREAU_YEAR 41519
AMT_REQ_CREDIT_BUREAU_MON 41519
AMT_REQ_CREDIT_BUREAU_WEEK 41519
AMT_REQ_CREDIT_BUREAU_DAY 41519
AMT_REQ_CREDIT_BUREAU_HOUR 41519
AMT_REQ_CREDIT_BUREAU_QRT 41519
NAME_TYPE_SUITE 1292
OBS_30_CNT_SOCIAL_CIRCLE 1021
DEF_30_CNT_SOCIAL_CIRCLE 1021
OBS_60_CNT_SOCIAL_CIRCLE 1021
DEF_60_CNT_SOCIAL_CIRCLE 1021
EXT_SOURCE_2 660
AMT_GOODS_PRICE 278
AMT_ANNUITY 12
CNT_FAM_MEMBERS 2
DAYS_LAST_PHONE_CHANGE 1
NAME_INCOME_TYPE 0
FLAG_OWN_REALTY 0
TARGET 0
FLAG_EMAIL 0
FLAG_PHONE 0
FLAG_CONT_MOBILE 0
NAME_CONTRACT_TYPE 0
FLAG_WORK_PHONE 0
FLAG_EMP_PHONE 0
FLAG_MOBIL 0
CODE_GENDER 0
FLAG_OWN_CAR 0
CNT_CHILDREN 0
NAME_EDUCATION_TYPE 0
AMT_INCOME_TOTAL 0
AMT_CREDIT 0
DAYS_ID_PUBLISH 0
DAYS_REGISTRATION 0
DAYS_EMPLOYED 0
DAYS_BIRTH 0
REGION_RATING_CLIENT 0
NAME_HOUSING_TYPE 0
NAME_FAMILY_STATUS 0
REGION_POPULATION_RELATIVE 0
REG_CITY_NOT_LIVE_CITY 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
FLAG_DOCUMENT_21 0
FLAG_DOCUMENT_20 0
FLAG_DOCUMENT_19 0
FLAG_DOCUMENT_18 0
FLAG_DOCUMENT_17 0
FLAG_DOCUMENT_16 0
FLAG_DOCUMENT_15 0
FLAG_DOCUMENT_14 0
FLAG_DOCUMENT_13 0
FLAG_DOCUMENT_12 0
FLAG_DOCUMENT_11 0
FLAG_DOCUMENT_10 0
FLAG_DOCUMENT_9 0
FLAG_DOCUMENT_8 0
FLAG_DOCUMENT_7 0
FLAG_DOCUMENT_6 0
FLAG_DOCUMENT_5 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_2 0
ORGANIZATION_TYPE 0
LIVE_CITY_NOT_WORK_CITY 0
REG_CITY_NOT_WORK_CITY 0
LIVE_REGION_NOT_WORK_REGION 0
REG_REGION_NOT_WORK_REGION 0
REG_REGION_NOT_LIVE_REGION 0
HOUR_APPR_PROCESS_START 0
SK_ID_CURR 0
dtype: int64
# The rows for which the sum of Null is less than or equal to five are retained
df = df[df.isnull().sum(axis=1) <= 5]
df.isnull().sum().sort_values(ascending=False)
OCCUPATION_TYPE 83144
EXT_SOURCE_3 19420
NAME_TYPE_SUITE 1183
DEF_60_CNT_SOCIAL_CIRCLE 824
OBS_30_CNT_SOCIAL_CIRCLE 824
DEF_30_CNT_SOCIAL_CIRCLE 824
OBS_60_CNT_SOCIAL_CIRCLE 824
EXT_SOURCE_2 525
AMT_GOODS_PRICE 232
AMT_ANNUITY 11
CNT_FAM_MEMBERS 1
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
FLAG_PHONE 0
AMT_REQ_CREDIT_BUREAU_YEAR 0
DAYS_ID_PUBLISH 0
FLAG_EMAIL 0
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
FLAG_MOBIL 0
DAYS_EMPLOYED 0
DAYS_REGISTRATION 0
AMT_INCOME_TOTAL 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_CREDIT 0
HOUR_APPR_PROCESS_START 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
REGION_POPULATION_RELATIVE 0
DAYS_BIRTH 0
WEEKDAY_APPR_PROCESS_START 0
REG_CITY_NOT_LIVE_CITY 0
REG_REGION_NOT_LIVE_REGION 0
FLAG_DOCUMENT_11 0
AMT_REQ_CREDIT_BUREAU_MON 0
AMT_REQ_CREDIT_BUREAU_WEEK 0
AMT_REQ_CREDIT_BUREAU_DAY 0
AMT_REQ_CREDIT_BUREAU_HOUR 0
FLAG_DOCUMENT_21 0
FLAG_DOCUMENT_20 0
FLAG_DOCUMENT_19 0
FLAG_DOCUMENT_18 0
FLAG_DOCUMENT_17 0
FLAG_DOCUMENT_16 0
FLAG_DOCUMENT_15 0
FLAG_DOCUMENT_14 0
FLAG_DOCUMENT_13 0
FLAG_DOCUMENT_12 0
FLAG_DOCUMENT_10 0
REG_REGION_NOT_WORK_REGION 0
FLAG_DOCUMENT_9 0
FLAG_DOCUMENT_8 0
FLAG_DOCUMENT_7 0
FLAG_DOCUMENT_6 0
FLAG_DOCUMENT_5 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_2 0
DAYS_LAST_PHONE_CHANGE 0
ORGANIZATION_TYPE 0
LIVE_CITY_NOT_WORK_CITY 0
REG_CITY_NOT_WORK_CITY 0
AMT_REQ_CREDIT_BUREAU_QRT 0
LIVE_REGION_NOT_WORK_REGION 0
SK_ID_CURR 0
dtype: int64
#Second set of columns to be dropped.
list_to_drop2 =['NAME_TYPE_SUITE','DEF_60_CNT_SOCIAL_CIRCLE','OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE']
df = df.drop(list_to_drop2,axis=1)
Decisions made while dropping columns based on the null values:
1. The normalised credit scores from external data sources 2 and 3 are retained, even though they have considerable null values, as they are important for calculating the average credit score of the customers.
2. The columns dropped above, such as who accompanied the customer and the counts of observations/defaults in the client's social circle within 30/60 days, are not very useful for estimating whether a person is likely to default.
3. Thus, irrelevant columns are dropped.
df = df.loc[df['CNT_FAM_MEMBERS'].isnull() !=True]
df = df.loc[df['AMT_ANNUITY'].isnull() !=True]
df = df.loc[df['AMT_GOODS_PRICE'].isnull() !=True]
4. Columns like count of family members, loan annuity and price of goods play a key role in determining whether the approved loan matches the requirement. The annuity amount also indicates whether the person can keep paying the annuity to repay the loan, given his/her total income and expenses (such as those on family, for which the number of family members is essential to know). Hence these columns are kept, and only the few rows with missing values in them are dropped above, as the single-call sketch below also shows.
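The three row filters above can also be written as a single dropna call on just these columns (a sketch; it has no further effect here since the rows are already dropped):
# Sketch: equivalent single-call version of the three row filters above
df = df.dropna(subset=['CNT_FAM_MEMBERS', 'AMT_ANNUITY', 'AMT_GOODS_PRICE'])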
df.isnull().sum().sort_values(ascending=False)
OCCUPATION_TYPE 83097
EXT_SOURCE_3 19398
EXT_SOURCE_2 525
FLAG_PHONE 0
DAYS_ID_PUBLISH 0
FLAG_MOBIL 0
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
AMT_REQ_CREDIT_BUREAU_YEAR 0
FLAG_EMAIL 0
DAYS_EMPLOYED 0
CNT_FAM_MEMBERS 0
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
DAYS_REGISTRATION 0
DAYS_BIRTH 0
REG_REGION_NOT_LIVE_REGION 0
AMT_INCOME_TOTAL 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_CREDIT 0
REGION_POPULATION_RELATIVE 0
AMT_ANNUITY 0
AMT_GOODS_PRICE 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
HOUR_APPR_PROCESS_START 0
REG_REGION_NOT_WORK_REGION 0
AMT_REQ_CREDIT_BUREAU_QRT 0
LIVE_REGION_NOT_WORK_REGION 0
AMT_REQ_CREDIT_BUREAU_MON 0
AMT_REQ_CREDIT_BUREAU_WEEK 0
AMT_REQ_CREDIT_BUREAU_DAY 0
AMT_REQ_CREDIT_BUREAU_HOUR 0
FLAG_DOCUMENT_21 0
FLAG_DOCUMENT_20 0
FLAG_DOCUMENT_19 0
FLAG_DOCUMENT_18 0
FLAG_DOCUMENT_17 0
FLAG_DOCUMENT_16 0
FLAG_DOCUMENT_15 0
FLAG_DOCUMENT_14 0
FLAG_DOCUMENT_13 0
FLAG_DOCUMENT_12 0
FLAG_DOCUMENT_11 0
FLAG_DOCUMENT_10 0
FLAG_DOCUMENT_9 0
FLAG_DOCUMENT_8 0
FLAG_DOCUMENT_7 0
FLAG_DOCUMENT_6 0
FLAG_DOCUMENT_5 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_2 0
DAYS_LAST_PHONE_CHANGE 0
ORGANIZATION_TYPE 0
LIVE_CITY_NOT_WORK_CITY 0
REG_CITY_NOT_WORK_CITY 0
REG_CITY_NOT_LIVE_CITY 0
SK_ID_CURR 0
dtype: int64
df['OCCUPATION_TYPE'].describe()
count 182625
unique 18
top Laborers
freq 47352
Name: OCCUPATION_TYPE, dtype: object
5. The occupation type column cannot be removed, because the loan is approved only after proper verification of the applicant, i.e. whether his/her total income is genuine and he/she is capable of repaying the loan in the given time. This also helps to verify any fraudulent or money-laundering cases the customer might be attached to.
Hence, for ease of understanding, the missing occupations are replaced with 'Unknown'.
#Here we find the most frequent occupation type of applicants is Laborers, but it would not be feasible
#to generalise and fill all the NaN values with this most frequent occupation.
#Hence it is better to fill those values with a new value 'Unknown'.
#Also, it won't be a good option to drop this column, even though it has a high number of missing values.
df.loc[pd.isnull(df['OCCUPATION_TYPE']), ['OCCUPATION_TYPE']] = 'Unknown'
Dataset Ready!!!!!
#Final size of the dataset df_final
df_final=df
df_final.shape
(265722, 68)
df_final.head()
#No of retained rows in percentage
print(str(round((df_final.shape[0]/307511)*100,2))+'%')
86.41%
So we have retained around 86.4% of the original dataset after our data cleaning drive, which is good enough to proceed with our EDA.
pay_diff_num=df_final[df_final['TARGET']==1]['TARGET'].value_counts()
pay_all_num=df_final[df_final['TARGET']==0]['TARGET'].value_counts()
percent_late_pay=np.array((pay_diff_num/len(df_final))*100) # percent of people who pay late
percent_timely_pay=np.array((pay_all_num/len(df_final))*100) #percent of all others who pay on time
stat_summ=pd.DataFrame({'Percentage':[percent_late_pay[0],percent_timely_pay[0]]},index=['late','early'])
sns.barplot(x=stat_summ.index,y=stat_summ['Percentage'],palette='RdYlGn')
plt.title('Percent of customers who pay late vs early')
plt.show()
stat_summ # summary of percentage of people who pay late vs early
temp = df_final["TARGET"].value_counts()
df_imbl = pd.DataFrame({'Values': temp.values},index=['Early Payments','Late Payments'])
plt.figure(figsize = (6,6))
plt.title('Data Imbalance\n (Client with Payment difficulties=1,All others=0)')
sns.barplot(x =df_imbl.index, y="Values", data=df_imbl)
locs, labels = plt.xticks()
plt.show()
df_imbl
round(df_imbl.Values[0] / df_imbl['Values'].sum()*100,2)
92.28
round(df_imbl.Values[1] / df_imbl['Values'].sum()*100,2)
7.72
Ratio of Data Imbalance
print(round((df_imbl.Values[0]/df_imbl.Values[1]),2))
11.95
We can see a data imbalance here with respect to the TARGET variable of the dataset, since the data points for TARGET=1, i.e. applicants with payment difficulties, make up only around 7.72% of all applicants.
This high imbalance of the target data can be addressed with random re-sampling of the collected data in order to avoid bias.
The ratio of roughly 12:1 can create bias, so it is important to redistribute the sample so that it behaves like uniformly distributed random data (a hedged re-sampling sketch follows below).
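As a purely illustrative sketch of the re-sampling idea (the EDA below continues on the full, imbalanced df_final): random undersampling of the majority class with pandas could look like this.
# Sketch: random undersampling of the majority class (TARGET=0) to match the minority class size
minority = df_final[df_final['TARGET'] == 1]
majority = df_final[df_final['TARGET'] == 0].sample(n=len(minority), random_state=42)
df_balanced = pd.concat([minority, majority]).sample(frac=1, random_state=42) # shuffle rows
df_balanced['TARGET'].value_counts()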
plt.figure(figsize=(6,6))
sns.boxplot(x='TARGET',y='AMT_CREDIT',data=df)
plt.yscale('log')
plt.show()
Here, from the above boxplot between TARGET and the credited loan amount, we can see that the credit amounts for clients with payment difficulties and for all other cases lie in a similar range, with almost the same median. Most importantly, we can notice outlier values in both TARGET cases; for example, a client with a very high credit amount who previously had a payment difficulty is easy to spot.
sns.distplot(df['AMT_CREDIT'])
plt.show()
The above density plot of the credited loan amount shows that the majority of the clients' loan amounts are below Rs. 10 lakhs.
temp = df_final['NAME_CONTRACT_TYPE'].value_counts()
df2 = pd.DataFrame({'NAME_CONTRACT_TYPE': temp.index,'Values': temp.values})
plt.figure(figsize = (6,6))
sns.barplot(x = 'NAME_CONTRACT_TYPE', y= 'Values', data=df2)
locs, labels = plt.xticks()
plt.show()
round(df2.Values[0] / df2['Values'].sum(),2)*100
91.0
So, around 91% of the client loans are cash loans; the remaining ~9% are revolving loans.
Based on the analysis done above, we find that the number of people who don't have monetary problems is higher than the number who do.
But, unfortunately, the number of cash loans given out to customers is far higher than revolving loans. Revolving loans have benefits for both customers and banks.
For Customers:
Customers who pay their loans on time and are capable do not need to apply for a loan separately each time. This makes it easy for customers to avail loans without hassle, thus increasing the chances of customer retention.
For Banks:
Banks can generate more revenue and charge more interest on successive loans from the same customer, and can convince the customer to borrow from them again without filing a fresh loan application anywhere else.
This also improves the credit scores of customers and helps banks weed out defaulters with low credit scores.
Also, when compared with previous applications, clients have discontinued applying for consumer loans; now most of them deal in cash loans and hence apply for more cash loans.
#Converting the TARGET column to a categorical column with 'New_TARGET'
df_final['New_TARGET'] = df_final['TARGET'].apply(lambda x: 'Payment Difficulty' if x==1 else 'All Others')
#To find out the correlation between the income of the Clients and the Credit amount of the Loan.
curr = df_final[["AMT_INCOME_TOTAL", "AMT_CREDIT"]]
cor = curr.corr()
round(cor, 3)
# heatmap
sns.heatmap(cor,annot=True)
plt.show()
The above heatmap shows a very low correlation between the income of the applicants and the loan credit amount, which implies that even low-income applicants may have opted for large loan amounts, while high-income applicants may have opted for small loan amounts.
sns.scatterplot(x=df_final['AMT_INCOME_TOTAL'],y=df_final['AMT_CREDIT'])
plt.yscale('log')
plt.xscale('log')
plt.show()
This scatterplot gives an important insight: the amount of credit given to people with lower total income is quite high, which may lead to defaulting. Also, above a total income of about 1,000,000 the credit amount decreases (a quick numeric check follows below). This can be due to reasons like:
1. High-income persons being considered likely to default.
2. The banks may not have sufficient funds to credit such customers.
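A quick numeric sanity check of this observation (a sketch, assuming df_final as built above): bin the total income and look at the average credit per bin.
# Sketch: average credit amount per income band; the bin edges are illustrative choices
income_bins = pd.cut(df_final['AMT_INCOME_TOTAL'],
                     bins=[0, 100000, 250000, 500000, 1000000, df_final['AMT_INCOME_TOTAL'].max()])
df_final.groupby(income_bins)['AMT_CREDIT'].mean().round(0)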
age_days_paydiff=np.array(abs(df_final[df_final['TARGET']==1]['DAYS_BIRTH']).sort_values(ascending=True))
avg_age_paydiff=round(np.mean(age_days_paydiff)/365,2) #average age (in years) of the people who have payment difficulties
age_days_others=np.array(abs(df_final[df_final['TARGET']==0]['DAYS_BIRTH']).sort_values(ascending=True))
avg_age_others=round(np.mean(age_days_others)/365,2) #average age (in years) of the people who pay installments on time
print(avg_age_paydiff)#Average Age of customers with payment difficulty
print(avg_age_others)#Average Age of customers without payment difficulty
41.02
44.42
The average age of customers who have difficulty in paying is about 41 years, whereas those paying on time are on average about 44 years old.
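The same averages can be computed more compactly with a single groupby (a sketch; DAYS_BIRTH is stored as days relative to the application date, hence the abs()):
# Sketch: average age in years per TARGET group (1 = payment difficulties, 0 = all others)
(df_final['DAYS_BIRTH'].abs() / 365).groupby(df_final['TARGET']).mean().round(2)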
Analysis of the Customer Base based on Occupation Type
As seen from the analysis above, Laborers form the largest customer group. Let's analyse which occupation types apply for loans the most and how they pay.
df_final['OCCUPATION_TYPE'].unique()
array(['Laborers', 'Core staff', 'Accountants', 'Managers', 'Unknown',
'Drivers', 'Sales staff', 'Cleaning staff', 'Cooking staff',
'Private service staff', 'Medicine staff', 'Security staff',
'Waiters/barmen staff', 'Low-skill Laborers', 'Realty agents',
'Secretaries', 'High skill tech staff', 'IT staff', 'HR staff'],
dtype=object)
plt.figure(figsize=(10,10))
occu_df_late=df_final[df_final['TARGET']==1].groupby(df_final['OCCUPATION_TYPE']).count()
ax=sns.barplot(y=occu_df_late.index,x=occu_df_late['TARGET'])
ax.set(xlabel='Number of Clients who pay late',ylabel='Different Occupation Types of Clients')
plt.title("Number of Late Payers Based on Client's Occupation Types")
plt.show()
plt.figure(figsize=(10,10))
occu_df_early=df_final[df_final['TARGET']==0].groupby(df_final['OCCUPATION_TYPE']).count()
ax=sns.barplot(y=occu_df_early.index,x=occu_df_early['TARGET'])
ax.set(xlabel='Number of Clients who pay on time',ylabel='Different Occupation Types of Clients')
plt.title("Number of Early Payers Based on Client's Occupation Types")
plt.show()
plt.figure(num=None, figsize=(12, 12), dpi=80, facecolor='w', edgecolor='k')
sns.boxplot(y='New_TARGET', x='AMT_CREDIT',hue="OCCUPATION_TYPE", data=df_final)
plt.show()
plt.figure(figsize=(10,10))
salary_df=df_final.groupby(df_final['OCCUPATION_TYPE']).sum()
ax=sns.barplot(y=salary_df.index,x=salary_df['AMT_INCOME_TOTAL'])
ax.set(xlabel='Income of Clients',ylabel='Different Occupation Types of Clients')
plt.title("Total Income Based on Client's Occupation Types")
plt.show()
As we can see here, the largest customer base of the bank, Laborers, also accounts for the highest number of people who have difficulty in payment.
HR staff and IT staff have the lowest numbers of late payers and hence are most likely to repay their loans on time.
Anomalies:
1. Laborers have the highest total income (in aggregate) yet are also the most numerous among those who pay late due to payment difficulties. This suggests the data collected is skewed/biased rather than a random sample.
2. Laborers are the most numerous both among late payers and among on-time payers, which again indicates the data is biased, with more laborers sampled into the dataset (see the rate-based sketch below).
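One way to control for this group-size effect (a hedged sketch, not part of the original analysis) is to look at the share of late payers within each occupation rather than raw counts:
# Sketch: percentage of clients with payment difficulties within each occupation type
late_rate = (df_final.groupby('OCCUPATION_TYPE')['TARGET'].mean() * 100).sort_values(ascending=False)
late_rate.round(2)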
income_type_df=df_final.groupby('NAME_INCOME_TYPE').sum()
ax=sns.barplot(y=income_type_df.index,x=income_type_df['TARGET'])
ax.set(xlabel='Number of Clients who pay late', ylabel='Clients Income Type')
plt.title("Number of Late Payments Based on Customer's Income Types")
plt.show()
Based on the above graph, it is seen that the working class of customers finds it hard to pay off their debts on time. On the other hand, it is also seen that businessmen do not pay late.
A suspicious insight, however, is that unemployed customers and students apparently never delay in paying their debts (a quick check of group sizes follows below).
Customers on maternity leave can still receive their maternity benefits along with their regular salary.
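A hedged way to probe this (a sketch): look at the group size and late-payment rate per income type; very small groups such as students or the unemployed can easily show zero late payers by chance.
# Sketch: number of clients and late-payment rate per income type
df_final.groupby('NAME_INCOME_TYPE')['TARGET'].agg(['count', 'mean']).round(3)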
income_type_df=df_final.groupby('NAME_EDUCATION_TYPE').sum()
ax=sns.barplot(y=income_type_df.index,x=income_type_df['TARGET'])
ax.set(xlabel='Number of Clients with late payments',ylabel='Highest Education Qualification of Clients')
plt.title("Late Payments Based on Customer's Level Of Highest Education Achieved")
plt.show()
plt.figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
ax=sns.boxplot(x='New_TARGET', y='AMT_CREDIT',hue="NAME_EDUCATION_TYPE", data=df_final)
ax.set(xlabel='Target Customers',ylabel='Amount of Money Credited')
plt.show()
income_type_df=df_final.groupby('NAME_EDUCATION_TYPE').sum()
ax=sns.barplot(y=income_type_df.index,x=income_type_df['AMT_INCOME_TOTAL'])
ax.set(xlabel='Total Income of Clients',ylabel='Highest Education Qualification of Clients')
plt.title("Total Income of Customers Based on Level Of Highest Education Achieved")
plt.show()
Anomalies:
1. One strange thing about the above analysis: how can the people with the lowest education level, i.e. lower secondary, have the fewest late payments, even though their total income is very low?
2. From the box plot above we see that the average credit given to clients with an academic degree is the highest, both for those who pay late and those who pay on time, yet their total income barely shows up in the income plot above. Hence we need to investigate the authenticity of this group, as the maximum credit is given to them (see the sketch below).
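To look a little closer at this group, a small sketch (assuming df_final) showing the group size and average total income per education level:
# Sketch: number of clients and average total income per education level
df_final.groupby('NAME_EDUCATION_TYPE')['AMT_INCOME_TOTAL'].agg(['count', 'mean']).round(2)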
z=df_final.groupby(df_final['NAME_INCOME_TYPE']).mean()
sns.barplot(y=z.index,x=z['AMT_INCOME_TOTAL'])
plt.title("Total Income for Customers")
plt.show()
low_df=df_final[df_final['NAME_EDUCATION_TYPE']=='Lower secondary']
ab=low_df.groupby(low_df['NAME_INCOME_TYPE']).count()
sns.barplot(y=ab.index,x=ab['AMT_INCOME_TOTAL'])
plt.title("Count for Customers with Lower secondary Education based on Income Type")
plt.show()
Anomalies:
Thus, we see that customers with lower secondary education are mostly from the working class, but still do not seem to face much difficulty in paying their debts.
This can be due to two reasons:
1. They have a hidden source of income which is not captured in the dataset.
2. The data may be corrupt, or wrong data was provided.
Analysis Based on Credit Score
df_final['avg_norm_credit_score']=(df_final['EXT_SOURCE_2']+df_final['EXT_SOURCE_3'])/2
df_final['avg_norm_credit_score'].head(5) # creating a new column for average credit score
0 0.201162
1 NaN
2 0.642739
4 NaN
5 0.487726
Name: avg_norm_credit_score, dtype: float64
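Note that the (a+b)/2 above yields NaN whenever either external score is missing. If the intent were to average over whichever scores are available, a row-wise mean that skips NaNs (a sketch; the column name below is made up for illustration) would do that instead:
# Sketch: row-wise mean of the two external scores, ignoring missing values per row
# 'avg_norm_credit_score_skipna' is an illustrative, made-up column name
df_final['avg_norm_credit_score_skipna'] = df_final[['EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
df_final['avg_norm_credit_score_skipna'].head(5)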
df_final_score=df_final.groupby(df_final['TARGET']).mean()
sns.barplot(x=df_final_score.index,y=df_final_score['avg_norm_credit_score'])
plt.title('Normalised credit score for Target Customers\n payment difficulty Customers(1) Vs All Others(0)')
plt.show()
diff=df_final_score['avg_norm_credit_score'].max() - df_final_score['avg_norm_credit_score'].min()
credit_diff_perc=diff/df_final_score['avg_norm_credit_score'].max() *100
round(credit_diff_perc,2) # percentage difference between avg credit score between Targets
22.83
Thus, we see that the average normalised credit score of customers who pay late due to payment difficulties is about 22.83% lower than that of customers who pay on time.
sns.distplot(df_final['AMT_INCOME_TOTAL'])
plt.yscale('log')
As we see above, most of the people in the dataset have a low income, yet the number of people who pay on time is much higher than those who pay late with payment difficulties.
Conclusions:
Income alone is not the major reason why some people have payment difficulties, as we can see that most of the people in the dataset have low incomes.
There should be separate fields capturing expenses, which could highlight why some people have payment difficulties.
jovian.commit()
[jovian] Saving notebook..