Jovian
⭐️
Sign In
In [483]:
import jovian
In [432]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [433]:
#read data
df = pd.read_csv('Leads.csv')
df.head()
Out[433]:
In [434]:
df.shape #rows and columns of dataframe
Out[434]:
(9240, 37)
In [435]:
#percentage of null values in each column
null_val = []
for x in df.columns: 
    null_val.append(round(df[x].isna().sum()/len(df[x]) *100,2))
null_df = pd.DataFrame([df.columns,null_val]).T   
null_df.columns = ['Columns','Missing values']
null_df['Missing values'].head()
    
Out[435]:
0       0
1       0
2       0
3    0.39
4       0
Name: Missing values, dtype: object
In [436]:
null_df
Out[436]:

Columns like Lead Quality,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,Tags,etc . have the highest Null Values. Let's analyse the columns to decide on null values handling.

In [437]:
from collections import Counter      #Assessing the number of entries in lead-quality
Counter(df['Lead Quality'])
Out[437]:
Counter({'Low in Relevance': 583,
         nan: 4767,
         'Might be': 1560,
         'Not Sure': 1092,
         'Worst': 601,
         'High in Relevance': 637})
In [438]:
df['Lead Quality'].fillna(df['Lead Quality'].mode()[0],inplace=True) #imputing with mode 
In [439]:
Counter(df['Lead Quality'])
Out[439]:
Counter({'Low in Relevance': 583,
         'Might be': 6327,
         'Not Sure': 1092,
         'Worst': 601,
         'High in Relevance': 637})
In [440]:
df['Asymmetrique Activity Index'].fillna(df['Asymmetrique Activity Index'].mode()[0],inplace=True)    #imputing the Activity Index with mode
In [441]:
Counter(df['Asymmetrique Activity Index'])#count of the unique values after imputing
Out[441]:
Counter({'02.Medium': 8057, '01.High': 821, '03.Low': 362})
In [442]:
df['Asymmetrique Activity Index'].replace(['02.Medium','01.High','03.Low'],[2,1,3],inplace=True) #replacing 02.medium,03.low and 01.High as 2,3,1
In [443]:
Counter(df['Asymmetrique Profile Index'])
Out[443]:
Counter({'02.Medium': 2788, '01.High': 2203, '03.Low': 31, nan: 4218})
In [444]:
df['Asymmetrique Profile Index'].fillna(df['Asymmetrique Profile Index'].mode()[0],inplace=True)    #imputing the Activity Index with mode
In [445]:
df['Asymmetrique Profile Index'].replace(['02.Medium','01.High','03.Low'],[2,3,1],inplace=True) #replacing the index with ordinal numerical values
In [446]:
sns.distplot(df['Asymmetrique Activity Score'].dropna())  #original distribution of activity score without Null values
Out[446]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f42134b9208>
Notebook Image
In [447]:
sns.distplot(df['Asymmetrique Activity Score'].fillna(df['Asymmetrique Activity Score'].mean())) #distribution of activity score after Null values with Mean
Out[447]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f42149b2e10>
Notebook Image
In [448]:
#Therefore imputing the null values with Mean for Asymmetry Score
df['Asymmetrique Activity Score'].fillna(df['Asymmetrique Activity Score'].mean(),inplace=True)
In [449]:
sns.distplot(df['Asymmetrique Profile Score'].dropna())  #original distribution of profile score without Null values
Out[449]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f421483e780>
Notebook Image
In [450]:
sns.distplot(df['Asymmetrique Profile Score'].fillna(df['Asymmetrique Profile Score'].mean())) #distribution of Profile score after Null values with Mean
Out[450]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f421485d080>
Notebook Image
In [451]:
#Therefore imputing the null values with Mean for Asymmetry Score
df['Asymmetrique Profile Score'].fillna(df['Asymmetrique Profile Score'].mean(),inplace=True)
In [452]:
Counter(df['Tags'])      # Imputing Tags 
Out[452]:
Counter({'Interested in other courses': 513,
         'Ringing': 1203,
         'Will revert after reading the email': 2072,
         nan: 3353,
         'Lost to EINS': 175,
         'In confusion whether part time or DLP': 5,
         'Busy': 186,
         'switched off': 240,
         'in touch with EINS': 12,
         'Already a student': 465,
         'Diploma holder (Not Eligible)': 63,
         'Graduation in progress': 111,
         'Closed by Horizzon': 358,
         'number not provided': 27,
         'opp hangup': 33,
         'Not doing further education': 145,
         'invalid number': 83,
         'wrong number given': 47,
         'Interested  in full time MBA': 117,
         'Still Thinking': 6,
         'Lost to Others': 7,
         'Shall take in the next coming month': 2,
         'Lateral student': 3,
         'Interested in Next batch': 5,
         'Recognition issue (DEC approval)': 1,
         'Want to take admission but has financial problems': 6,
         'University not recognized': 2})
In [453]:
df['Tags'].fillna(df['Tags'].mode()[0],inplace=True) #imputing Tags with mode
In [454]:
Counter(df['Lead Profile'])
Out[454]:
Counter({'Select': 4146,
         'Potential Lead': 1613,
         nan: 2709,
         'Other Leads': 487,
         'Lateral Student': 24,
         'Dual Specialization Student': 20,
         'Student of SomeSchool': 241})
In [455]:
df['Lead Profile'].fillna(df['Lead Profile'].mode()[0],inplace=True) #imputing Lead Profile with mode('Select')
In [456]:
Counter(df['Lead Profile'])
Out[456]:
Counter({'Select': 6855,
         'Potential Lead': 1613,
         'Other Leads': 487,
         'Lateral Student': 24,
         'Dual Specialization Student': 20,
         'Student of SomeSchool': 241})
In [457]:
df['Lead Profile'].replace('Select','Potential Lead',inplace=True) # replacing the select label with the second most occuring parameter->Potential Lead

In [458]:
Counter(df['What matters most to you in choosing a course'])
Out[458]:
Counter({'Better Career Prospects': 6528,
         nan: 2709,
         'Flexibility & Convenience': 2,
         'Other': 1})
In [459]:
df['What matters most to you in choosing a course'].fillna(df['What matters most to you in choosing a course'].mode()[0],inplace=True) #imputing with mode
In [460]:
Counter(df['What is your current occupation'])
Out[460]:
Counter({'Unemployed': 5600,
         'Student': 210,
         nan: 2690,
         'Working Professional': 706,
         'Businessman': 8,
         'Other': 16,
         'Housewife': 10})
In [461]:
df['What is your current occupation'].fillna(df['What is your current occupation'].mode()[0],inplace=True) #imputing current occupation with mode
In [462]:
Counter(df['How did you hear about X Education'])
Out[462]:
Counter({'Select': 5043,
         'Word Of Mouth': 348,
         'Other': 186,
         nan: 2207,
         'Online Search': 808,
         'Multiple Sources': 152,
         'Advertisements': 70,
         'Student of SomeSchool': 310,
         'Email': 26,
         'Social Media': 67,
         'SMS': 23})
In [463]:
df['How did you hear about X Education'].fillna(df['How did you hear about X Education'].mode()[0],inplace=True) #imputing Lead Profile with mode('Select')
In [464]:
Counter(df['How did you hear about X Education'])
Out[464]:
Counter({'Select': 7250,
         'Word Of Mouth': 348,
         'Other': 186,
         'Online Search': 808,
         'Multiple Sources': 152,
         'Advertisements': 70,
         'Student of SomeSchool': 310,
         'Email': 26,
         'Social Media': 67,
         'SMS': 23})
In [465]:
df['How did you hear about X Education'].replace('Select','Online Search',inplace=True) #replacing select with second highest parameter Online Search
In [466]:
Counter(df['How did you hear about X Education'])
Out[466]:
Counter({'Online Search': 8058,
         'Word Of Mouth': 348,
         'Other': 186,
         'Multiple Sources': 152,
         'Advertisements': 70,
         'Student of SomeSchool': 310,
         'Email': 26,
         'Social Media': 67,
         'SMS': 23})
In [467]:
Counter(df['Country'])  
Out[467]:
Counter({nan: 2461,
         'India': 6492,
         'Russia': 1,
         'Kuwait': 4,
         'Oman': 6,
         'United Arab Emirates': 53,
         'United States': 69,
         'Australia': 13,
         'United Kingdom': 15,
         'Bahrain': 7,
         'Ghana': 2,
         'Singapore': 24,
         'Qatar': 10,
         'Saudi Arabia': 21,
         'Belgium': 2,
         'France': 6,
         'Sri Lanka': 1,
         'China': 2,
         'Canada': 4,
         'Netherlands': 2,
         'Sweden': 3,
         'Nigeria': 4,
         'Hong Kong': 7,
         'Germany': 4,
         'Asia/Pacific Region': 2,
         'Uganda': 2,
         'Kenya': 1,
         'Italy': 2,
         'South Africa': 4,
         'Tanzania': 1,
         'unknown': 5,
         'Malaysia': 1,
         'Liberia': 1,
         'Switzerland': 1,
         'Denmark': 1,
         'Philippines': 2,
         'Bangladesh': 2,
         'Vietnam': 1,
         'Indonesia': 1})
In [468]:
df['Country'].fillna(df['Country'].mode()[0],inplace=True) #imputing Country with mode('India')
In [469]:
Counter(df['Specialization'])
Out[469]:
Counter({'Select': 1942,
         'Business Administration': 403,
         'Media and Advertising': 203,
         nan: 1438,
         'Supply Chain Management': 349,
         'IT Projects Management': 366,
         'Finance Management': 976,
         'Travel and Tourism': 203,
         'Human Resource Management': 848,
         'Marketing Management': 838,
         'Banking, Investment And Insurance': 338,
         'International Business': 178,
         'E-COMMERCE': 112,
         'Operations Management': 503,
         'Retail Management': 100,
         'Services Excellence': 40,
         'Hospitality Management': 114,
         'Rural and Agribusiness': 73,
         'Healthcare Management': 159,
         'E-Business': 57})
In [470]:
df['Specialization'].fillna(df['Specialization'].mode()[0],inplace=True)
In [471]:
df['Specialization'].replace('Select','Finance Management',inplace=True)
In [472]:
Counter(df['Specialization'])
Out[472]:
Counter({'Finance Management': 4356,
         'Business Administration': 403,
         'Media and Advertising': 203,
         'Supply Chain Management': 349,
         'IT Projects Management': 366,
         'Travel and Tourism': 203,
         'Human Resource Management': 848,
         'Marketing Management': 838,
         'Banking, Investment And Insurance': 338,
         'International Business': 178,
         'E-COMMERCE': 112,
         'Operations Management': 503,
         'Retail Management': 100,
         'Services Excellence': 40,
         'Hospitality Management': 114,
         'Rural and Agribusiness': 73,
         'Healthcare Management': 159,
         'E-Business': 57})
In [473]:
Counter(df['City'])
Out[473]:
Counter({'Select': 2249,
         'Mumbai': 3222,
         nan: 1420,
         'Thane & Outskirts': 752,
         'Other Metro Cities': 380,
         'Other Cities': 686,
         'Other Cities of Maharashtra': 457,
         'Tier II Cities': 74})
In [474]:
df['City'].fillna(df['City'].mode()[0],inplace=True) # imputing null values with mode for City
In [475]:
Counter(df['City'])
Out[475]:
Counter({'Select': 2249,
         'Mumbai': 4642,
         'Thane & Outskirts': 752,
         'Other Metro Cities': 380,
         'Other Cities': 686,
         'Other Cities of Maharashtra': 457,
         'Tier II Cities': 74})
In [476]:
df['City'].replace('Select','Mumbai',inplace=True) # replacing the select labels with the highest city entry -> Mumbai
In [477]:
Counter(df['City'])
Out[477]:
Counter({'Mumbai': 6891,
         'Thane & Outskirts': 752,
         'Other Metro Cities': 380,
         'Other Cities': 686,
         'Other Cities of Maharashtra': 457,
         'Tier II Cities': 74})
In [478]:
df.dropna(axis=0,how='any',inplace=True) # dropping rows with null values
In [479]:
df.shape # final shape of dataframe after removing null values ~ dropping approx 1.68% of the original data
Out[479]:
(9074, 37)
In [480]:
df.head() #final dataset with removed null values
Out[480]:
In [ ]:
jovian.commit()

[jovian] Saving notebook..
In [ ]: