import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
df = pd.read_csv('telecom_churn_data.csv')
import warnings
warnings.filterwarnings('ignore')
df.shape
(99999, 226)
The original dataset has 99,999 rows and 226 columns.
df.head()
df.describe()
Since the analysis is about high-value customers, we look for customers whose average recharge amount in the first two (good) months (6 and 7) is at least the 70th percentile.
# prepare a derived column flagging high-value customers based on the given condition
# prepare columns for data usage, separately for 2G and 3G
# prepare a column for mobile data usage: is 2G or 3G used more?
# does the customer spend mostly on call services or data services? (done)
df['avg_rech_good_month']= (df['total_rech_amt_6']+df['total_rech_amt_7'])/2
df['avg_rech_good_month'].quantile(0.7)
368.5
df_high_end_cus= df[df['avg_rech_good_month']>=368.5]
df_high_end_cus.shape
(30011, 227)
df_high_end_cus.isna().sum()/len(df_high_end_cus) *100 # % of missing data /NA Values in the dataframe
mobile_number 0.000000
circle_id 0.000000
loc_og_t2o_mou 0.379861
std_og_t2o_mou 0.379861
loc_ic_t2o_mou 0.379861
last_date_of_month_6 0.000000
last_date_of_month_7 0.103295
last_date_of_month_8 0.523142
last_date_of_month_9 1.199560
arpu_6 0.000000
arpu_7 0.000000
arpu_8 0.000000
arpu_9 0.000000
onnet_mou_6 1.052947
onnet_mou_7 1.009630
onnet_mou_8 3.125521
onnet_mou_9 5.677918
offnet_mou_6 1.052947
offnet_mou_7 1.009630
offnet_mou_8 3.125521
offnet_mou_9 5.677918
roam_ic_mou_6 1.052947
roam_ic_mou_7 1.009630
roam_ic_mou_8 3.125521
roam_ic_mou_9 5.677918
roam_og_mou_6 1.052947
roam_og_mou_7 1.009630
roam_og_mou_8 3.125521
roam_og_mou_9 5.677918
loc_og_t2t_mou_6 1.052947
loc_og_t2t_mou_7 1.009630
loc_og_t2t_mou_8 3.125521
loc_og_t2t_mou_9 5.677918
loc_og_t2m_mou_6 1.052947
loc_og_t2m_mou_7 1.009630
loc_og_t2m_mou_8 3.125521
loc_og_t2m_mou_9 5.677918
loc_og_t2f_mou_6 1.052947
loc_og_t2f_mou_7 1.009630
loc_og_t2f_mou_8 3.125521
loc_og_t2f_mou_9 5.677918
loc_og_t2c_mou_6 1.052947
loc_og_t2c_mou_7 1.009630
loc_og_t2c_mou_8 3.125521
loc_og_t2c_mou_9 5.677918
loc_og_mou_6 1.052947
loc_og_mou_7 1.009630
loc_og_mou_8 3.125521
loc_og_mou_9 5.677918
std_og_t2t_mou_6 1.052947
std_og_t2t_mou_7 1.009630
std_og_t2t_mou_8 3.125521
std_og_t2t_mou_9 5.677918
std_og_t2m_mou_6 1.052947
std_og_t2m_mou_7 1.009630
std_og_t2m_mou_8 3.125521
std_og_t2m_mou_9 5.677918
std_og_t2f_mou_6 1.052947
std_og_t2f_mou_7 1.009630
std_og_t2f_mou_8 3.125521
std_og_t2f_mou_9 5.677918
std_og_t2c_mou_6 1.052947
std_og_t2c_mou_7 1.009630
std_og_t2c_mou_8 3.125521
std_og_t2c_mou_9 5.677918
std_og_mou_6 1.052947
std_og_mou_7 1.009630
std_og_mou_8 3.125521
std_og_mou_9 5.677918
isd_og_mou_6 1.052947
isd_og_mou_7 1.009630
isd_og_mou_8 3.125521
isd_og_mou_9 5.677918
spl_og_mou_6 1.052947
spl_og_mou_7 1.009630
spl_og_mou_8 3.125521
spl_og_mou_9 5.677918
og_others_6 1.052947
og_others_7 1.009630
og_others_8 3.125521
og_others_9 5.677918
total_og_mou_6 0.000000
total_og_mou_7 0.000000
total_og_mou_8 0.000000
total_og_mou_9 0.000000
loc_ic_t2t_mou_6 1.052947
loc_ic_t2t_mou_7 1.009630
loc_ic_t2t_mou_8 3.125521
loc_ic_t2t_mou_9 5.677918
loc_ic_t2m_mou_6 1.052947
loc_ic_t2m_mou_7 1.009630
loc_ic_t2m_mou_8 3.125521
loc_ic_t2m_mou_9 5.677918
loc_ic_t2f_mou_6 1.052947
loc_ic_t2f_mou_7 1.009630
loc_ic_t2f_mou_8 3.125521
loc_ic_t2f_mou_9 5.677918
loc_ic_mou_6 1.052947
loc_ic_mou_7 1.009630
loc_ic_mou_8 3.125521
loc_ic_mou_9 5.677918
std_ic_t2t_mou_6 1.052947
std_ic_t2t_mou_7 1.009630
std_ic_t2t_mou_8 3.125521
std_ic_t2t_mou_9 5.677918
std_ic_t2m_mou_6 1.052947
std_ic_t2m_mou_7 1.009630
std_ic_t2m_mou_8 3.125521
std_ic_t2m_mou_9 5.677918
std_ic_t2f_mou_6 1.052947
std_ic_t2f_mou_7 1.009630
std_ic_t2f_mou_8 3.125521
std_ic_t2f_mou_9 5.677918
std_ic_t2o_mou_6 1.052947
std_ic_t2o_mou_7 1.009630
std_ic_t2o_mou_8 3.125521
std_ic_t2o_mou_9 5.677918
std_ic_mou_6 1.052947
std_ic_mou_7 1.009630
std_ic_mou_8 3.125521
std_ic_mou_9 5.677918
total_ic_mou_6 0.000000
total_ic_mou_7 0.000000
total_ic_mou_8 0.000000
total_ic_mou_9 0.000000
spl_ic_mou_6 1.052947
spl_ic_mou_7 1.009630
spl_ic_mou_8 3.125521
spl_ic_mou_9 5.677918
isd_ic_mou_6 1.052947
isd_ic_mou_7 1.009630
isd_ic_mou_8 3.125521
isd_ic_mou_9 5.677918
ic_others_6 1.052947
ic_others_7 1.009630
ic_others_8 3.125521
ic_others_9 5.677918
total_rech_num_6 0.000000
total_rech_num_7 0.000000
total_rech_num_8 0.000000
total_rech_num_9 0.000000
total_rech_amt_6 0.000000
total_rech_amt_7 0.000000
total_rech_amt_8 0.000000
total_rech_amt_9 0.000000
max_rech_amt_6 0.000000
max_rech_amt_7 0.000000
max_rech_amt_8 0.000000
max_rech_amt_9 0.000000
date_of_last_rech_6 0.206591
date_of_last_rech_7 0.379861
date_of_last_rech_8 1.979274
date_of_last_rech_9 2.885609
last_day_rch_amt_6 0.000000
last_day_rch_amt_7 0.000000
last_day_rch_amt_8 0.000000
last_day_rch_amt_9 0.000000
date_of_last_rech_data_6 62.023925
date_of_last_rech_data_7 61.140915
date_of_last_rech_data_8 60.834361
date_of_last_rech_data_9 61.810669
total_rech_data_6 62.023925
total_rech_data_7 61.140915
total_rech_data_8 60.834361
total_rech_data_9 61.810669
max_rech_data_6 62.023925
max_rech_data_7 61.140915
max_rech_data_8 60.834361
max_rech_data_9 61.810669
count_rech_2g_6 62.023925
count_rech_2g_7 61.140915
count_rech_2g_8 60.834361
count_rech_2g_9 61.810669
count_rech_3g_6 62.023925
count_rech_3g_7 61.140915
count_rech_3g_8 60.834361
count_rech_3g_9 61.810669
av_rech_amt_data_6 62.023925
av_rech_amt_data_7 61.140915
av_rech_amt_data_8 60.834361
av_rech_amt_data_9 61.810669
vol_2g_mb_6 0.000000
vol_2g_mb_7 0.000000
vol_2g_mb_8 0.000000
vol_2g_mb_9 0.000000
vol_3g_mb_6 0.000000
vol_3g_mb_7 0.000000
vol_3g_mb_8 0.000000
vol_3g_mb_9 0.000000
arpu_3g_6 62.023925
arpu_3g_7 61.140915
arpu_3g_8 60.834361
arpu_3g_9 61.810669
arpu_2g_6 62.023925
arpu_2g_7 61.140915
arpu_2g_8 60.834361
arpu_2g_9 61.810669
night_pck_user_6 62.023925
night_pck_user_7 61.140915
night_pck_user_8 60.834361
night_pck_user_9 61.810669
monthly_2g_6 0.000000
monthly_2g_7 0.000000
monthly_2g_8 0.000000
monthly_2g_9 0.000000
sachet_2g_6 0.000000
sachet_2g_7 0.000000
sachet_2g_8 0.000000
sachet_2g_9 0.000000
monthly_3g_6 0.000000
monthly_3g_7 0.000000
monthly_3g_8 0.000000
monthly_3g_9 0.000000
sachet_3g_6 0.000000
sachet_3g_7 0.000000
sachet_3g_8 0.000000
sachet_3g_9 0.000000
fb_user_6 62.023925
fb_user_7 61.140915
fb_user_8 60.834361
fb_user_9 61.810669
aon 0.000000
aug_vbc_3g 0.000000
jul_vbc_3g 0.000000
jun_vbc_3g 0.000000
sep_vbc_3g 0.000000
avg_rech_good_month 0.000000
dtype: float64
As we can see, columns like date_of_last_rech_data_6/7/8/9, total_rech_data_6/7/8/9, max_rech_data_6/7/8/9, count_rech_2g_6/7/8/9, count_rech_3g_6/7/8/9, av_rech_amt_data_6/7/8/9, arpu_3g_6/7/8/9 and fb_user_6/7/8/9 have a high proportion of null values (above 50 percent). Hence, we decide for each of them whether to drop or retain it based on its significance.
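As a quick cross-check of the listing above, a short illustrative sketch that pulls out every column whose missing-value percentage exceeds 50% (the threshold is our own choice here):
# illustrative: list columns with more than 50% missing values
null_pct = df_high_end_cus.isna().mean() * 100
high_null_cols = null_pct[null_pct > 50].index.tolist()
print(len(high_null_cols), high_null_cols[:5])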
Columns fb_user_6, fb_user_7, fb_user_8 and fb_user_9 can be dropped, because Facebook usage is unlikely to be a factor in whether a customer churns.
df_high_end_cus = df_high_end_cus.drop(columns=['fb_user_6','fb_user_7','fb_user_8','fb_user_9'],axis=1)
Columns count_rech_2g_6/7/8/9 and count_rech_3g_6/7/8/9 should not be removed, as they might be helpful in predicting customer churn.
Similarly, we retain max_rech_data_6/7/8/9 and total_rech_data_6/7/8/9, keeping the data-recharge information.
We also retain arpu_3g_6/7/8/9 and arpu_2g_6/7/8/9, as the average revenue from internet users can also help in predicting churn.
For the high-null columns we decided to retain, we impute the missing values with the column medians; the other high-null columns are dropped.
cols_to_retain = ['count_rech_2g_6','count_rech_2g_7','count_rech_2g_8','count_rech_2g_9','count_rech_3g_6',
'count_rech_3g_7','count_rech_3g_8','count_rech_3g_9','max_rech_data_9','max_rech_data_8',
'max_rech_data_7','max_rech_data_6','total_rech_data_9','total_rech_data_8','total_rech_data_7',
'total_rech_data_6','arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','arpu_3g_9','arpu_3g_8','arpu_3g_7','arpu_3g_6']
df_high_end_cus[cols_to_retain] = df_high_end_cus[cols_to_retain].fillna(df_high_end_cus[cols_to_retain].median())
# the night-pack-user flag columns can be dropped, since data recharges are already captured by the retained columns
df_high_end_cus = df_high_end_cus.drop(columns=['night_pck_user_6','night_pck_user_7','night_pck_user_8',
'night_pck_user_9'],axis=1)
Since the remaining columns are numeric, we impute their missing values with the respective column medians (more robust to outliers than the mean).
df_high_end_cus = df_high_end_cus.fillna(df_high_end_cus.median())
df_high_end_cus.isna().sum() /len(df_high_end_cus) *100
mobile_number 0.000000
circle_id 0.000000
loc_og_t2o_mou 0.000000
std_og_t2o_mou 0.000000
loc_ic_t2o_mou 0.000000
last_date_of_month_6 0.000000
last_date_of_month_7 0.103295
last_date_of_month_8 0.523142
last_date_of_month_9 1.199560
arpu_6 0.000000
arpu_7 0.000000
arpu_8 0.000000
arpu_9 0.000000
onnet_mou_6 0.000000
onnet_mou_7 0.000000
onnet_mou_8 0.000000
onnet_mou_9 0.000000
offnet_mou_6 0.000000
offnet_mou_7 0.000000
offnet_mou_8 0.000000
offnet_mou_9 0.000000
roam_ic_mou_6 0.000000
roam_ic_mou_7 0.000000
roam_ic_mou_8 0.000000
roam_ic_mou_9 0.000000
roam_og_mou_6 0.000000
roam_og_mou_7 0.000000
roam_og_mou_8 0.000000
roam_og_mou_9 0.000000
loc_og_t2t_mou_6 0.000000
loc_og_t2t_mou_7 0.000000
loc_og_t2t_mou_8 0.000000
loc_og_t2t_mou_9 0.000000
loc_og_t2m_mou_6 0.000000
loc_og_t2m_mou_7 0.000000
loc_og_t2m_mou_8 0.000000
loc_og_t2m_mou_9 0.000000
loc_og_t2f_mou_6 0.000000
loc_og_t2f_mou_7 0.000000
loc_og_t2f_mou_8 0.000000
loc_og_t2f_mou_9 0.000000
loc_og_t2c_mou_6 0.000000
loc_og_t2c_mou_7 0.000000
loc_og_t2c_mou_8 0.000000
loc_og_t2c_mou_9 0.000000
loc_og_mou_6 0.000000
loc_og_mou_7 0.000000
loc_og_mou_8 0.000000
loc_og_mou_9 0.000000
std_og_t2t_mou_6 0.000000
std_og_t2t_mou_7 0.000000
std_og_t2t_mou_8 0.000000
std_og_t2t_mou_9 0.000000
std_og_t2m_mou_6 0.000000
std_og_t2m_mou_7 0.000000
std_og_t2m_mou_8 0.000000
std_og_t2m_mou_9 0.000000
std_og_t2f_mou_6 0.000000
std_og_t2f_mou_7 0.000000
std_og_t2f_mou_8 0.000000
std_og_t2f_mou_9 0.000000
std_og_t2c_mou_6 0.000000
std_og_t2c_mou_7 0.000000
std_og_t2c_mou_8 0.000000
std_og_t2c_mou_9 0.000000
std_og_mou_6 0.000000
std_og_mou_7 0.000000
std_og_mou_8 0.000000
std_og_mou_9 0.000000
isd_og_mou_6 0.000000
isd_og_mou_7 0.000000
isd_og_mou_8 0.000000
isd_og_mou_9 0.000000
spl_og_mou_6 0.000000
spl_og_mou_7 0.000000
spl_og_mou_8 0.000000
spl_og_mou_9 0.000000
og_others_6 0.000000
og_others_7 0.000000
og_others_8 0.000000
og_others_9 0.000000
total_og_mou_6 0.000000
total_og_mou_7 0.000000
total_og_mou_8 0.000000
total_og_mou_9 0.000000
loc_ic_t2t_mou_6 0.000000
loc_ic_t2t_mou_7 0.000000
loc_ic_t2t_mou_8 0.000000
loc_ic_t2t_mou_9 0.000000
loc_ic_t2m_mou_6 0.000000
loc_ic_t2m_mou_7 0.000000
loc_ic_t2m_mou_8 0.000000
loc_ic_t2m_mou_9 0.000000
loc_ic_t2f_mou_6 0.000000
loc_ic_t2f_mou_7 0.000000
loc_ic_t2f_mou_8 0.000000
loc_ic_t2f_mou_9 0.000000
loc_ic_mou_6 0.000000
loc_ic_mou_7 0.000000
loc_ic_mou_8 0.000000
loc_ic_mou_9 0.000000
std_ic_t2t_mou_6 0.000000
std_ic_t2t_mou_7 0.000000
std_ic_t2t_mou_8 0.000000
std_ic_t2t_mou_9 0.000000
std_ic_t2m_mou_6 0.000000
std_ic_t2m_mou_7 0.000000
std_ic_t2m_mou_8 0.000000
std_ic_t2m_mou_9 0.000000
std_ic_t2f_mou_6 0.000000
std_ic_t2f_mou_7 0.000000
std_ic_t2f_mou_8 0.000000
std_ic_t2f_mou_9 0.000000
std_ic_t2o_mou_6 0.000000
std_ic_t2o_mou_7 0.000000
std_ic_t2o_mou_8 0.000000
std_ic_t2o_mou_9 0.000000
std_ic_mou_6 0.000000
std_ic_mou_7 0.000000
std_ic_mou_8 0.000000
std_ic_mou_9 0.000000
total_ic_mou_6 0.000000
total_ic_mou_7 0.000000
total_ic_mou_8 0.000000
total_ic_mou_9 0.000000
spl_ic_mou_6 0.000000
spl_ic_mou_7 0.000000
spl_ic_mou_8 0.000000
spl_ic_mou_9 0.000000
isd_ic_mou_6 0.000000
isd_ic_mou_7 0.000000
isd_ic_mou_8 0.000000
isd_ic_mou_9 0.000000
ic_others_6 0.000000
ic_others_7 0.000000
ic_others_8 0.000000
ic_others_9 0.000000
total_rech_num_6 0.000000
total_rech_num_7 0.000000
total_rech_num_8 0.000000
total_rech_num_9 0.000000
total_rech_amt_6 0.000000
total_rech_amt_7 0.000000
total_rech_amt_8 0.000000
total_rech_amt_9 0.000000
max_rech_amt_6 0.000000
max_rech_amt_7 0.000000
max_rech_amt_8 0.000000
max_rech_amt_9 0.000000
date_of_last_rech_6 0.206591
date_of_last_rech_7 0.379861
date_of_last_rech_8 1.979274
date_of_last_rech_9 2.885609
last_day_rch_amt_6 0.000000
last_day_rch_amt_7 0.000000
last_day_rch_amt_8 0.000000
last_day_rch_amt_9 0.000000
date_of_last_rech_data_6 62.023925
date_of_last_rech_data_7 61.140915
date_of_last_rech_data_8 60.834361
date_of_last_rech_data_9 61.810669
total_rech_data_6 0.000000
total_rech_data_7 0.000000
total_rech_data_8 0.000000
total_rech_data_9 0.000000
max_rech_data_6 0.000000
max_rech_data_7 0.000000
max_rech_data_8 0.000000
max_rech_data_9 0.000000
count_rech_2g_6 0.000000
count_rech_2g_7 0.000000
count_rech_2g_8 0.000000
count_rech_2g_9 0.000000
count_rech_3g_6 0.000000
count_rech_3g_7 0.000000
count_rech_3g_8 0.000000
count_rech_3g_9 0.000000
av_rech_amt_data_6 0.000000
av_rech_amt_data_7 0.000000
av_rech_amt_data_8 0.000000
av_rech_amt_data_9 0.000000
vol_2g_mb_6 0.000000
vol_2g_mb_7 0.000000
vol_2g_mb_8 0.000000
vol_2g_mb_9 0.000000
vol_3g_mb_6 0.000000
vol_3g_mb_7 0.000000
vol_3g_mb_8 0.000000
vol_3g_mb_9 0.000000
arpu_3g_6 0.000000
arpu_3g_7 0.000000
arpu_3g_8 0.000000
arpu_3g_9 0.000000
arpu_2g_6 0.000000
arpu_2g_7 0.000000
arpu_2g_8 0.000000
arpu_2g_9 0.000000
monthly_2g_6 0.000000
monthly_2g_7 0.000000
monthly_2g_8 0.000000
monthly_2g_9 0.000000
sachet_2g_6 0.000000
sachet_2g_7 0.000000
sachet_2g_8 0.000000
sachet_2g_9 0.000000
monthly_3g_6 0.000000
monthly_3g_7 0.000000
monthly_3g_8 0.000000
monthly_3g_9 0.000000
sachet_3g_6 0.000000
sachet_3g_7 0.000000
sachet_3g_8 0.000000
sachet_3g_9 0.000000
aon 0.000000
aug_vbc_3g 0.000000
jul_vbc_3g 0.000000
jun_vbc_3g 0.000000
sep_vbc_3g 0.000000
avg_rech_good_month 0.000000
dtype: float64
Dates have to be handled a bit differently: date_of_last_rech_data_6/7/8/9, last_date_of_month_6/7/8/9 and date_of_last_rech_6/7/8/9. For each of these we extract the day of the month into a separate numeric column.
date_cols = ['date_of_last_rech_data_9','date_of_last_rech_data_8','date_of_last_rech_data_7','date_of_last_rech_data_6',
'last_date_of_month_6','last_date_of_month_7','last_date_of_month_8','last_date_of_month_9',
'date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']
#df_high_end_cus[date_cols] = df_high_end_cus[date_cols].astype('datetime64[ns]')
df_high_end_cus[date_cols].isna().sum()
date_of_last_rech_data_9 18550
date_of_last_rech_data_8 18257
date_of_last_rech_data_7 18349
date_of_last_rech_data_6 18614
last_date_of_month_6 0
last_date_of_month_7 31
last_date_of_month_8 157
last_date_of_month_9 360
date_of_last_rech_6 62
date_of_last_rech_7 114
date_of_last_rech_8 594
date_of_last_rech_9 866
dtype: int64
#extracting days from the datetime format columns
df_high_end_cus['date_last_rech_data_9'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_9']).dt.day
df_high_end_cus['date_last_rech_data_8'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_8']).dt.day
df_high_end_cus['date_last_rech_data_7'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_7']).dt.day
df_high_end_cus['date_last_rech_data_6'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_6']).dt.day
df_high_end_cus['date_last_rech_6'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_6']).dt.day
df_high_end_cus['date_last_rech_7'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_7']).dt.day
df_high_end_cus['date_last_rech_8'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_8']).dt.day
df_high_end_cus['date_last_rech_9'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_9']).dt.day
df_high_end_cus['last_date_@_month_6'] = pd.to_datetime(df_high_end_cus['last_date_of_month_6']).dt.day
df_high_end_cus['last_date_@_month_7'] = pd.to_datetime(df_high_end_cus['last_date_of_month_7']).dt.day
df_high_end_cus['last_date_@_month_8'] = pd.to_datetime(df_high_end_cus['last_date_of_month_8']).dt.day
df_high_end_cus['last_date_@_month_9'] = pd.to_datetime(df_high_end_cus['last_date_of_month_9']).dt.day
df_high_end_cus = df_high_end_cus.drop(columns = date_cols,axis=1) # dropping all the original date-time columns
df_high_end_cus = df_high_end_cus.fillna(df_high_end_cus.median()) # filling the new day-of-month columns with their medians
df_high_end_cus.isna().sum()
mobile_number 0
circle_id 0
loc_og_t2o_mou 0
std_og_t2o_mou 0
loc_ic_t2o_mou 0
arpu_6 0
arpu_7 0
arpu_8 0
arpu_9 0
onnet_mou_6 0
onnet_mou_7 0
onnet_mou_8 0
onnet_mou_9 0
offnet_mou_6 0
offnet_mou_7 0
offnet_mou_8 0
offnet_mou_9 0
roam_ic_mou_6 0
roam_ic_mou_7 0
roam_ic_mou_8 0
roam_ic_mou_9 0
roam_og_mou_6 0
roam_og_mou_7 0
roam_og_mou_8 0
roam_og_mou_9 0
loc_og_t2t_mou_6 0
loc_og_t2t_mou_7 0
loc_og_t2t_mou_8 0
loc_og_t2t_mou_9 0
loc_og_t2m_mou_6 0
loc_og_t2m_mou_7 0
loc_og_t2m_mou_8 0
loc_og_t2m_mou_9 0
loc_og_t2f_mou_6 0
loc_og_t2f_mou_7 0
loc_og_t2f_mou_8 0
loc_og_t2f_mou_9 0
loc_og_t2c_mou_6 0
loc_og_t2c_mou_7 0
loc_og_t2c_mou_8 0
loc_og_t2c_mou_9 0
loc_og_mou_6 0
loc_og_mou_7 0
loc_og_mou_8 0
loc_og_mou_9 0
std_og_t2t_mou_6 0
std_og_t2t_mou_7 0
std_og_t2t_mou_8 0
std_og_t2t_mou_9 0
std_og_t2m_mou_6 0
std_og_t2m_mou_7 0
std_og_t2m_mou_8 0
std_og_t2m_mou_9 0
std_og_t2f_mou_6 0
std_og_t2f_mou_7 0
std_og_t2f_mou_8 0
std_og_t2f_mou_9 0
std_og_t2c_mou_6 0
std_og_t2c_mou_7 0
std_og_t2c_mou_8 0
std_og_t2c_mou_9 0
std_og_mou_6 0
std_og_mou_7 0
std_og_mou_8 0
std_og_mou_9 0
isd_og_mou_6 0
isd_og_mou_7 0
isd_og_mou_8 0
isd_og_mou_9 0
spl_og_mou_6 0
spl_og_mou_7 0
spl_og_mou_8 0
spl_og_mou_9 0
og_others_6 0
og_others_7 0
og_others_8 0
og_others_9 0
total_og_mou_6 0
total_og_mou_7 0
total_og_mou_8 0
total_og_mou_9 0
loc_ic_t2t_mou_6 0
loc_ic_t2t_mou_7 0
loc_ic_t2t_mou_8 0
loc_ic_t2t_mou_9 0
loc_ic_t2m_mou_6 0
loc_ic_t2m_mou_7 0
loc_ic_t2m_mou_8 0
loc_ic_t2m_mou_9 0
loc_ic_t2f_mou_6 0
loc_ic_t2f_mou_7 0
loc_ic_t2f_mou_8 0
loc_ic_t2f_mou_9 0
loc_ic_mou_6 0
loc_ic_mou_7 0
loc_ic_mou_8 0
loc_ic_mou_9 0
std_ic_t2t_mou_6 0
std_ic_t2t_mou_7 0
std_ic_t2t_mou_8 0
std_ic_t2t_mou_9 0
std_ic_t2m_mou_6 0
std_ic_t2m_mou_7 0
std_ic_t2m_mou_8 0
std_ic_t2m_mou_9 0
std_ic_t2f_mou_6 0
std_ic_t2f_mou_7 0
std_ic_t2f_mou_8 0
std_ic_t2f_mou_9 0
std_ic_t2o_mou_6 0
std_ic_t2o_mou_7 0
std_ic_t2o_mou_8 0
std_ic_t2o_mou_9 0
std_ic_mou_6 0
std_ic_mou_7 0
std_ic_mou_8 0
std_ic_mou_9 0
total_ic_mou_6 0
total_ic_mou_7 0
total_ic_mou_8 0
total_ic_mou_9 0
spl_ic_mou_6 0
spl_ic_mou_7 0
spl_ic_mou_8 0
spl_ic_mou_9 0
isd_ic_mou_6 0
isd_ic_mou_7 0
isd_ic_mou_8 0
isd_ic_mou_9 0
ic_others_6 0
ic_others_7 0
ic_others_8 0
ic_others_9 0
total_rech_num_6 0
total_rech_num_7 0
total_rech_num_8 0
total_rech_num_9 0
total_rech_amt_6 0
total_rech_amt_7 0
total_rech_amt_8 0
total_rech_amt_9 0
max_rech_amt_6 0
max_rech_amt_7 0
max_rech_amt_8 0
max_rech_amt_9 0
last_day_rch_amt_6 0
last_day_rch_amt_7 0
last_day_rch_amt_8 0
last_day_rch_amt_9 0
total_rech_data_6 0
total_rech_data_7 0
total_rech_data_8 0
total_rech_data_9 0
max_rech_data_6 0
max_rech_data_7 0
max_rech_data_8 0
max_rech_data_9 0
count_rech_2g_6 0
count_rech_2g_7 0
count_rech_2g_8 0
count_rech_2g_9 0
count_rech_3g_6 0
count_rech_3g_7 0
count_rech_3g_8 0
count_rech_3g_9 0
av_rech_amt_data_6 0
av_rech_amt_data_7 0
av_rech_amt_data_8 0
av_rech_amt_data_9 0
vol_2g_mb_6 0
vol_2g_mb_7 0
vol_2g_mb_8 0
vol_2g_mb_9 0
vol_3g_mb_6 0
vol_3g_mb_7 0
vol_3g_mb_8 0
vol_3g_mb_9 0
arpu_3g_6 0
arpu_3g_7 0
arpu_3g_8 0
arpu_3g_9 0
arpu_2g_6 0
arpu_2g_7 0
arpu_2g_8 0
arpu_2g_9 0
monthly_2g_6 0
monthly_2g_7 0
monthly_2g_8 0
monthly_2g_9 0
sachet_2g_6 0
sachet_2g_7 0
sachet_2g_8 0
sachet_2g_9 0
monthly_3g_6 0
monthly_3g_7 0
monthly_3g_8 0
monthly_3g_9 0
sachet_3g_6 0
sachet_3g_7 0
sachet_3g_8 0
sachet_3g_9 0
aon 0
aug_vbc_3g 0
jul_vbc_3g 0
jun_vbc_3g 0
sep_vbc_3g 0
avg_rech_good_month 0
date_last_rech_data_9 0
date_last_rech_data_8 0
date_last_rech_data_7 0
date_last_rech_data_6 0
date_last_rech_6 0
date_last_rech_7 0
date_last_rech_8 0
date_last_rech_9 0
last_date_@_month_6 0
last_date_@_month_7 0
last_date_@_month_8 0
last_date_@_month_9 0
dtype: int64
df_high_end_cus.shape # final shape of dataset after null value treatment
(30011, 219)
churn_attrs = ['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']
df_high_end_cus['churn'] = df_high_end_cus.apply(lambda row:1 if ((row.total_ic_mou_9 == 0 or row.total_og_mou_9==0)
and (row.vol_2g_mb_9 == 0 or row.vol_3g_mb_9 ==0))
else 0 , axis = 1)
df_high_end_cus['churn'].value_counts()
0 26964
1 3047
Name: churn, dtype: int64
So, per the definition above, a customer is flagged as churned if, in the churn month (September), either incoming or outgoing call minutes are zero and either 2G or 3G data volume is zero. Based on this interpretation:
3047 churned customers
26964 non-churned customers
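As a quick sanity check, the churn rate implied by these counts can be computed directly (illustrative only):
# churn rate among the high-value customers, from the value counts above
churn_rate = df_high_end_cus['churn'].mean()
print(round(churn_rate * 100, 2))  # roughly 10.15%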
# Now we drop all columns that carry data from the churn month, i.e. September (suffix _9)
drop_churn_cols = [col for col in df_high_end_cus.columns if col.endswith('9')]
drop_churn_cols
['arpu_9',
'onnet_mou_9',
'offnet_mou_9',
'roam_ic_mou_9',
'roam_og_mou_9',
'loc_og_t2t_mou_9',
'loc_og_t2m_mou_9',
'loc_og_t2f_mou_9',
'loc_og_t2c_mou_9',
'loc_og_mou_9',
'std_og_t2t_mou_9',
'std_og_t2m_mou_9',
'std_og_t2f_mou_9',
'std_og_t2c_mou_9',
'std_og_mou_9',
'isd_og_mou_9',
'spl_og_mou_9',
'og_others_9',
'total_og_mou_9',
'loc_ic_t2t_mou_9',
'loc_ic_t2m_mou_9',
'loc_ic_t2f_mou_9',
'loc_ic_mou_9',
'std_ic_t2t_mou_9',
'std_ic_t2m_mou_9',
'std_ic_t2f_mou_9',
'std_ic_t2o_mou_9',
'std_ic_mou_9',
'total_ic_mou_9',
'spl_ic_mou_9',
'isd_ic_mou_9',
'ic_others_9',
'total_rech_num_9',
'total_rech_amt_9',
'max_rech_amt_9',
'last_day_rch_amt_9',
'total_rech_data_9',
'max_rech_data_9',
'count_rech_2g_9',
'count_rech_3g_9',
'av_rech_amt_data_9',
'vol_2g_mb_9',
'vol_3g_mb_9',
'arpu_3g_9',
'arpu_2g_9',
'monthly_2g_9',
'sachet_2g_9',
'monthly_3g_9',
'sachet_3g_9',
'date_last_rech_data_9',
'date_last_rech_9',
'last_date_@_month_9']
df_high_end_cus=df_high_end_cus.drop(drop_churn_cols,axis=1)
df_high_end_cus.shape #final dataframe after removing all churn month attributes and null value treated
(30011, 168)
#Get Correlation of "Churn" with other variables:
plt.figure(figsize=(30,10))
df_high_end_cus.corr()['churn'].sort_values(ascending = False).plot(kind='bar')
plt.show()
ax = sns.distplot(df_high_end_cus['arpu_6'], hist=True, kde=False,
bins=int(180/5), color = 'darkblue',
hist_kws={'edgecolor':'black'},
kde_kws={'linewidth': 4})
ax.set_ylabel('Number of customers')
ax.set_xlabel('Average revenue per user in June (month 6)')
ax.set_title('Relation between revenue and churn rate')
Text(0.5, 1.0, 'Relation between revenue and churn rate')
ax = sns.distplot(df_high_end_cus['arpu_7'], hist=True, kde=False,
bins=int(180/5), color = 'darkblue',
hist_kws={'edgecolor':'black'},
kde_kws={'linewidth': 4})
ax.set_ylabel('Number of customers')
ax.set_xlabel('Average revenue per user in July (month 7)')
ax.set_title('Relation between revenue and churn rate')
Text(0.5, 1.0, 'Relation between revenue and churn rate')
df_high_end_cus=df_high_end_cus.drop(columns=['mobile_number','circle_id'])
We can drop the columns mobile_number and circle_id, as they do not help in predicting churn.
df_high_end_cus.corr()
We see from the correlation matrix above that some of the correlations are NaN (blank cells): those columns hold a single constant value, so their standard deviation is 0, which makes the correlation undefined.
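A small illustrative check for such zero-variance columns (these are the ones producing the NaN correlations):
# constant columns: a single unique value, hence zero standard deviation
constant_cols = [col for col in df_high_end_cus.columns if df_high_end_cus[col].nunique() <= 1]
print(constant_cols)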
from sklearn.decomposition import FactorAnalysis
FA = FactorAnalysis(n_components = 3).fit_transform(df_high_end_cus.values)
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA[:,0], FA[:,1])
plt.scatter(FA[:,1], FA[:,2])
plt.scatter(FA[:,2], FA[:,0])
<matplotlib.collections.PathCollection at 0x7f8c61995ed0>
We see that the data is highly correlated, so we have to deal with multicollinearity. Each factor here groups together a set of highly correlated variables/columns.
#report = pp.ProfileReport(df_high_end_cus)
#report.to_file('output_report.html')
Dropping variables with high multicollinearity, as per the pandas-profiling report.
df_high_end_cus = df_high_end_cus.drop(columns=['arpu_3g_6','arpu_3g_7','arpu_3g_8','isd_og_mou_7','isd_og_mou_8',
'sachet_2g_6','sachet_2g_7','sachet_2g_8','total_rech_amt_6',
'total_rech_amt_7','total_rech_amt_8'])
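For reference, a rough sketch of how highly correlated pairs could also be located without the profiling report; the 0.85 threshold is arbitrary and used only for illustration:
# illustrative: feature pairs with absolute correlation above 0.85
corr = df_high_end_cus.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
high_corr_pairs = [(row, col) for col in upper.columns for row in upper.index
                   if upper.loc[row, col] > 0.85]
print(len(high_corr_pairs))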
Dropping variables with constant values, as per the pandas-profiling report.
df_high_end_cus = df_high_end_cus.drop(columns=['last_date_@_month_6','last_date_@_month_7','last_date_@_month_8',
'loc_ic_t2o_mou','loc_og_t2o_mou','std_ic_t2o_mou_6','std_ic_t2o_mou_7',
'std_ic_t2o_mou_8','std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8',
'std_og_t2o_mou'])
df_high_end_cus.shape
(30011, 143)
Now we identify skewed columns and apply the necessary transformations.
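A small illustrative sketch to list the most heavily skewed columns; the cut-off of 3 is an arbitrary choice for demonstration:
# columns with strong skew (|skewness| > 3, threshold chosen only as an example)
skewness = df_high_end_cus.skew().sort_values(ascending=False)
print(skewness[skewness.abs() > 3])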
sns.distplot(df_high_end_cus['avg_rech_good_month'])
plt.show()
#since the column avg_rech_good_month is highly right-skewed (and strictly positive here), we can apply a log transformation
df_high_end_cus['avg_rech_good_month_trnsfrm'] = np.log(df_high_end_cus['avg_rech_good_month'])
df_high_end_cus = df_high_end_cus.drop(columns=['avg_rech_good_month'])
sns.distplot(df_high_end_cus['avg_rech_good_month_trnsfrm'])
plt.show()
sns.distplot(df_high_end_cus['ic_others_6'])
plt.show()
# square-root transform the right-skewed columns listed below, then drop the originals
cols_skewed_sqrt = ['ic_others_6','ic_others_8','isd_ic_mou_7','isd_ic_mou_6','loc_og_t2c_mou_7','og_others_7',
                    'og_others_8','spl_ic_mou_6','spl_ic_mou_7','std_ic_t2f_mou_6','std_ic_t2f_mou_7','std_ic_t2f_mou_8',
                    'std_ic_t2t_mou_6','std_ic_t2t_mou_7','std_ic_t2t_mou_8']
for col in cols_skewed_sqrt:
    df_high_end_cus[col + '_trnsfrm'] = np.sqrt(df_high_end_cus[col])
df_high_end_cus = df_high_end_cus.drop(columns=cols_skewed_sqrt, axis=1)
#report_1 = pp.ProfileReport(df_high_end_cus)
#report_1.to_file('output_report_2.html')
df_high_end_cus['ic_others_7_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_7'])
df_high_end_cus['isd_og_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['isd_og_mou_6'])
df_high_end_cus = df_high_end_cus.drop(columns=['ic_others_7','isd_og_mou_6'])
FA_1 = FactorAnalysis(n_components = 3).fit_transform(df_high_end_cus.values)
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA_1[:,0],FA_1[:,1])
plt.scatter(FA_1[:,1], FA_1[:,2])
plt.scatter(FA_1[:,2],FA_1[:,0])
plt.show()
#report_2 = pp.ProfileReport(df_high_end_cus)
#report_2.to_file('output_report_3.html')
As we can see from the factor plot above, multicollinearity in the data has been reduced by removing the columns identified above via pandas profiling.
df_high_end_cus.corr()['churn'].sort_values(ascending = False)
churn 1.000000
std_og_mou_6 0.131846
std_og_t2m_mou_6 0.099027
std_og_t2t_mou_6 0.093168
roam_og_mou_7 0.092717
roam_og_mou_8 0.072746
total_og_mou_6 0.072193
onnet_mou_6 0.071193
roam_ic_mou_7 0.069398
total_rech_num_6 0.064926
roam_ic_mou_8 0.063417
roam_og_mou_6 0.061005
offnet_mou_6 0.058547
arpu_6 0.058299
std_og_mou_7 0.049341
roam_ic_mou_6 0.045281
std_og_t2m_mou_7 0.039915
std_ic_t2t_mou_6_trnsfrm 0.032413
std_og_t2t_mou_7 0.029499
isd_og_mou_6_trnsfrm 0.027086
og_others_6 0.023838
max_rech_data_7 0.021388
spl_og_mou_6 0.019219
max_rech_data_6 0.016762
arpu_2g_6 0.016439
std_ic_mou_6 0.014432
max_rech_data_8 0.010959
std_ic_t2m_mou_6 0.009146
avg_rech_good_month_trnsfrm 0.007734
loc_og_t2c_mou_7_trnsfrm 0.007074
last_day_rch_amt_6 0.006301
date_last_rech_6 0.006051
loc_og_t2c_mou_6 0.005826
av_rech_amt_data_6 0.005406
spl_og_mou_7 0.004479
onnet_mou_7 0.004469
isd_ic_mou_6_trnsfrm 0.002928
og_others_7_trnsfrm 0.001932
date_last_rech_data_6 0.001892
max_rech_amt_6 0.000408
sachet_3g_6 -0.000659
monthly_3g_6 -0.000746
count_rech_3g_6 -0.000943
date_last_rech_data_8 -0.001495
arpu_2g_7 -0.001693
sachet_3g_7 -0.001900
spl_ic_mou_7_trnsfrm -0.004778
og_others_8_trnsfrm -0.005165
date_last_rech_data_7 -0.005310
av_rech_amt_data_7 -0.005919
offnet_mou_7 -0.005927
total_rech_data_7 -0.006471
std_ic_t2t_mou_7_trnsfrm -0.006982
vol_3g_mb_6 -0.007034
total_rech_data_6 -0.007746
count_rech_2g_6 -0.009005
vol_2g_mb_6 -0.010459
ic_others_6_trnsfrm -0.012010
count_rech_3g_7 -0.013208
spl_ic_mou_6_trnsfrm -0.013247
total_rech_num_7 -0.017163
std_ic_mou_7 -0.017219
std_ic_t2m_mou_7 -0.017647
isd_ic_mou_7_trnsfrm -0.017897
vol_2g_mb_7 -0.020292
count_rech_2g_7 -0.020302
vol_3g_mb_7 -0.021298
monthly_3g_7 -0.022068
std_og_t2f_mou_6 -0.023002
max_rech_amt_7 -0.025411
total_og_mou_7 -0.025698
std_og_t2f_mou_7 -0.025904
sachet_3g_8 -0.030317
jun_vbc_3g -0.031576
arpu_7 -0.032264
isd_ic_mou_8 -0.033997
loc_og_t2c_mou_8 -0.035344
total_rech_data_8 -0.035385
ic_others_7_trnsfrm -0.038166
loc_og_t2t_mou_6 -0.040706
std_og_t2f_mou_8 -0.040923
av_rech_amt_data_8 -0.041120
loc_ic_t2t_mou_6 -0.042433
sep_vbc_3g -0.043487
jul_vbc_3g -0.046368
count_rech_2g_8 -0.048568
arpu_2g_8 -0.049299
monthly_2g_6 -0.049323
last_day_rch_amt_7 -0.050486
loc_og_t2f_mou_6 -0.050514
vol_2g_mb_8 -0.051534
std_ic_t2f_mou_6_trnsfrm -0.051998
spl_og_mou_8 -0.054581
count_rech_3g_8 -0.055715
loc_ic_t2f_mou_6 -0.056275
spl_ic_mou_8 -0.057730
std_ic_t2f_mou_7_trnsfrm -0.058405
std_ic_t2m_mou_8 -0.058697
loc_og_t2f_mou_7 -0.059601
monthly_3g_8 -0.060327
loc_ic_t2f_mou_7 -0.060758
vol_3g_mb_8 -0.061113
loc_ic_t2t_mou_7 -0.061258
monthly_2g_7 -0.061820
loc_og_t2t_mou_7 -0.063534
loc_ic_t2m_mou_6 -0.065530
total_ic_mou_6 -0.065879
std_ic_mou_8 -0.068913
std_og_t2t_mou_8 -0.072706
std_og_t2m_mou_8 -0.073009
ic_others_8_trnsfrm -0.073157
loc_og_t2m_mou_6 -0.074936
loc_ic_mou_6 -0.075223
loc_og_mou_6 -0.076626
aug_vbc_3g -0.081413
loc_og_t2f_mou_8 -0.083690
monthly_2g_8 -0.083851
loc_ic_t2f_mou_8 -0.085501
loc_og_t2t_mou_8 -0.089523
onnet_mou_8 -0.090815
std_og_mou_8 -0.092509
std_ic_t2t_mou_8_trnsfrm -0.096043
loc_ic_t2t_mou_8 -0.097180
std_ic_t2f_mou_8_trnsfrm -0.097314
loc_ic_t2m_mou_7 -0.101247
loc_ic_mou_7 -0.110967
loc_og_t2m_mou_7 -0.111377
total_ic_mou_7 -0.113269
loc_og_mou_7 -0.115236
offnet_mou_8 -0.119262
last_day_rch_amt_8 -0.125655
date_last_rech_7 -0.134863
max_rech_amt_8 -0.136965
aon -0.138380
loc_ic_t2m_mou_8 -0.151994
loc_og_mou_8 -0.156810
loc_og_t2m_mou_8 -0.157403
loc_ic_mou_8 -0.163109
total_rech_num_8 -0.167651
date_last_rech_8 -0.190048
arpu_8 -0.194774
total_og_mou_8 -0.195494
total_ic_mou_8 -0.207454
Name: churn, dtype: float64
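For reference, the ten features most positively correlated with churn (which should reproduce the hard-coded list used for outlier treatment below) can also be pulled programmatically:
top10_churn_corr = (df_high_end_cus.corr()['churn']
                    .drop('churn')
                    .sort_values(ascending=False)
                    .head(10)
                    .index.tolist())
print(top10_churn_corr)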
sns.boxplot(df_high_end_cus['std_og_t2t_mou_6'])
<matplotlib.axes._subplots.AxesSubplot at 0x7f8c706b7290>
df_high_end_cus.std_og_t2t_mou_6.quantile(0.99)
1885.1980000000017
max(df_high_end_cus['std_og_t2t_mou_6'])
7366.58
As we can see for the column std_og_t2t_mou_6, there is a significant difference between the maximum value (7366.58) and the 99th percentile (about 1885), which clearly indicates many outliers.
As an outlier-treatment strategy, we keep only the rows that lie below the 99th percentile for each of the ten columns most correlated with churn.
churn_corr_cols = ['std_og_mou_6','std_og_t2m_mou_6','std_og_t2t_mou_6','roam_og_mou_7','roam_og_mou_8','total_og_mou_6',
'onnet_mou_6','roam_ic_mou_7','total_rech_num_6','roam_ic_mou_8']
# filter out rows above the 99th percentile for each of the top-10 churn-correlated columns
for col in churn_corr_cols:
    df_high_end_cus = df_high_end_cus[df_high_end_cus[col] < df_high_end_cus[col].quantile(.99)]
df_high_end_cus.shape #after maximum possible outlier treatment
(27134, 143)
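A quick check of how much data the 99th-percentile filtering removed:
# rows removed by outlier treatment, relative to the 30,011 high-value customers
rows_removed = 30011 - 27134
print(rows_removed, 'rows removed,', round(rows_removed / 30011 * 100, 2), '% of the data')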
X = df_high_end_cus.drop(columns=['churn'],axis=1)
y = df_high_end_cus['churn']
# split into train and test with ratio of 80% and 20%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
train_size=0.8,
test_size = 0.2, random_state=100)
y_train.value_counts()
0 19739
1 1968
Name: churn, dtype: int64
y_test.value_counts()
0 4910
1 517
Name: churn, dtype: int64
As we can see, there is a clear class imbalance between the customers who have churned and those who have not. So we apply SMOTE (Synthetic Minority Oversampling Technique) to upsample the minority class of the churn column.
from imblearn.over_sampling import SMOTE
Using TensorFlow backend.
# note: ratio= and fit_sample() are the older imbalanced-learn API;
# newer versions use sampling_strategy=1.0 and fit_resample() instead
sm = SMOTE(random_state=27, ratio=1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)
np.bincount(y_train) #19739 rows of each class for the column churn
array([19739, 19739])
# Converting n-arrays to dataframe
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.DataFrame(y_train)
X_train_df.columns = X.columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_df_scaled = scaler.fit_transform(X_train_df)
Model with simple logistic regression using the SAGA solver (a variant of stochastic average gradient descent), which supports both L1 and L2 regularisation; here the default L2 penalty is used.
smote = LogisticRegression(solver='saga').fit(X_train_df_scaled, y_train_df)
X_test = scaler.transform(X_test)
smote_pred = smote.predict(X_test)
print(accuracy_score(y_test, smote_pred))
#0.84
# f1 score
print(f1_score(y_test, smote_pred))
#0.51
print(recall_score(y_test, smote_pred))
#0.83
print(precision_score(y_test, smote_pred))
#0.37
0.8494564215957251
0.5145573380867499
0.8375241779497099
0.3713550600343053
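As a side note, the saga solver can also mix L1 and L2 penalties via an elastic net. A hypothetical variant is sketched below; it assumes a scikit-learn version (0.21 or later) that supports l1_ratio, and the 0.5 mix is arbitrary, not the model evaluated above:
# illustrative only: elastic-net logistic regression with the saga solver
enet_lr = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, max_iter=1000)
# enet_lr.fit(X_train_df_scaled, y_train_df)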
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import explained_variance_score
pca = PCA(n_components=40,random_state=100,svd_solver='randomized')
Xtrain_reduced = pca.fit_transform(X_train_df_scaled)
Xtest_reduced = pca.transform(X_test)
regrpca = LogisticRegression()
# Train the model using the principal components of the transformed training sets
regrpca.fit(Xtrain_reduced, y_train_df)
# Make predictions using the principal components of the transformed testing set
y_pred = regrpca.predict(Xtest_reduced)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# R-squared score: 1 would indicate perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pred))
Mean squared error: 0.19
R2 score: -1.15
sum(pca.explained_variance_ratio_)
0.7986403605500407
#plotting a scree plot
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
print(accuracy_score(y_test, y_pred))
#0.76
# f1 score
print(f1_score(y_test, y_pred))
#0.393
print(recall_score(y_test, y_pred))
#0.814
print(precision_score(y_test, y_pred))
#0.259
0.8148148148148148
0.45646295294753925
0.816247582205029
0.31681681681681684
We see that precision (and hence the F1 score) is quite low for the logistic regression model built on PCA components, while accuracy and recall are comparable to the earlier model.
Let's now try a random forest, starting with the default hyperparameters.
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier
# Running the random forest with default parameters.
rfc = RandomForestClassifier()
# fit
rfc.fit(Xtrain_reduced,y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# Making predictions
predictions = rfc.predict(Xtest_reduced)
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
# Let's check the report of our default model
print(classification_report(y_test,predictions))
precision recall f1-score support
0 0.95 0.94 0.94 4910
1 0.46 0.51 0.49 517
accuracy 0.90 5427
macro avg 0.70 0.72 0.71 5427
weighted avg 0.90 0.90 0.90 5427
# Printing confusion matrix
print(confusion_matrix(y_test,predictions))
[[4600 310]
[ 252 265]]
print(accuracy_score(y_test,predictions))
0.896443707388981
We see that we get an accuracy of about 89.6%, a macro-average precision of 70% and a macro-average recall of 72% when we use a random forest model on the PCA components.
# GridSearchCV to find optimal max_depth
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# specify number of folds for k-fold CV
n_folds = 5
# parameters to build the model on
parameters = {'max_depth': range(2, 20, 5)}
# instantiate the model
rf = RandomForestClassifier()
# fit tree on training data
rf = GridSearchCV(rf, parameters,
cv=n_folds,
scoring="accuracy",return_train_score=True)
rf.fit(Xtrain_reduced, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=None, param_grid={'max_depth': range(2, 20, 5)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='accuracy', verbose=0)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with max_depth
plt.figure()
plt.plot(scores["param_max_depth"],
scores["mean_train_score"],
label="training accuracy")
plt.plot(scores["param_max_depth"],
scores["mean_test_score"],
label="test accuracy")
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
Thus, we see that above a certain depth, both training and test accuracy grow only at a slow, steady rate as max_depth increases.
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# specify number of folds for k-fold CV
n_folds = 5
# parameters to build the model on
parameters = {'n_estimators': range(20, 100, 10)}
# instantiate the model (note we are specifying a max_depth)
rf = RandomForestClassifier(max_depth=6)
# fit tree on training data
rf = GridSearchCV(rf, parameters,
cv=n_folds,
scoring="accuracy",return_train_score=True)
rf.fit(Xtrain_reduced, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=6,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=None,
param_grid={'n_estimators': range(20, 100, 10)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='accuracy', verbose=0)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with n_estimators
plt.figure()
plt.plot(scores["param_n_estimators"],
scores["mean_train_score"],
label="training accuracy")
plt.plot(scores["param_n_estimators"],
scores["mean_test_score"],
label="test accuracy")
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
We see that the optimal value of n_estimators would be in the range of 60 to 90.
# GridSearchCV to find optimal max_features
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# specify number of folds for k-fold CV
n_folds = 5
# parameters to build the model on
parameters = {'max_features': [4, 8, 14, 20, 24]}
# instantiate the model
rf = RandomForestClassifier(max_depth=6)
# fit tree on training data
rf = GridSearchCV(rf, parameters,
cv=n_folds,
scoring="accuracy",return_train_score=True)
rf.fit(Xtrain_reduced, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=6,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=None,
param_grid={'max_features': [4, 8, 14, 20, 24]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='accuracy', verbose=0)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with max_features
plt.figure()
plt.plot(scores["param_max_features"],
scores["mean_train_score"],
label="training accuracy")
plt.plot(scores["param_max_features"],
scores["mean_test_score"],
label="test accuracy")
plt.xlabel("max_features")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
We see that the optimal number of features to consider (max_features) lies between 15 and 20.
Let's now look at the performance of the ensemble as we vary min_samples_split.
# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# specify number of folds for k-fold CV
n_folds = 5
# parameters to build the model on
parameters = {'min_samples_split': range(10, 200, 50)}
# instantiate the model
rf = RandomForestClassifier()
# fit tree on training data
rf = GridSearchCV(rf, parameters,
cv=n_folds,
scoring="accuracy",return_train_score=True)
rf.fit(Xtrain_reduced, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=None,
param_grid={'min_samples_split': range(10, 200, 50)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='accuracy', verbose=0)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with min_samples_split
plt.figure()
plt.plot(scores["param_min_samples_split"],
scores["mean_train_score"],
label="training accuracy")
plt.plot(scores["param_min_samples_split"],
scores["mean_test_score"],
label="test accuracy")
plt.xlabel("min_samples_split")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
We see that accuracy goes down as min_samples_split increases.
# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# specify number of folds for k-fold CV
n_folds = 5
# parameters to build the model on
parameters = {'min_samples_leaf': range(200, 500, 50)}
# instantiate the model
rf = RandomForestClassifier()
# fit tree on training data
rf = GridSearchCV(rf, parameters,
cv=n_folds,
scoring="accuracy",return_train_score=True)
rf.fit(Xtrain_reduced, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=None,
param_grid={'min_samples_leaf': range(200, 500, 50)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='accuracy', verbose=0)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with min_samples_leaf
plt.figure()
plt.plot(scores["param_min_samples_leaf"],
scores["mean_train_score"],
label="training accuracy")
plt.plot(scores["param_min_samples_leaf"],
scores["mean_test_score"],
label="test accuracy")
plt.xlabel("min_samples_leaf")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
# GridSearchCV to find optimal max_features
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# specify number of folds for k-fold CV
n_folds = 5
# parameters to build the model on
parameters = {'max_features': range(5, 30, 5)}
# instantiate the model
rf = RandomForestClassifier()
# fit tree on training data
rf = GridSearchCV(rf, parameters,
cv=n_folds,
scoring="accuracy",return_train_score=True)
rf.fit(Xtrain_reduced, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=None,
param_grid={'max_features': range(5, 30, 5)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='accuracy', verbose=0)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with max_features
plt.figure()
plt.plot(scores["param_max_features"],
scores["mean_train_score"],
label="training accuracy")
plt.plot(scores["param_max_features"],
scores["mean_test_score"],
label="test accuracy")
plt.xlabel("max_features")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
We can now find the optimal hyperparameters using GridSearchCV.
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of the individual searches above
param_grid = {
    'max_depth': [12, 14, 16],
    'min_samples_leaf': range(100, 200, 350),   # step exceeds the range width, so effectively just [100]
    'min_samples_split': range(100, 150, 200),  # likewise effectively just [100]
    'n_estimators': [30, 60, 90],
    'max_features': [10, 15]
}
# Create a base model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
cv = 3, n_jobs = -1,verbose = 1)
# Fit the grid search to the data
grid_search.fit(Xtrain_reduced, y_train)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 4.3min
[Parallel(n_jobs=-1)]: Done 54 out of 54 | elapsed: 5.9min finished
GridSearchCV(cv=3, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn', n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='warn', n_jobs=-1,
param_grid={'max_depth': [12, 14, 16], 'max_features': [10, 15],
'min_samples_leaf': range(100, 200, 350),
'min_samples_split': range(100, 150, 200),
'n_estimators': [30, 60, 90]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=1)
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)
We can get accuracy of 0.8487005420740665 using {'max_depth': 16, 'max_features': 10, 'min_samples_leaf': 100, 'min_samples_split': 100, 'n_estimators': 60}
Fitting the final model with the best parameters obtained from grid search (note that n_estimators is set to 90 below, whereas the grid search selected 60).
# model with the best hyperparameters
from sklearn.ensemble import RandomForestClassifier
rfc_selected_model = RandomForestClassifier(bootstrap=True,
max_depth=16,
min_samples_leaf=100,
min_samples_split=100,
max_features=10,
n_estimators=90)
# fit
rfc_selected_model.fit(Xtrain_reduced,y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=16, max_features=10, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=100, min_samples_split=100,
min_weight_fraction_leaf=0.0, n_estimators=90,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# predictions for the selected model
predictions_final_selected = rfc_selected_model.predict(Xtest_reduced)
# evaluation metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test,predictions_final_selected))
precision recall f1-score support
0 0.97 0.86 0.91 4910
1 0.37 0.75 0.50 517
accuracy 0.85 5427
macro avg 0.67 0.81 0.71 5427
weighted avg 0.91 0.85 0.87 5427
print(confusion_matrix(y_test,predictions_final_selected))
[[4246 664]
[ 127 390]]
print(accuracy_score(y_test,predictions_final_selected))
0.8542472821079786
We see that after tuning the hyperparameters we get an accuracy of 85.42%, a macro-average precision of 67% and a macro-average recall of 81%.
from sklearn.svm import SVC
# Model building
# instantiate an object of class SVC()
# note that we are using cost C=1
svm_model = SVC(C = 1)
# fit
svm_model.fit(Xtrain_reduced, y_train)
# predict
y_pred = svm_model.predict(Xtest_reduced)
# Evaluate the model using confusion matrix
from sklearn import metrics
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
array([[4513, 397],
[ 186, 331]])
# print other metrics
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred))
# precision
print("precision", metrics.precision_score(y_test, y_pred))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred))
accuracy 0.892574166206007
precision 0.45467032967032966
recall 0.6402321083172147
Let's first run a simple k-fold cross-validation to get a sense of the average metrics as computed over multiple folds. The easiest way to do cross-validation is to use the cross_val_score() function.
# creating a KFold object with 5 splits
folds = KFold(n_splits = 5, shuffle = True, random_state = 4)
# instantiating a model with cost=1
svm_model_tuned = SVC(C = 1)
from sklearn.model_selection import cross_val_score
# computing the cross-validation scores
# note that the argument cv takes the 'folds' object, and
# we have specified 'accuracy' as the metric
cv_results = cross_val_score(svm_model_tuned, Xtrain_reduced, y_train, cv = folds, scoring = 'accuracy')
# print 5 accuracies obtained from the 5 folds
print(cv_results)
print("mean accuracy = {}".format(cv_results.mean()))
[0.94224924 0.94136272 0.94034954 0.94528182 0.93742875]
mean accuracy = 0.9413344151615075
Thus, we see that the mean cross-validated accuracy of the SVM model is about 0.94, while on the test set the precision with default parameters is 0.45 and the recall is 0.64.
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
xgb_model.fit(Xtrain_reduced,y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)
y_pred = xgb_model.predict(Xtest_reduced)
accuracy_score(y_pred,y_test)
0.8395061728395061
precision_score(y_pred,y_test)
0.781431334622824
recall_score(y_pred,y_test)
0.3476764199655766
From the XGBoost classifier with PCA components we get an accuracy of 83.9%, a reported precision of 78.1% and a reported recall of 34.8%. (Note that the metric calls above pass (y_pred, y_test) rather than sklearn's (y_true, y_pred) order, so the precision and recall labels are effectively swapped.)
Thus, among the models built on PCA components above, the random forest has performed best in terms of the balance of recall and precision.
Recall is important here, as it measures how many of the customers who will actually churn are identified by the model. Since the company focuses on retaining its high-value customers, a high recall is important. Precision, on the other hand, measures how many of the customers predicted to churn actually do churn. Even if the model flags some customers who would not actually churn, that is less harmful than failing to identify a customer who would.
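To make that concrete, the churn-class recall and precision can be read straight off the tuned random forest's confusion matrix shown earlier ([[4246, 664], [127, 390]]):
# worked example from the confusion matrix above
tn, fp, fn, tp = 4246, 664, 127, 390
recall = tp / (tp + fn)       # 390 / 517  ~ 0.75: share of actual churners the model catches
precision = tp / (tp + fp)    # 390 / 1054 ~ 0.37: share of predicted churners who actually churn
print(round(recall, 2), round(precision, 2))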
final_pred = pd.Series(predictions_final_selected)
final_pred.value_counts()
0 4373
1 1054
dtype: int64
Thus, about 19.4% (1,054 of 5,427) of the high-value customers in the test data are predicted to churn.