Jovian
⭐️
Sign In
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import pandas_profiling as pp
import sys
!{sys.executable} -m pip install pandas-profiling

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

df = pd.read_csv('telecom_churn_data.csv')
'C:\Users\Arijit' is not recognized as an internal or external command, operable program or batch file.
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
df.shape
Out[3]:
(99999, 226)

Original Dataset has 99999 rows and 226 columns

In [4]:
df.head()
Out[4]:
In [5]:
df.describe()
Out[5]:

Since, the analysis is for high value customers,hence we should look for customers who pay an amount greater than 70 th percentile of average recharge done in the first two months(6,7)

In [6]:
# prepare a derived column for high return customer based on condition given
# prepare a column for usage of data ..separate for 2g and 3g 
# prepare a column for usage of mobile data..3g use max or 2g max?
# customer spends mostly call services / data services ? (done)

In [7]:
df['avg_rech_good_month']= (df['total_rech_amt_6']+df['total_rech_amt_7'])/2
In [8]:
df['avg_rech_good_month'].quantile(0.7)
Out[8]:
368.5
In [9]:
df_high_end_cus= df[df['avg_rech_good_month']>=368.5]
In [10]:
df_high_end_cus.shape
Out[10]:
(30011, 227)
In [11]:
df_high_end_cus.isna().sum()/len(df_high_end_cus) *100  # % of missing data /NA Values in the dataframe
Out[11]:
mobile_number                0.000000
circle_id                    0.000000
loc_og_t2o_mou               0.379861
std_og_t2o_mou               0.379861
loc_ic_t2o_mou               0.379861
last_date_of_month_6         0.000000
last_date_of_month_7         0.103295
last_date_of_month_8         0.523142
last_date_of_month_9         1.199560
arpu_6                       0.000000
arpu_7                       0.000000
arpu_8                       0.000000
arpu_9                       0.000000
onnet_mou_6                  1.052947
onnet_mou_7                  1.009630
onnet_mou_8                  3.125521
onnet_mou_9                  5.677918
offnet_mou_6                 1.052947
offnet_mou_7                 1.009630
offnet_mou_8                 3.125521
offnet_mou_9                 5.677918
roam_ic_mou_6                1.052947
roam_ic_mou_7                1.009630
roam_ic_mou_8                3.125521
roam_ic_mou_9                5.677918
roam_og_mou_6                1.052947
roam_og_mou_7                1.009630
roam_og_mou_8                3.125521
roam_og_mou_9                5.677918
loc_og_t2t_mou_6             1.052947
loc_og_t2t_mou_7             1.009630
loc_og_t2t_mou_8             3.125521
loc_og_t2t_mou_9             5.677918
loc_og_t2m_mou_6             1.052947
loc_og_t2m_mou_7             1.009630
loc_og_t2m_mou_8             3.125521
loc_og_t2m_mou_9             5.677918
loc_og_t2f_mou_6             1.052947
loc_og_t2f_mou_7             1.009630
loc_og_t2f_mou_8             3.125521
loc_og_t2f_mou_9             5.677918
loc_og_t2c_mou_6             1.052947
loc_og_t2c_mou_7             1.009630
loc_og_t2c_mou_8             3.125521
loc_og_t2c_mou_9             5.677918
loc_og_mou_6                 1.052947
loc_og_mou_7                 1.009630
loc_og_mou_8                 3.125521
loc_og_mou_9                 5.677918
std_og_t2t_mou_6             1.052947
std_og_t2t_mou_7             1.009630
std_og_t2t_mou_8             3.125521
std_og_t2t_mou_9             5.677918
std_og_t2m_mou_6             1.052947
std_og_t2m_mou_7             1.009630
std_og_t2m_mou_8             3.125521
std_og_t2m_mou_9             5.677918
std_og_t2f_mou_6             1.052947
std_og_t2f_mou_7             1.009630
std_og_t2f_mou_8             3.125521
std_og_t2f_mou_9             5.677918
std_og_t2c_mou_6             1.052947
std_og_t2c_mou_7             1.009630
std_og_t2c_mou_8             3.125521
std_og_t2c_mou_9             5.677918
std_og_mou_6                 1.052947
std_og_mou_7                 1.009630
std_og_mou_8                 3.125521
std_og_mou_9                 5.677918
isd_og_mou_6                 1.052947
isd_og_mou_7                 1.009630
isd_og_mou_8                 3.125521
isd_og_mou_9                 5.677918
spl_og_mou_6                 1.052947
spl_og_mou_7                 1.009630
spl_og_mou_8                 3.125521
spl_og_mou_9                 5.677918
og_others_6                  1.052947
og_others_7                  1.009630
og_others_8                  3.125521
og_others_9                  5.677918
total_og_mou_6               0.000000
total_og_mou_7               0.000000
total_og_mou_8               0.000000
total_og_mou_9               0.000000
loc_ic_t2t_mou_6             1.052947
loc_ic_t2t_mou_7             1.009630
loc_ic_t2t_mou_8             3.125521
loc_ic_t2t_mou_9             5.677918
loc_ic_t2m_mou_6             1.052947
loc_ic_t2m_mou_7             1.009630
loc_ic_t2m_mou_8             3.125521
loc_ic_t2m_mou_9             5.677918
loc_ic_t2f_mou_6             1.052947
loc_ic_t2f_mou_7             1.009630
loc_ic_t2f_mou_8             3.125521
loc_ic_t2f_mou_9             5.677918
loc_ic_mou_6                 1.052947
loc_ic_mou_7                 1.009630
loc_ic_mou_8                 3.125521
loc_ic_mou_9                 5.677918
std_ic_t2t_mou_6             1.052947
std_ic_t2t_mou_7             1.009630
std_ic_t2t_mou_8             3.125521
std_ic_t2t_mou_9             5.677918
std_ic_t2m_mou_6             1.052947
std_ic_t2m_mou_7             1.009630
std_ic_t2m_mou_8             3.125521
std_ic_t2m_mou_9             5.677918
std_ic_t2f_mou_6             1.052947
std_ic_t2f_mou_7             1.009630
std_ic_t2f_mou_8             3.125521
std_ic_t2f_mou_9             5.677918
std_ic_t2o_mou_6             1.052947
std_ic_t2o_mou_7             1.009630
std_ic_t2o_mou_8             3.125521
std_ic_t2o_mou_9             5.677918
std_ic_mou_6                 1.052947
std_ic_mou_7                 1.009630
std_ic_mou_8                 3.125521
std_ic_mou_9                 5.677918
total_ic_mou_6               0.000000
total_ic_mou_7               0.000000
total_ic_mou_8               0.000000
total_ic_mou_9               0.000000
spl_ic_mou_6                 1.052947
spl_ic_mou_7                 1.009630
spl_ic_mou_8                 3.125521
spl_ic_mou_9                 5.677918
isd_ic_mou_6                 1.052947
isd_ic_mou_7                 1.009630
isd_ic_mou_8                 3.125521
isd_ic_mou_9                 5.677918
ic_others_6                  1.052947
ic_others_7                  1.009630
ic_others_8                  3.125521
ic_others_9                  5.677918
total_rech_num_6             0.000000
total_rech_num_7             0.000000
total_rech_num_8             0.000000
total_rech_num_9             0.000000
total_rech_amt_6             0.000000
total_rech_amt_7             0.000000
total_rech_amt_8             0.000000
total_rech_amt_9             0.000000
max_rech_amt_6               0.000000
max_rech_amt_7               0.000000
max_rech_amt_8               0.000000
max_rech_amt_9               0.000000
date_of_last_rech_6          0.206591
date_of_last_rech_7          0.379861
date_of_last_rech_8          1.979274
date_of_last_rech_9          2.885609
last_day_rch_amt_6           0.000000
last_day_rch_amt_7           0.000000
last_day_rch_amt_8           0.000000
last_day_rch_amt_9           0.000000
date_of_last_rech_data_6    62.023925
date_of_last_rech_data_7    61.140915
date_of_last_rech_data_8    60.834361
date_of_last_rech_data_9    61.810669
total_rech_data_6           62.023925
total_rech_data_7           61.140915
total_rech_data_8           60.834361
total_rech_data_9           61.810669
max_rech_data_6             62.023925
max_rech_data_7             61.140915
max_rech_data_8             60.834361
max_rech_data_9             61.810669
count_rech_2g_6             62.023925
count_rech_2g_7             61.140915
count_rech_2g_8             60.834361
count_rech_2g_9             61.810669
count_rech_3g_6             62.023925
count_rech_3g_7             61.140915
count_rech_3g_8             60.834361
count_rech_3g_9             61.810669
av_rech_amt_data_6          62.023925
av_rech_amt_data_7          61.140915
av_rech_amt_data_8          60.834361
av_rech_amt_data_9          61.810669
vol_2g_mb_6                  0.000000
vol_2g_mb_7                  0.000000
vol_2g_mb_8                  0.000000
vol_2g_mb_9                  0.000000
vol_3g_mb_6                  0.000000
vol_3g_mb_7                  0.000000
vol_3g_mb_8                  0.000000
vol_3g_mb_9                  0.000000
arpu_3g_6                   62.023925
arpu_3g_7                   61.140915
arpu_3g_8                   60.834361
arpu_3g_9                   61.810669
arpu_2g_6                   62.023925
arpu_2g_7                   61.140915
arpu_2g_8                   60.834361
arpu_2g_9                   61.810669
night_pck_user_6            62.023925
night_pck_user_7            61.140915
night_pck_user_8            60.834361
night_pck_user_9            61.810669
monthly_2g_6                 0.000000
monthly_2g_7                 0.000000
monthly_2g_8                 0.000000
monthly_2g_9                 0.000000
sachet_2g_6                  0.000000
sachet_2g_7                  0.000000
sachet_2g_8                  0.000000
sachet_2g_9                  0.000000
monthly_3g_6                 0.000000
monthly_3g_7                 0.000000
monthly_3g_8                 0.000000
monthly_3g_9                 0.000000
sachet_3g_6                  0.000000
sachet_3g_7                  0.000000
sachet_3g_8                  0.000000
sachet_3g_9                  0.000000
fb_user_6                   62.023925
fb_user_7                   61.140915
fb_user_8                   60.834361
fb_user_9                   61.810669
aon                          0.000000
aug_vbc_3g                   0.000000
jul_vbc_3g                   0.000000
jun_vbc_3g                   0.000000
sep_vbc_3g                   0.000000
avg_rech_good_month          0.000000
dtype: float64

As we see that few columns like date_of_last_rech_data_6,date_of_last_rech_data_7,date_of_last_rech_data_8, date_of_last_rech_data_9,total_rech_data_6,total_rech_data_7,total_rech_data_8,total_rech_data_9, max_rech_data_6,max_rech_data_7,max_rech_data_8,max_rech_data_9,count_rech_2g_6,count_rech_2g_7, count_rech_2g_8,count_rech_2g_9,count_rech_3g_6,count_rech_3g_7,count_rech_3g_8,count_rech_3g_9, av_rech_amt_data_6,av_rech_amt_data_7,av_rech_amt_data_8,av_rech_amt_data_9,arpu_3g_6,arpu_3g_7, arpu_3g_8,arpu_3g_9,fb_user_6,fb_user_7,fb_user_8,fb_user_9 have high null values , i.e above 50 percent . Hence, we can drop them based on their significance.

For columns like fb_user_6,fb_user_7,fb_user_8,fb_user_9, as because usage of facebook might not be a factor for deciding whether customer will churn or not.

In [12]:
df_high_end_cus = df_high_end_cus.drop(columns=['fb_user_6','fb_user_7','fb_user_8','fb_user_9'],axis=1)

For columns like count_rech_2g_6,count_rech_2g_7,count_rech_2g_8,count_rech_2g_9, count_rech_3g_6, count_rech_3g_7,count_rech_3g_8,count_rech_3g_9, we should not remove these columns as they might be helpful in predicting churning of customers.

For columns like max_rech_data_9,max_rech_data_8,max_rech_data_7,max_rech_data_6, total_rech_data_9, total_rech_data_8,total_rech_data_7,total_rech_data_6, we retain recharge data.

For columns like arpu_3g_9,arpu_3g_8,arpu_3g_7,arpu_3g_6, arpu_2g_9, arpu_2g_8,arpu_2g_7,arpu_2g_6, we should retain this column as the average revenue of internet users can also help in predicting churning.

Similarly for other columns with high null values , we remove the columns

In [13]:
cols_to_retain = ['count_rech_2g_6','count_rech_2g_7','count_rech_2g_8','count_rech_2g_9','count_rech_3g_6',
                 'count_rech_3g_7','count_rech_3g_8','count_rech_3g_9','max_rech_data_9','max_rech_data_8',
                 'max_rech_data_7','max_rech_data_6','total_rech_data_9','total_rech_data_8','total_rech_data_7',
                 'total_rech_data_6','arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','arpu_3g_9','arpu_3g_8','arpu_3g_7','arpu_3g_6']

df_high_end_cus[cols_to_retain] = df_high_end_cus[cols_to_retain].fillna(df_high_end_cus[cols_to_retain].median())
In [14]:
# for columns like like night pack user or not we can remove those columns as already the recharges have been covered.

df_high_end_cus = df_high_end_cus.drop(columns=['night_pck_user_6','night_pck_user_7','night_pck_user_8',
                                                'night_pck_user_9'],axis=1)

Since , columns are numeric , we can use mean values for respective columns to impute missing values.

In [15]:
df_high_end_cus = df_high_end_cus.fillna(df_high_end_cus.median())
In [16]:
df_high_end_cus.isna().sum() /len(df_high_end_cus) *100
Out[16]:
mobile_number                0.000000
circle_id                    0.000000
loc_og_t2o_mou               0.000000
std_og_t2o_mou               0.000000
loc_ic_t2o_mou               0.000000
last_date_of_month_6         0.000000
last_date_of_month_7         0.103295
last_date_of_month_8         0.523142
last_date_of_month_9         1.199560
arpu_6                       0.000000
arpu_7                       0.000000
arpu_8                       0.000000
arpu_9                       0.000000
onnet_mou_6                  0.000000
onnet_mou_7                  0.000000
onnet_mou_8                  0.000000
onnet_mou_9                  0.000000
offnet_mou_6                 0.000000
offnet_mou_7                 0.000000
offnet_mou_8                 0.000000
offnet_mou_9                 0.000000
roam_ic_mou_6                0.000000
roam_ic_mou_7                0.000000
roam_ic_mou_8                0.000000
roam_ic_mou_9                0.000000
roam_og_mou_6                0.000000
roam_og_mou_7                0.000000
roam_og_mou_8                0.000000
roam_og_mou_9                0.000000
loc_og_t2t_mou_6             0.000000
loc_og_t2t_mou_7             0.000000
loc_og_t2t_mou_8             0.000000
loc_og_t2t_mou_9             0.000000
loc_og_t2m_mou_6             0.000000
loc_og_t2m_mou_7             0.000000
loc_og_t2m_mou_8             0.000000
loc_og_t2m_mou_9             0.000000
loc_og_t2f_mou_6             0.000000
loc_og_t2f_mou_7             0.000000
loc_og_t2f_mou_8             0.000000
loc_og_t2f_mou_9             0.000000
loc_og_t2c_mou_6             0.000000
loc_og_t2c_mou_7             0.000000
loc_og_t2c_mou_8             0.000000
loc_og_t2c_mou_9             0.000000
loc_og_mou_6                 0.000000
loc_og_mou_7                 0.000000
loc_og_mou_8                 0.000000
loc_og_mou_9                 0.000000
std_og_t2t_mou_6             0.000000
std_og_t2t_mou_7             0.000000
std_og_t2t_mou_8             0.000000
std_og_t2t_mou_9             0.000000
std_og_t2m_mou_6             0.000000
std_og_t2m_mou_7             0.000000
std_og_t2m_mou_8             0.000000
std_og_t2m_mou_9             0.000000
std_og_t2f_mou_6             0.000000
std_og_t2f_mou_7             0.000000
std_og_t2f_mou_8             0.000000
std_og_t2f_mou_9             0.000000
std_og_t2c_mou_6             0.000000
std_og_t2c_mou_7             0.000000
std_og_t2c_mou_8             0.000000
std_og_t2c_mou_9             0.000000
std_og_mou_6                 0.000000
std_og_mou_7                 0.000000
std_og_mou_8                 0.000000
std_og_mou_9                 0.000000
isd_og_mou_6                 0.000000
isd_og_mou_7                 0.000000
isd_og_mou_8                 0.000000
isd_og_mou_9                 0.000000
spl_og_mou_6                 0.000000
spl_og_mou_7                 0.000000
spl_og_mou_8                 0.000000
spl_og_mou_9                 0.000000
og_others_6                  0.000000
og_others_7                  0.000000
og_others_8                  0.000000
og_others_9                  0.000000
total_og_mou_6               0.000000
total_og_mou_7               0.000000
total_og_mou_8               0.000000
total_og_mou_9               0.000000
loc_ic_t2t_mou_6             0.000000
loc_ic_t2t_mou_7             0.000000
loc_ic_t2t_mou_8             0.000000
loc_ic_t2t_mou_9             0.000000
loc_ic_t2m_mou_6             0.000000
loc_ic_t2m_mou_7             0.000000
loc_ic_t2m_mou_8             0.000000
loc_ic_t2m_mou_9             0.000000
loc_ic_t2f_mou_6             0.000000
loc_ic_t2f_mou_7             0.000000
loc_ic_t2f_mou_8             0.000000
loc_ic_t2f_mou_9             0.000000
loc_ic_mou_6                 0.000000
loc_ic_mou_7                 0.000000
loc_ic_mou_8                 0.000000
loc_ic_mou_9                 0.000000
std_ic_t2t_mou_6             0.000000
std_ic_t2t_mou_7             0.000000
std_ic_t2t_mou_8             0.000000
std_ic_t2t_mou_9             0.000000
std_ic_t2m_mou_6             0.000000
std_ic_t2m_mou_7             0.000000
std_ic_t2m_mou_8             0.000000
std_ic_t2m_mou_9             0.000000
std_ic_t2f_mou_6             0.000000
std_ic_t2f_mou_7             0.000000
std_ic_t2f_mou_8             0.000000
std_ic_t2f_mou_9             0.000000
std_ic_t2o_mou_6             0.000000
std_ic_t2o_mou_7             0.000000
std_ic_t2o_mou_8             0.000000
std_ic_t2o_mou_9             0.000000
std_ic_mou_6                 0.000000
std_ic_mou_7                 0.000000
std_ic_mou_8                 0.000000
std_ic_mou_9                 0.000000
total_ic_mou_6               0.000000
total_ic_mou_7               0.000000
total_ic_mou_8               0.000000
total_ic_mou_9               0.000000
spl_ic_mou_6                 0.000000
spl_ic_mou_7                 0.000000
spl_ic_mou_8                 0.000000
spl_ic_mou_9                 0.000000
isd_ic_mou_6                 0.000000
isd_ic_mou_7                 0.000000
isd_ic_mou_8                 0.000000
isd_ic_mou_9                 0.000000
ic_others_6                  0.000000
ic_others_7                  0.000000
ic_others_8                  0.000000
ic_others_9                  0.000000
total_rech_num_6             0.000000
total_rech_num_7             0.000000
total_rech_num_8             0.000000
total_rech_num_9             0.000000
total_rech_amt_6             0.000000
total_rech_amt_7             0.000000
total_rech_amt_8             0.000000
total_rech_amt_9             0.000000
max_rech_amt_6               0.000000
max_rech_amt_7               0.000000
max_rech_amt_8               0.000000
max_rech_amt_9               0.000000
date_of_last_rech_6          0.206591
date_of_last_rech_7          0.379861
date_of_last_rech_8          1.979274
date_of_last_rech_9          2.885609
last_day_rch_amt_6           0.000000
last_day_rch_amt_7           0.000000
last_day_rch_amt_8           0.000000
last_day_rch_amt_9           0.000000
date_of_last_rech_data_6    62.023925
date_of_last_rech_data_7    61.140915
date_of_last_rech_data_8    60.834361
date_of_last_rech_data_9    61.810669
total_rech_data_6            0.000000
total_rech_data_7            0.000000
total_rech_data_8            0.000000
total_rech_data_9            0.000000
max_rech_data_6              0.000000
max_rech_data_7              0.000000
max_rech_data_8              0.000000
max_rech_data_9              0.000000
count_rech_2g_6              0.000000
count_rech_2g_7              0.000000
count_rech_2g_8              0.000000
count_rech_2g_9              0.000000
count_rech_3g_6              0.000000
count_rech_3g_7              0.000000
count_rech_3g_8              0.000000
count_rech_3g_9              0.000000
av_rech_amt_data_6           0.000000
av_rech_amt_data_7           0.000000
av_rech_amt_data_8           0.000000
av_rech_amt_data_9           0.000000
vol_2g_mb_6                  0.000000
vol_2g_mb_7                  0.000000
vol_2g_mb_8                  0.000000
vol_2g_mb_9                  0.000000
vol_3g_mb_6                  0.000000
vol_3g_mb_7                  0.000000
vol_3g_mb_8                  0.000000
vol_3g_mb_9                  0.000000
arpu_3g_6                    0.000000
arpu_3g_7                    0.000000
arpu_3g_8                    0.000000
arpu_3g_9                    0.000000
arpu_2g_6                    0.000000
arpu_2g_7                    0.000000
arpu_2g_8                    0.000000
arpu_2g_9                    0.000000
monthly_2g_6                 0.000000
monthly_2g_7                 0.000000
monthly_2g_8                 0.000000
monthly_2g_9                 0.000000
sachet_2g_6                  0.000000
sachet_2g_7                  0.000000
sachet_2g_8                  0.000000
sachet_2g_9                  0.000000
monthly_3g_6                 0.000000
monthly_3g_7                 0.000000
monthly_3g_8                 0.000000
monthly_3g_9                 0.000000
sachet_3g_6                  0.000000
sachet_3g_7                  0.000000
sachet_3g_8                  0.000000
sachet_3g_9                  0.000000
aon                          0.000000
aug_vbc_3g                   0.000000
jul_vbc_3g                   0.000000
jun_vbc_3g                   0.000000
sep_vbc_3g                   0.000000
avg_rech_good_month          0.000000
dtype: float64

For dates we have to handle data bit differently date_of_last_rech_data_9,date_of_last_rech_data_8, date_of_last_rech_data_7,date_of_last_rech_data_6,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,last_date_of_month_9.. we separate the dates as separate column for each month

In [17]:
date_cols = ['date_of_last_rech_data_9','date_of_last_rech_data_8','date_of_last_rech_data_7','date_of_last_rech_data_6',
               'last_date_of_month_6','last_date_of_month_7','last_date_of_month_8','last_date_of_month_9',
            'date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']
In [18]:
#df_high_end_cus[date_cols] = df_high_end_cus[date_cols].astype('datetime64[ns]')
In [19]:
df_high_end_cus[date_cols].isna().sum()
Out[19]:
date_of_last_rech_data_9    18550
date_of_last_rech_data_8    18257
date_of_last_rech_data_7    18349
date_of_last_rech_data_6    18614
last_date_of_month_6            0
last_date_of_month_7           31
last_date_of_month_8          157
last_date_of_month_9          360
date_of_last_rech_6            62
date_of_last_rech_7           114
date_of_last_rech_8           594
date_of_last_rech_9           866
dtype: int64
In [20]:
#extracting days from the datetime format columns
df_high_end_cus['date_last_rech_data_9'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_9']).dt.day
df_high_end_cus['date_last_rech_data_8'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_8']).dt.day
df_high_end_cus['date_last_rech_data_7'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_7']).dt.day
df_high_end_cus['date_last_rech_data_6'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_6']).dt.day
df_high_end_cus['date_last_rech_6'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_6']).dt.day
df_high_end_cus['date_last_rech_7'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_7']).dt.day
df_high_end_cus['date_last_rech_8'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_8']).dt.day
df_high_end_cus['date_last_rech_9'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_9']).dt.day
df_high_end_cus['last_date_@_month_6'] = pd.to_datetime(df_high_end_cus['last_date_of_month_6']).dt.day
df_high_end_cus['last_date_@_month_7'] = pd.to_datetime(df_high_end_cus['last_date_of_month_7']).dt.day
df_high_end_cus['last_date_@_month_8'] = pd.to_datetime(df_high_end_cus['last_date_of_month_8']).dt.day
df_high_end_cus['last_date_@_month_9'] = pd.to_datetime(df_high_end_cus['last_date_of_month_9']).dt.day

In [21]:
df_high_end_cus = df_high_end_cus.drop(columns = date_cols,axis=1) # sropping all the original date-time columns
In [22]:
df_high_end_cus = df_high_end_cus.fillna(df_high_end_cus.median()) # filling columns with days ..with median 
In [23]:
df_high_end_cus.isna().sum()
Out[23]:
mobile_number            0
circle_id                0
loc_og_t2o_mou           0
std_og_t2o_mou           0
loc_ic_t2o_mou           0
arpu_6                   0
arpu_7                   0
arpu_8                   0
arpu_9                   0
onnet_mou_6              0
onnet_mou_7              0
onnet_mou_8              0
onnet_mou_9              0
offnet_mou_6             0
offnet_mou_7             0
offnet_mou_8             0
offnet_mou_9             0
roam_ic_mou_6            0
roam_ic_mou_7            0
roam_ic_mou_8            0
roam_ic_mou_9            0
roam_og_mou_6            0
roam_og_mou_7            0
roam_og_mou_8            0
roam_og_mou_9            0
loc_og_t2t_mou_6         0
loc_og_t2t_mou_7         0
loc_og_t2t_mou_8         0
loc_og_t2t_mou_9         0
loc_og_t2m_mou_6         0
loc_og_t2m_mou_7         0
loc_og_t2m_mou_8         0
loc_og_t2m_mou_9         0
loc_og_t2f_mou_6         0
loc_og_t2f_mou_7         0
loc_og_t2f_mou_8         0
loc_og_t2f_mou_9         0
loc_og_t2c_mou_6         0
loc_og_t2c_mou_7         0
loc_og_t2c_mou_8         0
loc_og_t2c_mou_9         0
loc_og_mou_6             0
loc_og_mou_7             0
loc_og_mou_8             0
loc_og_mou_9             0
std_og_t2t_mou_6         0
std_og_t2t_mou_7         0
std_og_t2t_mou_8         0
std_og_t2t_mou_9         0
std_og_t2m_mou_6         0
std_og_t2m_mou_7         0
std_og_t2m_mou_8         0
std_og_t2m_mou_9         0
std_og_t2f_mou_6         0
std_og_t2f_mou_7         0
std_og_t2f_mou_8         0
std_og_t2f_mou_9         0
std_og_t2c_mou_6         0
std_og_t2c_mou_7         0
std_og_t2c_mou_8         0
std_og_t2c_mou_9         0
std_og_mou_6             0
std_og_mou_7             0
std_og_mou_8             0
std_og_mou_9             0
isd_og_mou_6             0
isd_og_mou_7             0
isd_og_mou_8             0
isd_og_mou_9             0
spl_og_mou_6             0
spl_og_mou_7             0
spl_og_mou_8             0
spl_og_mou_9             0
og_others_6              0
og_others_7              0
og_others_8              0
og_others_9              0
total_og_mou_6           0
total_og_mou_7           0
total_og_mou_8           0
total_og_mou_9           0
loc_ic_t2t_mou_6         0
loc_ic_t2t_mou_7         0
loc_ic_t2t_mou_8         0
loc_ic_t2t_mou_9         0
loc_ic_t2m_mou_6         0
loc_ic_t2m_mou_7         0
loc_ic_t2m_mou_8         0
loc_ic_t2m_mou_9         0
loc_ic_t2f_mou_6         0
loc_ic_t2f_mou_7         0
loc_ic_t2f_mou_8         0
loc_ic_t2f_mou_9         0
loc_ic_mou_6             0
loc_ic_mou_7             0
loc_ic_mou_8             0
loc_ic_mou_9             0
std_ic_t2t_mou_6         0
std_ic_t2t_mou_7         0
std_ic_t2t_mou_8         0
std_ic_t2t_mou_9         0
std_ic_t2m_mou_6         0
std_ic_t2m_mou_7         0
std_ic_t2m_mou_8         0
std_ic_t2m_mou_9         0
std_ic_t2f_mou_6         0
std_ic_t2f_mou_7         0
std_ic_t2f_mou_8         0
std_ic_t2f_mou_9         0
std_ic_t2o_mou_6         0
std_ic_t2o_mou_7         0
std_ic_t2o_mou_8         0
std_ic_t2o_mou_9         0
std_ic_mou_6             0
std_ic_mou_7             0
std_ic_mou_8             0
std_ic_mou_9             0
total_ic_mou_6           0
total_ic_mou_7           0
total_ic_mou_8           0
total_ic_mou_9           0
spl_ic_mou_6             0
spl_ic_mou_7             0
spl_ic_mou_8             0
spl_ic_mou_9             0
isd_ic_mou_6             0
isd_ic_mou_7             0
isd_ic_mou_8             0
isd_ic_mou_9             0
ic_others_6              0
ic_others_7              0
ic_others_8              0
ic_others_9              0
total_rech_num_6         0
total_rech_num_7         0
total_rech_num_8         0
total_rech_num_9         0
total_rech_amt_6         0
total_rech_amt_7         0
total_rech_amt_8         0
total_rech_amt_9         0
max_rech_amt_6           0
max_rech_amt_7           0
max_rech_amt_8           0
max_rech_amt_9           0
last_day_rch_amt_6       0
last_day_rch_amt_7       0
last_day_rch_amt_8       0
last_day_rch_amt_9       0
total_rech_data_6        0
total_rech_data_7        0
total_rech_data_8        0
total_rech_data_9        0
max_rech_data_6          0
max_rech_data_7          0
max_rech_data_8          0
max_rech_data_9          0
count_rech_2g_6          0
count_rech_2g_7          0
count_rech_2g_8          0
count_rech_2g_9          0
count_rech_3g_6          0
count_rech_3g_7          0
count_rech_3g_8          0
count_rech_3g_9          0
av_rech_amt_data_6       0
av_rech_amt_data_7       0
av_rech_amt_data_8       0
av_rech_amt_data_9       0
vol_2g_mb_6              0
vol_2g_mb_7              0
vol_2g_mb_8              0
vol_2g_mb_9              0
vol_3g_mb_6              0
vol_3g_mb_7              0
vol_3g_mb_8              0
vol_3g_mb_9              0
arpu_3g_6                0
arpu_3g_7                0
arpu_3g_8                0
arpu_3g_9                0
arpu_2g_6                0
arpu_2g_7                0
arpu_2g_8                0
arpu_2g_9                0
monthly_2g_6             0
monthly_2g_7             0
monthly_2g_8             0
monthly_2g_9             0
sachet_2g_6              0
sachet_2g_7              0
sachet_2g_8              0
sachet_2g_9              0
monthly_3g_6             0
monthly_3g_7             0
monthly_3g_8             0
monthly_3g_9             0
sachet_3g_6              0
sachet_3g_7              0
sachet_3g_8              0
sachet_3g_9              0
aon                      0
aug_vbc_3g               0
jul_vbc_3g               0
jun_vbc_3g               0
sep_vbc_3g               0
avg_rech_good_month      0
date_last_rech_data_9    0
date_last_rech_data_8    0
date_last_rech_data_7    0
date_last_rech_data_6    0
date_last_rech_6         0
date_last_rech_7         0
date_last_rech_8         0
date_last_rech_9         0
last_date_@_month_6      0
last_date_@_month_7      0
last_date_@_month_8      0
last_date_@_month_9      0
dtype: int64
In [24]:
df_high_end_cus.shape # final shape of dataset after null value treatment
Out[24]:
(30011, 219)
In [25]:
churn_attrs = ['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']


In [26]:
df_high_end_cus['churn'] = df_high_end_cus.apply(lambda row:1 if ((row.total_ic_mou_9 == 0 or row.total_og_mou_9==0)
                                                                  and (row.vol_2g_mb_9 == 0 or row.vol_3g_mb_9 ==0))
                                                                else 0 , axis = 1)
    
In [27]:
df_high_end_cus['churn'].value_counts()
Out[27]:
0    26964
1     3047
Name: churn, dtype: int64

so , as we see a customer churns churns if he/she does not any call service(incoming/outgoing) or internet services (2G/3G). Based on the interpretation above :

  1. 3047 churned customers
  2. 26964 non-churned customers
In [28]:
# Now , we drop all the columns that act as data from the churn month - i.e month of september(9)
In [29]:
drop_churn_cols = []
col_list = df_high_end_cus.columns.values
for x in range(len(col_list)):
    if(col_list[x][-1]=='9'):
        drop_churn_cols.append(col_list[x])
In [30]:
drop_churn_cols 
Out[30]:
['arpu_9',
 'onnet_mou_9',
 'offnet_mou_9',
 'roam_ic_mou_9',
 'roam_og_mou_9',
 'loc_og_t2t_mou_9',
 'loc_og_t2m_mou_9',
 'loc_og_t2f_mou_9',
 'loc_og_t2c_mou_9',
 'loc_og_mou_9',
 'std_og_t2t_mou_9',
 'std_og_t2m_mou_9',
 'std_og_t2f_mou_9',
 'std_og_t2c_mou_9',
 'std_og_mou_9',
 'isd_og_mou_9',
 'spl_og_mou_9',
 'og_others_9',
 'total_og_mou_9',
 'loc_ic_t2t_mou_9',
 'loc_ic_t2m_mou_9',
 'loc_ic_t2f_mou_9',
 'loc_ic_mou_9',
 'std_ic_t2t_mou_9',
 'std_ic_t2m_mou_9',
 'std_ic_t2f_mou_9',
 'std_ic_t2o_mou_9',
 'std_ic_mou_9',
 'total_ic_mou_9',
 'spl_ic_mou_9',
 'isd_ic_mou_9',
 'ic_others_9',
 'total_rech_num_9',
 'total_rech_amt_9',
 'max_rech_amt_9',
 'last_day_rch_amt_9',
 'total_rech_data_9',
 'max_rech_data_9',
 'count_rech_2g_9',
 'count_rech_3g_9',
 'av_rech_amt_data_9',
 'vol_2g_mb_9',
 'vol_3g_mb_9',
 'arpu_3g_9',
 'arpu_2g_9',
 'monthly_2g_9',
 'sachet_2g_9',
 'monthly_3g_9',
 'sachet_3g_9',
 'date_last_rech_data_9',
 'date_last_rech_9',
 'last_date_@_month_9']
In [31]:
df_high_end_cus=df_high_end_cus.drop(drop_churn_cols,axis=1)
In [32]:
df_high_end_cus.shape #final dataframe after removing all churn month attributes and null value treated
Out[32]:
(30011, 168)
In [33]:
#Get Correlation of "Churn" with other variables:
plt.figure(figsize=(30,10))
df_high_end_cus.corr()['churn'].sort_values(ascending = False).plot(kind='bar')
plt.show()
Notebook Image
In [34]:
ax = sns.distplot(df_high_end_cus['arpu_6'], hist=True, kde=False, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('Churn')
ax.set_xlabel('Average revenue per user in july')
ax.set_title('Relation between revenue and churn rate')
Out[34]:
Text(0.5, 1.0, 'Relation between revenue and churn rate')
Notebook Image
In [35]:
ax = sns.distplot(df_high_end_cus['arpu_7'], hist=True, kde=False, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('Churn')
ax.set_xlabel('Average revenue per user in August')
ax.set_title('Relation between revenue and churn rate')
Out[35]:
Text(0.5, 1.0, 'Relation between revenue and churn rate')
Notebook Image
In [36]:
df_high_end_cus=df_high_end_cus.drop(columns=['mobile_number','circle_id'])

We can drop column mobile_number,circle_id as it does not help in predicting churning

In [37]:
df_high_end_cus.corr()

Out[37]:

We see above from the above heatmap that some of the correlation have Null Values(whitespaces) as these columns have same value and do not change, hence the standard deviation of these variables is 0 , which leads to Null values of correlation.

Factor Analysis to check for multicollinearity
In [38]:
from sklearn.decomposition import FactorAnalysis
FA = FactorAnalysis(n_components = 3).fit_transform(df_high_end_cus.values)
In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA[:,0], FA[:,1])
plt.scatter(FA[:,1], FA[:,2])
plt.scatter(FA[:,2], FA[:,0])


Out[39]:
<matplotlib.collections.PathCollection at 0x228902ed4c8>
Notebook Image

We see that, the data is highly correlated and hence, we have to remove multicollinearity. Here, every group of factor is a set of highly correlated variables/columns

In [40]:
#report = pp.ProfileReport(df_high_end_cus)
#report.to_file('output_report.html')

Dropping variables with high multicollinearity as per report from pandas-profile.

In [41]:
df_high_end_cus = df_high_end_cus.drop(columns=['arpu_3g_6','arpu_3g_7','arpu_3g_8','isd_og_mou_7','isd_og_mou_8',
                                               'sachet_2g_6','sachet_2g_7','sachet_2g_8','total_rech_amt_6',
                                               'total_rech_amt_7','total_rech_amt_8'])

Dropping variables with constant values as per report from pandas-profile.

In [42]:
df_high_end_cus = df_high_end_cus.drop(columns=['last_date_@_month_6','last_date_@_month_7','last_date_@_month_8',
                                               'loc_ic_t2o_mou','loc_og_t2o_mou','std_ic_t2o_mou_6','std_ic_t2o_mou_7',
                                               'std_ic_t2o_mou_8','std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8',
                                               'std_og_t2o_mou'])
In [43]:
df_high_end_cus.shape
Out[43]:
(30011, 143)

Now, identifying skewed columns and do the necessary transformations.

In [44]:
sns.distplot(df_high_end_cus['avg_rech_good_month'])
plt.show()
Notebook Image
In [45]:
#since, the column avg_rech_good_month is highly skewed, we can do log transformation.
In [46]:
df_high_end_cus['avg_rech_good_month_trnsfrm'] = np.log(df_high_end_cus['avg_rech_good_month'])
df_high_end_cus = df_high_end_cus.drop(columns=['avg_rech_good_month'])
In [47]:
sns.distplot(df_high_end_cus['avg_rech_good_month_trnsfrm'])
plt.show()
Notebook Image
In [48]:
sns.distplot(df_high_end_cus['ic_others_6'])
plt.show()
Notebook Image
In [49]:
df_high_end_cus['ic_others_6_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_6'])
df_high_end_cus['ic_others_8_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_8'])
df_high_end_cus['isd_ic_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['isd_ic_mou_7'])
df_high_end_cus['isd_ic_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['isd_ic_mou_6'])
df_high_end_cus['loc_og_t2c_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['loc_og_t2c_mou_7'])
df_high_end_cus['og_others_7_trnsfrm']=np.sqrt(df_high_end_cus['og_others_7'])
df_high_end_cus['og_others_8_trnsfrm']=np.sqrt(df_high_end_cus['og_others_8'])
df_high_end_cus['spl_ic_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['spl_ic_mou_6'])
df_high_end_cus['spl_ic_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['spl_ic_mou_7'])
df_high_end_cus['std_ic_t2f_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2f_mou_6'])
df_high_end_cus['std_ic_t2f_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2f_mou_7'])
df_high_end_cus['std_ic_t2f_mou_8_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2f_mou_8'])
df_high_end_cus['std_ic_t2t_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2t_mou_6'])
df_high_end_cus['std_ic_t2t_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2t_mou_7'])
df_high_end_cus['std_ic_t2t_mou_8_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2t_mou_8'])

cols_skewed_sqrt = ['ic_others_6','ic_others_8','isd_ic_mou_7','isd_ic_mou_6','loc_og_t2c_mou_7','og_others_7',
                   'og_others_8','spl_ic_mou_6','spl_ic_mou_7','std_ic_t2f_mou_6','std_ic_t2f_mou_7','std_ic_t2f_mou_8',
                   'std_ic_t2t_mou_6','std_ic_t2t_mou_7','std_ic_t2t_mou_8']


df_high_end_cus = df_high_end_cus.drop(columns=cols_skewed_sqrt,axis=1)
In [50]:
#report_1 = pp.ProfileReport(df_high_end_cus)
#report_1.to_file('output_report_2.html')
In [51]:
df_high_end_cus['ic_others_7_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_7'])
df_high_end_cus['isd_og_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['isd_og_mou_6'])

df_high_end_cus = df_high_end_cus.drop(columns=['ic_others_7','isd_og_mou_6'])

In [52]:
FA_1 = FactorAnalysis(n_components = 3).fit_transform(df_high_end_cus.values)
In [53]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA_1[:,0],FA_1[:,1])
plt.scatter(FA_1[:,1], FA_1[:,2])
plt.scatter(FA_1[:,2],FA_1[:,0])
plt.show()

Notebook Image
In [54]:
#report_2 = pp.ProfileReport(df_high_end_cus)
#report_2.to_file('output_report_3.html')

As we can see from the above factor plot, that the multicollinearity among the datasets have been reduced due to removal of the above mentioned columns as per pandas profiling.

In [55]:
df_high_end_cus.corr()['churn'].sort_values(ascending = False)
Out[55]:
churn                          1.000000
std_og_mou_6                   0.131846
std_og_t2m_mou_6               0.099027
std_og_t2t_mou_6               0.093168
roam_og_mou_7                  0.092717
roam_og_mou_8                  0.072746
total_og_mou_6                 0.072193
onnet_mou_6                    0.071193
roam_ic_mou_7                  0.069398
total_rech_num_6               0.064926
roam_ic_mou_8                  0.063417
roam_og_mou_6                  0.061005
offnet_mou_6                   0.058547
arpu_6                         0.058299
std_og_mou_7                   0.049341
roam_ic_mou_6                  0.045281
std_og_t2m_mou_7               0.039915
std_ic_t2t_mou_6_trnsfrm       0.032413
std_og_t2t_mou_7               0.029499
isd_og_mou_6_trnsfrm           0.027086
og_others_6                    0.023838
max_rech_data_7                0.021388
spl_og_mou_6                   0.019219
max_rech_data_6                0.016762
arpu_2g_6                      0.016439
std_ic_mou_6                   0.014432
max_rech_data_8                0.010959
std_ic_t2m_mou_6               0.009146
avg_rech_good_month_trnsfrm    0.007734
loc_og_t2c_mou_7_trnsfrm       0.007074
last_day_rch_amt_6             0.006301
date_last_rech_6               0.006051
loc_og_t2c_mou_6               0.005826
av_rech_amt_data_6             0.005406
spl_og_mou_7                   0.004479
onnet_mou_7                    0.004469
isd_ic_mou_6_trnsfrm           0.002928
og_others_7_trnsfrm            0.001932
date_last_rech_data_6          0.001892
max_rech_amt_6                 0.000408
sachet_3g_6                   -0.000659
monthly_3g_6                  -0.000746
count_rech_3g_6               -0.000943
date_last_rech_data_8         -0.001495
arpu_2g_7                     -0.001693
sachet_3g_7                   -0.001900
spl_ic_mou_7_trnsfrm          -0.004778
og_others_8_trnsfrm           -0.005165
date_last_rech_data_7         -0.005310
av_rech_amt_data_7            -0.005919
offnet_mou_7                  -0.005927
total_rech_data_7             -0.006471
std_ic_t2t_mou_7_trnsfrm      -0.006982
vol_3g_mb_6                   -0.007034
total_rech_data_6             -0.007746
count_rech_2g_6               -0.009005
vol_2g_mb_6                   -0.010459
ic_others_6_trnsfrm           -0.012010
count_rech_3g_7               -0.013208
spl_ic_mou_6_trnsfrm          -0.013247
total_rech_num_7              -0.017163
std_ic_mou_7                  -0.017219
std_ic_t2m_mou_7              -0.017647
isd_ic_mou_7_trnsfrm          -0.017897
vol_2g_mb_7                   -0.020292
count_rech_2g_7               -0.020302
vol_3g_mb_7                   -0.021298
monthly_3g_7                  -0.022068
std_og_t2f_mou_6              -0.023002
max_rech_amt_7                -0.025411
total_og_mou_7                -0.025698
std_og_t2f_mou_7              -0.025904
sachet_3g_8                   -0.030317
jun_vbc_3g                    -0.031576
arpu_7                        -0.032264
isd_ic_mou_8                  -0.033997
loc_og_t2c_mou_8              -0.035344
total_rech_data_8             -0.035385
ic_others_7_trnsfrm           -0.038166
loc_og_t2t_mou_6              -0.040706
std_og_t2f_mou_8              -0.040923
av_rech_amt_data_8            -0.041120
loc_ic_t2t_mou_6              -0.042433
sep_vbc_3g                    -0.043487
jul_vbc_3g                    -0.046368
count_rech_2g_8               -0.048568
arpu_2g_8                     -0.049299
monthly_2g_6                  -0.049323
last_day_rch_amt_7            -0.050486
loc_og_t2f_mou_6              -0.050514
vol_2g_mb_8                   -0.051534
std_ic_t2f_mou_6_trnsfrm      -0.051998
spl_og_mou_8                  -0.054581
count_rech_3g_8               -0.055715
loc_ic_t2f_mou_6              -0.056275
spl_ic_mou_8                  -0.057730
std_ic_t2f_mou_7_trnsfrm      -0.058405
std_ic_t2m_mou_8              -0.058697
loc_og_t2f_mou_7              -0.059601
monthly_3g_8                  -0.060327
loc_ic_t2f_mou_7              -0.060758
vol_3g_mb_8                   -0.061113
loc_ic_t2t_mou_7              -0.061258
monthly_2g_7                  -0.061820
loc_og_t2t_mou_7              -0.063534
loc_ic_t2m_mou_6              -0.065530
total_ic_mou_6                -0.065879
std_ic_mou_8                  -0.068913
std_og_t2t_mou_8              -0.072706
std_og_t2m_mou_8              -0.073009
ic_others_8_trnsfrm           -0.073157
loc_og_t2m_mou_6              -0.074936
loc_ic_mou_6                  -0.075223
loc_og_mou_6                  -0.076626
aug_vbc_3g                    -0.081413
loc_og_t2f_mou_8              -0.083690
monthly_2g_8                  -0.083851
loc_ic_t2f_mou_8              -0.085501
loc_og_t2t_mou_8              -0.089523
onnet_mou_8                   -0.090815
std_og_mou_8                  -0.092509
std_ic_t2t_mou_8_trnsfrm      -0.096043
loc_ic_t2t_mou_8              -0.097180
std_ic_t2f_mou_8_trnsfrm      -0.097314
loc_ic_t2m_mou_7              -0.101247
loc_ic_mou_7                  -0.110967
loc_og_t2m_mou_7              -0.111377
total_ic_mou_7                -0.113269
loc_og_mou_7                  -0.115236
offnet_mou_8                  -0.119262
last_day_rch_amt_8            -0.125655
date_last_rech_7              -0.134863
max_rech_amt_8                -0.136965
aon                           -0.138380
loc_ic_t2m_mou_8              -0.151994
loc_og_mou_8                  -0.156810
loc_og_t2m_mou_8              -0.157403
loc_ic_mou_8                  -0.163109
total_rech_num_8              -0.167651
date_last_rech_8              -0.190048
arpu_8                        -0.194774
total_og_mou_8                -0.195494
total_ic_mou_8                -0.207454
Name: churn, dtype: float64
In [56]:
sns.boxplot(df_high_end_cus['std_og_t2t_mou_6'])
Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x22890d40508>
Notebook Image
In [57]:
df_high_end_cus.std_og_t2t_mou_6.quantile(0.99)
Out[57]:
1885.1980000000017
In [58]:
max(df_high_end_cus['std_og_t2t_mou_6'])
Out[58]:
7366.58

As we can see that for the column std_og_t2t_mou_6,there is a significant difference between the maximum value and the value of 99 th percentile, this shows there are clearly many outliers.

As a strategy to remove outliers, we retain values till 99th percentile of the top 10 correlated columns with column churn.

In [59]:
churn_corr_cols = ['std_og_mou_6','std_og_t2m_mou_6','std_og_t2t_mou_6','roam_og_mou_7','roam_og_mou_8','total_og_mou_6',
                  'onnet_mou_6','roam_ic_mou_7','total_rech_num_6','roam_ic_mou_8']
In [60]:
df_high_end_cus = df_high_end_cus[df_high_end_cus.std_og_mou_6 < df_high_end_cus.std_og_mou_6.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.std_og_t2m_mou_6 < df_high_end_cus.std_og_t2m_mou_6.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.std_og_t2t_mou_6 < df_high_end_cus.std_og_t2t_mou_6.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.roam_og_mou_7 < df_high_end_cus.roam_og_mou_7.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.roam_og_mou_8 < df_high_end_cus.roam_og_mou_8.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.total_og_mou_6 < df_high_end_cus.total_og_mou_6.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.onnet_mou_6 < df_high_end_cus.onnet_mou_6.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.roam_ic_mou_7 < df_high_end_cus.roam_ic_mou_7.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.total_rech_num_6 < df_high_end_cus.total_rech_num_6.quantile(.99)]
df_high_end_cus = df_high_end_cus[df_high_end_cus.roam_ic_mou_8 < df_high_end_cus.roam_ic_mou_8.quantile(.99)]



In [61]:
df_high_end_cus.shape #after maximum possible outlier treatment
Out[61]:
(27134, 143)

Splitting into train - test data

In [62]:
X = df_high_end_cus.drop(columns=['churn'],axis=1)
y = df_high_end_cus['churn']
In [63]:
# split into train and test with ratio of 80% and 20%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8,
                                                    test_size = 0.2, random_state=100)

Handling Class Imbalance

In [64]:
y_train.value_counts()
Out[64]:
0    19739
1     1968
Name: churn, dtype: int64
In [65]:
y_test.value_counts()
Out[65]:
0    4910
1     517
Name: churn, dtype: int64

As , we see there is a clear class imbalance of the customers who have churned and not churned. So, we apply methods like SMOTE(Synthetic Minority OverSampling Technique) to upsample the minority class of data points in churn column.

In [66]:
!pip install imblearn
from imblearn.over_sampling import SMOTE
Requirement already satisfied: imblearn in c:\users\arijit das\anaconda3\lib\site-packages (0.0) Requirement already satisfied: imbalanced-learn in c:\users\arijit das\anaconda3\lib\site-packages (from imblearn) (0.5.0) Requirement already satisfied: joblib>=0.11 in c:\users\arijit das\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.13.2) Requirement already satisfied: numpy>=1.11 in c:\users\arijit das\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.16.5) Requirement already satisfied: scipy>=0.17 in c:\users\arijit das\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.3.1) Requirement already satisfied: scikit-learn>=0.21 in c:\users\arijit das\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.21.3)
In [67]:
sm = SMOTE(random_state=27, ratio=1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)
In [68]:
np.bincount(y_train) #19739 rows of each class for the column churn
Out[68]:
array([19739, 19739], dtype=int64)
In [69]:
# Converting n-arrays to dataframe
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.DataFrame(y_train)
In [70]:
X_train_df.columns = X.columns

Feature Scaling

In [71]:
from sklearn.preprocessing import StandardScaler
In [72]:
scaler = StandardScaler()

X_train_df_scaled = scaler.fit_transform(X_train_df)

Logistic Regression

Model with simple logistic regression using SAGA (Stochastic Average Gradient descent solver) that includes both L1 and L2 regularisation.

In [73]:
smote = LogisticRegression(solver='saga').fit(X_train_df_scaled, y_train_df)
In [74]:
X_test = scaler.transform(X_test)
In [75]:
smote_pred = smote.predict(X_test)
In [76]:
print(accuracy_score(y_test, smote_pred))
    #0.84

# f1 score
print(f1_score(y_test, smote_pred))
    #0.51

print(recall_score(y_test, smote_pred))
    #0.83
    
print(precision_score(y_test, smote_pred))
   #0.37
0.8494564215957251 0.5145573380867499 0.8375241779497099 0.3713550600343053

Logistic Regression with PCA

In [77]:
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import explained_variance_score
In [78]:
pca = PCA(n_components=40,random_state=100,svd_solver='randomized')
In [79]:
Xtrain_reduced = pca.fit_transform(X_train_df_scaled)
Xtest_reduced = pca.transform(X_test)

regrpca = LogisticRegression()

# Train the model using the principal components of the transformed training sets
regrpca.fit(Xtrain_reduced, y_train_df)
# Make predictions using the principal components of the transformed testing set
y_pred = regrpca.predict(Xtest_reduced)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pred))

Mean squared error: 0.19 R2 score: -1.15
In [80]:
sum(pca.explained_variance_ratio_)
Out[80]:
0.7986403605500407
In [81]:
#plotting a scree plot
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
Notebook Image
In [82]:
print(accuracy_score(y_test, y_pred))
    #0.76

# f1 score
print(f1_score(y_test, y_pred))
    #0.393

print(recall_score(y_test, y_pred))
    #0.814
    
print(precision_score(y_test, y_pred))
   #0.259
0.8148148148148148 0.45646295294753925 0.816247582205029 0.31681681681681684

We see that the accuracy scores, precision,recall scores are pretty low for logistic regression model with PCA components

Random Forest Classifier

Default Hyperparameters

Let's first fit a random forest model with default hyperparameters.

In [83]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default parameters.
rfc = RandomForestClassifier()
In [84]:
# fit
rfc.fit(X_train_df_scaled,y_train_df)
Out[84]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
In [85]:
# Making predictions
predictions = rfc.predict(X_test)
In [86]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
In [87]:
# Let's check the report of our default model
print(classification_report(y_test,predictions))
precision recall f1-score support 0 0.96 0.96 0.96 4910 1 0.60 0.59 0.59 517 accuracy 0.92 5427 macro avg 0.78 0.77 0.78 5427 weighted avg 0.92 0.92 0.92 5427
In [88]:
# Printing confusion matrix
print(confusion_matrix(y_test,predictions))
[[4704 206] [ 211 306]]
In [89]:
print(accuracy_score(y_test,predictions))
0.9231619679380874

Tuning max_depth Let's try to find the optimum values for max_depth and understand how the value of max_depth impacts the overall accuracy of the ensemble.

In [90]:
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'max_depth': range(2, 20, 5)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train_df_scaled, y_train_df)
Out[90]:
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(2, 20, 5)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [91]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
Out[91]:
In [92]:
# plotting accuracies with max_depth
plt.figure()
plt.plot(scores["param_max_depth"], 
         scores["split0_test_score"], 
         label="training accuracy")
plt.plot(scores["param_max_depth"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
Notebook Image

Tuning n_estimators

In [ ]:
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'n_estimators': range(100, 1500, 400)}

# instantiate the model (note we are specifying a max_depth)
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train_df_scaled, y_train_df)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with n_estimators
plt.figure()
plt.plot(scores["param_n_estimators"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_n_estimators"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Tuning max_features

In [ ]:
# GridSearchCV to find optimal max_features
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'max_features': [4, 8, 14, 20, 24]}

# instantiate the model
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train_df_scaled, y_train_df)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with max_features
plt.figure()
plt.plot(scores["param_max_features"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_max_features"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("max_features")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Tuning min_samples_leaf

In [ ]:
# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'min_samples_leaf': range(100, 400, 50)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train_df_scaled, y_train_df)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with min_samples_leaf
plt.figure()
plt.plot(scores["param_min_samples_leaf"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_min_samples_leaf"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("min_samples_leaf")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Tuning min_samples_split

In [ ]:
# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'min_samples_split': range(200, 500, 50)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train_df_scaled, y_train_df)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with min_samples_split
plt.figure()
plt.plot(scores["param_min_samples_split"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_min_samples_split"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("min_samples_split")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Grid Search to Find Optimal Hyperparameters

In [ ]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [4,8,10],
    'min_samples_leaf': range(100, 400, 200),
    'min_samples_split': range(200, 500, 200),
    'n_estimators': [100,200, 300], 
    'max_features': [5, 10]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)
In [ ]:
# Fit the grid search to the data
grid_search.fit(X_train_df_scaled, y_train_df)
In [ ]:
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

Fitting the final model with the best parameters obtained from grid search

In [ ]:
# model with the best hyperparameters
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=10,
                             min_samples_leaf=100, 
                             min_samples_split=200,
                             max_features=10,
                             n_estimators=100)
In [ ]:
# fit
rfc.fit(X_train_df_scaled, y_train_df)
In [ ]:
# predict
predictions = rfc.predict(X_test)
In [ ]:
# evaluation metrics
from sklearn.metrics import classification_report,confusion_matrix
In [ ]:
print(classification_report(y_test,predictions))
In [ ]:
print(confusion_matrix(y_test,predictions))

Model Building

Let's build a linear SVM mode now. The SVC() class does that in sklearn. We highly recommend reading the documentation at least once.

In [ ]:
help(SVC)
In [ ]:
# Model building

# instantiate an object of class SVC()
# note that we are using cost C=1
model = SVC(C = 1)

# fit
model.fit(X_train_df_scaled, y_train_df)

# predict
y_pred = model.predict(smote_pred)
In [ ]:
# Evaluate the model using confusion matrix 
from sklearn import metrics
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
In [ ]:
# print other metrics

# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred))

# precision
print("precision", metrics.precision_score(y_test, y_pred))

# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred))

In [ ]:
# specificity (% of hams correctly classified)
print("specificity", 811/(811+38))

Hyperparameter Tuning

In [ ]:
help(metrics.confusion_matrix)

K-Fold Cross Validation

Let's first run a simple k-fold cross validation to get a sense of the average metrics as computed over multiple folds. the easiest way to do cross-validation is to use the cross_val_score() function.

In [ ]:
# creating a KFold object with 5 splits 
folds = KFold(n_splits = 5, shuffle = True, random_state = 4)

# instantiating a model with cost=1
model = SVC(C = 1)
In [ ]:
# computing the cross-validation scores 
# note that the argument cv takes the 'folds' object, and
# we have specified 'accuracy' as the metric

cv_results = cross_val_score(model, X_train_df_scaled, y_train_df, cv = folds, scoring = 'accuracy') 
In [ ]:
# print 5 accuracies obtained from the 5 folds
print(cv_results)
print("mean accuracy = {}".format(cv_results.mean()))

Grid Search to Find Optimal Hyperparameter C

In [ ]:
# specify range of parameters (C) as a list
params = {"C": [0.1, 1, 10, 100, 1000]}

model = SVC()

# set up grid search scheme
# note that we are still using the 5 fold CV scheme we set up earlier
model_cv = GridSearchCV(estimator = model, param_grid = params, 
                        scoring= 'accuracy', 
                        cv = folds, 
                        verbose = 1,
                       return_train_score=True)      
In [ ]:
# fit the model - it will fit 5 folds across all values of C
model_cv.fit(X_train_df_scaled, y_train_df)  
In [ ]:
# results of grid search CV
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

To get a better sense of how training and test accuracy varies with C, let's plot the tranining and test accuracies against C.

In [ ]:
# plot of C versus train and test scores

plt.figure(figsize=(8, 6))
plt.plot(cv_results['param_C'], cv_results['mean_test_score'])
plt.plot(cv_results['param_C'], cv_results['mean_train_score'])
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.legend(['test accuracy', 'train accuracy'], loc='upper left')
plt.xscale('log')
In [ ]:
best_score = model_cv.best_score_
best_C = model_cv.best_params_['C']

print(" The highest test accuracy is {0} at C = {1}".format(best_score, best_C))
In [ ]:
# model with the best value of C
model = SVC(C=best_C)

# fit
model.fit(X_train_df_scaled, y_train_df)

# predict
y_pred = model.predict(smote_pred)
In [ ]:
# metrics
# print other metrics

# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred))

# precision
print("precision", metrics.precision_score(y_test, y_pred))

# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred))

Optimising for Other Evaluation Metrics

In [ ]:
# specify params
params = {"C": [0.1, 1, 10, 100, 1000]}

# specify scores/metrics in an iterable
scores = ['accuracy', 'precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for {}".format(score))
    
    # set up GridSearch for score metric
    clf = GridSearchCV(SVC(), 
                       params, 
                       cv=folds,
                       scoring=score,
                       return_train_score=True)
    # fit
    clf.fit(X_train, y_train)

    print(" The highest {0} score is {1} at C = {2}".format(score, clf.best_score_, clf.best_params_))
    print("\n")
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: