Jovian
⭐️
Sign In
In [212]:
import os
os.environ["PATH"] += os.pathsep + 'D:/POC/Churn/references/graphviz-2.38/release/bin'
In [316]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from PIL import Image
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

In [777]:
# variable
synthetic_data = False

# loading dataset
df = pd.read_csv("../data/raw/Telco-Customer-Churn.csv")
# df.head()
In [778]:
## drop the duplicate rows

df_c = df.copy()

df_c.drop(["customerID", "Churn"], axis=1, inplace=True)
# df_c[df_c.duplicated()]


# df.drop(df_c[df_c.duplicated()].index, inplace = True) 
df.drop(df_c[df_c.duplicated()].index, inplace=True)
In [779]:
# df[(df['TotalCharges'] > 800) & (df['TotalCharges'] <= 3000)].shape 
# df[df['TotalCharges'] < 1734].shape
# df[df['MonthlyCharges'] < 118].shape
# df['TotalCharges'].min()

# 8684/3
df.shape

Out[779]:
(7003, 21)
In [780]:
#Replacing spaces with null values in total charges column
df['TotalCharges'] = df["TotalCharges"].replace(" ",np.nan)

#Dropping null values from total charges column which contain .15% missing data 
df = df[df["TotalCharges"].notnull()]
df = df.reset_index()[df.columns]

#convert to float type
df["TotalCharges"] = df["TotalCharges"].astype(float)

#replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    df[i]  = df[i].replace({'No internet service' : 'No'})
    
#replace values
df["SeniorCitizen"] = df["SeniorCitizen"].replace({1:"Yes",0:"No"})
In [781]:
# #Tenure to categorical column
# def tenure_lab(df) :
    
#     if df["tenure"] <= 12 :
#         return "Tenure_0-12"
#     elif (df["tenure"] > 12) & (df["tenure"] <= 24 ):
#         return "Tenure_12-24"
#     elif (df["tenure"] > 24) & (df["tenure"] <= 48) :
#         return "Tenure_24-48"
#     elif (df["tenure"] > 48) & (df["tenure"] <= 60) :
#         return "Tenure_48-60"
#     elif df["tenure"] > 60 :
#         return "Tenure_gt_60"
# df["tenure_group"] = df.apply(lambda df:tenure_lab(df),
#                                       axis = 1)
# # 1734, 3466, 5199, 6932, 8665 groups for total charges
# #total charges to categorical column
# def TotalCharges_lab(df) :
    
#     if df["TotalCharges"] <= 1734 :
#         return "TotalCharges_0-1734"
#     elif (df["TotalCharges"] > 1734) & (df["TotalCharges"] <= 3465 ):
#         return "TotalCharges_1734-3465"
#     elif (df["TotalCharges"] > 3465) & (df["TotalCharges"] <= 5200) :
#         return "TotalCharges_3465-5200"
#     elif (df["TotalCharges"] > 5200) & (df["TotalCharges"] <= 6933) :
#         return "TotalCharges_5200-6933"
#     elif df["TotalCharges"] > 6933 :
#         return "TotalCharges_gt_6933"
# df["TotalCharges_group"] = df.apply(lambda df:TotalCharges_lab(df),
#                                       axis = 1)

# # 38, 58, 78, 98, 118 groups for total charges
# #monthly charges to categorical column
# def MonthlyCharges_lab(df) :
    
#     if df["MonthlyCharges"] <= 38 :
#         return "MonthlyCharges_0-38"
#     elif (df["MonthlyCharges"] > 38) & (df["MonthlyCharges"] <= 58 ):
#         return "MonthlyCharges_38-58"
#     elif (df["MonthlyCharges"] > 58) & (df["MonthlyCharges"] <= 78) :
#         return "MonthlyCharges_58-78"
#     elif (df["MonthlyCharges"] > 78) & (df["MonthlyCharges"] <= 98) :
#         return "MonthlyCharges_78-98"
#     elif df["MonthlyCharges"] > 98 :
#         return "MonthlyCharges_gt_98"
# df["MonthlyCharges_group"] = df.apply(lambda df:MonthlyCharges_lab(df),
#                                       axis = 1)
72/3
Out[781]:
24.0
In [782]:
#Tenure to categorical column
def tenure_lab(df) :
    
    if df["tenure"] <= 24 :
        return "Tenure_0-24"
    elif (df["tenure"] > 24) & (df["tenure"] <= 48 ):
        return "Tenure_24-48"
    elif df["tenure"] > 48:
        return "Tenure_gt_48"
df["tenure_group"] = df.apply(lambda df:tenure_lab(df),
                                      axis = 1)
# 1734, 3466, 5199, 6932, 8665 groups for total charges
#total charges to categorical column
def TotalCharges_lab(df) :
    
    if df["TotalCharges"] <= 800 :
        return "TotalCharges_0-800"
    elif (df["TotalCharges"] > 800) & (df["TotalCharges"] <= 3000 ):
        return "TotalCharges_800-3000"
    elif df["TotalCharges"] > 3000 :
        return "TotalCharges_gt_3000"
df["TotalCharges_group"] = df.apply(lambda df:TotalCharges_lab(df),
                                      axis = 1)

# 38, 58, 78, 98, 118 groups for total charges
#monthly charges to categorical column
def MonthlyCharges_lab(df) :
    
    if df["MonthlyCharges"] <= 52 :
        return "MonthlyCharges_0-52"
    elif (df["MonthlyCharges"] > 52) & (df["MonthlyCharges"] <= 85 ):
        return "MonthlyCharges_52-85"
    elif df["MonthlyCharges"] > 85 :
        return "MonthlyCharges_gt_85"
df["MonthlyCharges_group"] = df.apply(lambda df:MonthlyCharges_lab(df),
                                      axis = 1)
In [ ]:
 
In [810]:

df.columns
Out[810]:
(9919,)

data preprocessing

In [784]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]

# drop tenure, TotalCharges
## changed by lokesh
# df = df.drop(columns = ["tenure", "TotalCharges", "MonthlyCharges"], axis = 1)
# df = df.drop(columns = ["tenure_group", "TotalCharges_group", "MonthlyCharges_group"], axis = 1)
## end


#categorical columns
cat_cols   = df.nunique()[df.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in df.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols   = df.nunique()[df.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    df[i] = le.fit_transform(df[i])
    
#Duplicating columns for multi value columns
df = pd.get_dummies(data = df, columns = multi_cols )


#Scaling Numerical columns
if len(num_cols) > 0 :
    std = StandardScaler()
    scaled = std.fit_transform(df[num_cols])
    scaled = pd.DataFrame(scaled, columns=num_cols)
    #dropping original values merging scaled values for numerical columns
    df = df.drop(columns = num_cols,axis = 1)
    df = df.merge(scaled,left_index=True,right_index=True,how = "left")
    
# copy the data
df_telcom_og = df.copy()
In [807]:
cls = df.columns
colmns = []
for coln in list(cls.values):
    if coln not in ['customerID', 'Churn']:
        colmns.append(coln)

# colmns
# array(df.columns.values)
In [808]:
## apply polynomials for tenure, monthly charges, total charges
from sklearn.preprocessing import PolynomialFeatures
polyFeatures = PolynomialFeatures(3)


X = df[colmns]
val = polyFeatures.fit_transform(X)

df = df.merge(pd.DataFrame(val),left_index=True,right_index=True,how = "left")

Variable summary

In [756]:
summary = (df[[i for i in df.columns if i not in Id_col]].
           describe().transpose().reset_index())

summary = summary.rename(columns = {"index" : "feature"})
summary = np.around(summary,3)

val_lst = [summary['feature'], summary['count'],
           summary['mean'],summary['std'],
           summary['min'], summary['25%'],
           summary['50%'], summary['75%'], summary['max']]

trace  = go.Table(header = dict(values = summary.columns.tolist(),
                                line = dict(color = ['#506784']),
                                fill = dict(color = ['#119DFF']),
                               ),
                  cells  = dict(values = val_lst,
                                line = dict(color = ['#506784']),
                                fill = dict(color = ["lightgrey",'#F5F8FF'])
                               ),
                  columnwidth = [200,60,100,100,60,60,80,80,80])
layout = go.Layout(dict(title = "Variable Summary"))
figure = go.Figure(data=[trace],layout=layout)
py.iplot(figure)
In [757]:
df.columns
Out[757]:
Index([                             'customerID',
                                        'gender',
                                 'SeniorCitizen',
                                       'Partner',
                                    'Dependents',
                                  'PhoneService',
                                'OnlineSecurity',
                                  'OnlineBackup',
                              'DeviceProtection',
                                   'TechSupport',
                                   'StreamingTV',
                               'StreamingMovies',
                              'PaperlessBilling',
                                         'Churn',
                              'MultipleLines_No',
                'MultipleLines_No phone service',
                             'MultipleLines_Yes',
                           'InternetService_DSL',
                   'InternetService_Fiber optic',
                            'InternetService_No',
                       'Contract_Month-to-month',
                             'Contract_One year',
                             'Contract_Two year',
       'PaymentMethod_Bank transfer (automatic)',
         'PaymentMethod_Credit card (automatic)',
                'PaymentMethod_Electronic check',
                    'PaymentMethod_Mailed check',
                                         '0_1.0',
                                        'tenure',
                                'MonthlyCharges',
                                  'TotalCharges',
                                               1,
                                               2,
                                               3,
                                               4,
                                               5,
                                               6,
                                               7,
                                               8,
                                               9],
      dtype='object')

Identify Highly correlated features

In [811]:
df_copy = df.copy()
df_copy.drop(["customerID"], axis=1, inplace=True)
# Create correlation matrix
corr_matrix = df_copy.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

## removing highly correlated data
df_copy.drop(to_drop, axis=1, inplace=True)
df_copy['customerID'] = df['customerID']
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-811-bdebb74ce152> in <module> 2 df_copy.drop(["customerID"], axis=1, inplace=True) 3 # Create correlation matrix ----> 4 corr_matrix = df_copy.corr().abs() 5 6 # Select upper triangle of correlation matrix c:\users\lokeswara.reddy\appdata\local\continuum\miniconda3\lib\site-packages\pandas\core\frame.py in corr(self, method, min_periods) 7003 7004 if method == 'pearson': -> 7005 correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) 7006 elif method == 'spearman': 7007 correl = libalgos.nancorr_spearman(ensure_float64(mat), KeyboardInterrupt:
In [759]:
#correlation
correlation = df_copy.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"
                                    ) ,
                  )

layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
In [760]:
## applying PCA
df_copy.columns
Out[760]:
Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn',
       'MultipleLines_No', 'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', '0_1.0',
       'tenure', 'MonthlyCharges', 'TotalCharges', 'customerID'],
      dtype='object')
In [761]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components = 2)

# X = df_copy[[i for i in df_copy.columns if i not in Id_col + target_col]]
# Y = df_copy[target_col + Id_col]

# principal_components = pca.fit_transform(X)
# pca_data = pd.DataFrame(principal_components,columns = ["PC1","PC2"])
# pca_data = pca_data.merge(Y,left_index=True,right_index=True,how="left")
# pca_data["Churn"] = pca_data["Churn"].replace({1:"Churn",0:"Not Churn"})

# def pca_scatter(target,color) :
#     tracer = go.Scatter(x = pca_data[pca_data["Churn"] == target]["PC1"] ,
#                         y = pca_data[pca_data["Churn"] == target]["PC2"],
#                         name = target,mode = "markers",
#                         marker = dict(color = color,
#                                       line = dict(width = .5),
#                                       symbol =  "diamond-open"),
#                         text = ("Customer Id : " + 
#                                 pca_data[pca_data["Churn"] == target]['customerID'])
#                        )
#     return tracer

# layout = go.Layout(dict(title = "Visualising data with principal components",
#                         plot_bgcolor  = "rgb(243,243,243)",
#                         paper_bgcolor = "rgb(243,243,243)",
#                         xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
#                                      title = "principal component 1",
#                                      zerolinewidth=1,ticklen=5,gridwidth=2),
#                         yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
#                                      title = "principal component 2",
#                                      zerolinewidth=1,ticklen=5,gridwidth=2),
#                         height = 600
#                        )
#                   )
# trace1 = pca_scatter("Churn",'red')
# trace2 = pca_scatter("Not Churn",'royalblue')
# data = [trace2,trace1]
# fig = go.Figure(data=data,layout=layout)
# py.iplot(fig)
In [762]:
# df_copy.info()

synthotic minority over sampling technique SMOTE

In [763]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

#splitting train and test data 
train,test = train_test_split(df_copy,test_size = .25 ,random_state = 111)
    
##seperating dependent and independent variables
cols    = [i for i in df_copy.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X  = test[cols]
test_Y  = test[target_col]


if synthetic_data == True:
    cols    = [i for i in df_copy.columns if i not in Id_col+target_col]

    smote_X = df_copy[cols]
    smote_Y = df_copy[target_col]

    
    #Split train and test data
    smote_train_X,smote_test_X,smote_train_Y,smote_test_Y = train_test_split(smote_X,smote_Y,
                                                                             test_size = .25 ,
                                                                             random_state = 111)
    
    #oversampling minority class using smote
    os = SMOTE(random_state = 0)
    os_smote_X,os_smote_Y = os.fit_sample(smote_train_X,smote_train_Y.iloc[:, 0])
    os_smote_X = pd.DataFrame(data = os_smote_X,columns=cols)
    os_smote_Y = pd.DataFrame(data = os_smote_Y,columns=target_col)
    train_X = os_smote_X
    train_Y = os_smote_Y
    


Logistic regresson

In [764]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
from yellowbrick.classifier import DiscriminationThreshold


#Function attributes
#dataframe     - processed dataframe
#Algorithm     - Algorithm used 
#training_x    - predictor variables dataframe(training)
#testing_x     - predictor variables dataframe(testing)
#training_y    - target variable(training)
#training_y    - target variable(testing)
#cf - ["coefficients","features"](cooefficients for logistic 
                                 #regression,features for tree based models)

#threshold_plot - if True returns threshold plot for model
    
def df_churn_prediction(algorithm,training_x,testing_x,
                             training_y,testing_y,cols,cf,threshold_plot) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    if   cf == "coefficients" :
        coefficients  = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features" :
        coefficients  = pd.DataFrame(algorithm.feature_importances_)
        
    column_df     = pd.DataFrame(cols)
    coef_sumry    = (pd.merge(coefficients,column_df,left_index= True,
                              right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy   Score : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc,"\n")
    
    print("Coefficients: ", coef_sumry)

In [765]:
logit  = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# logit = LogisticRegression()

df_churn_prediction(logit,train_X,test_X,train_Y,test_Y,
                         cols,"coefficients",threshold_plot = False)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False) Classification report : precision recall f1-score support 0 0.85 0.90 0.87 1266 1 0.69 0.58 0.63 482 accuracy 0.81 1748 macro avg 0.77 0.74 0.75 1748 weighted avg 0.80 0.81 0.81 1748 Accuracy Score : 0.8117848970251716 Area under curve : 0.7402984536521734 Coefficients: coefficients features 27 0.655958 TotalCharges 17 0.472292 Contract_Month-to-month 15 0.401283 InternetService_Fiber optic 11 0.317263 PaperlessBilling 10 0.226103 StreamingMovies 1 0.219837 SeniorCitizen 26 0.180621 MonthlyCharges 22 0.148658 PaymentMethod_Electronic check 9 0.101121 StreamingTV 0 -0.013036 gender 2 -0.026520 Partner 13 -0.079705 MultipleLines_Yes 7 -0.101681 DeviceProtection 18 -0.112550 Contract_One year 3 -0.125572 Dependents 14 -0.138158 InternetService_DSL 23 -0.167136 PaymentMethod_Mailed check 20 -0.234530 PaymentMethod_Bank transfer (automatic) 6 -0.245756 OnlineBackup 21 -0.257916 PaymentMethod_Credit card (automatic) 12 -0.317528 MultipleLines_No 5 -0.379847 OnlineSecurity 4 -0.397233 PhoneService 8 -0.437841 TechSupport 24 -0.510924 0_1.0 16 -0.774049 InternetService_No 19 -0.870666 Contract_Two year 25 -1.386995 tenure
In [766]:
test_X.shape
df.head()
Out[766]:

Nearest neighbor algorithm

In [767]:
def df_churn_prediction_alg(algorithm,training_x,testing_x,
                                 training_y,testing_y,threshold_plot = True) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score   : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc)
In [768]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
telecom_churn_prediction_alg(knn,train_X,test_X,train_Y,test_Y,threshold_plot = True)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform') Classification report : precision recall f1-score support 0 0.83 0.85 0.84 1266 1 0.58 0.54 0.56 482 accuracy 0.77 1748 macro avg 0.71 0.70 0.70 1748 weighted avg 0.76 0.77 0.76 1748 Accuracy Score : 0.7665903890160183 Area under curve : 0.6956074282380549

Random Forest

In [769]:
from sklearn.ensemble import RandomForestClassifier

def plot_tree_randomforest(columns,nf_estimators,
                           estimated_tree,maximum_depth,
                           criterion_type,model_performance = None) :
    
    dataframe = df[columns + target_col].copy()
    
    #train and test datasets
    rf_x     = dataframe[[i for i in columns if i not in target_col]]
    rf_y     = dataframe[target_col]
    
    #random forest classifier
    rfc   = RandomForestClassifier(n_estimators = nf_estimators,
                                   max_depth = maximum_depth,
                                   criterion = criterion_type,
                                  )
    rfc.fit(rf_x,rf_y)
    
    estimated_tree = rfc.estimators_[estimated_tree]
    
    
    #model performance
    if model_performance == True :
        df_churn_prediction(rfc,
                                 rf_x,test_X[columns],
                                 rf_y,test_Y,
                                 columns,"features",threshold_plot = True)
        

cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
plot_tree_randomforest(cols1,100,99,3,"entropy",True)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='entropy', max_depth=3, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False) Classification report : precision recall f1-score support 0 0.79 0.96 0.87 1266 1 0.76 0.35 0.48 482 accuracy 0.79 1748 macro avg 0.78 0.65 0.67 1748 weighted avg 0.79 0.79 0.76 1748 Accuracy Score : 0.7906178489702517 Area under curve : 0.6543791338092335 Coefficients: coefficients features 17 0.270784 Contract_Month-to-month 19 0.141493 Contract_Two year 25 0.135982 tenure 15 0.110598 InternetService_Fiber optic 22 0.063074 PaymentMethod_Electronic check 27 0.058901 TotalCharges 26 0.057956 MonthlyCharges 16 0.048834 InternetService_No 18 0.025593 Contract_One year 11 0.016113 PaperlessBilling 8 0.013049 TechSupport 5 0.009675 OnlineSecurity 3 0.009312 Dependents 14 0.009068 InternetService_DSL 9 0.006434 StreamingTV 10 0.004564 StreamingMovies 23 0.003788 PaymentMethod_Mailed check 2 0.002906 Partner 20 0.002364 PaymentMethod_Bank transfer (automatic) 4 0.002341 PhoneService 21 0.001758 PaymentMethod_Credit card (automatic) 7 0.001649 DeviceProtection 13 0.001087 MultipleLines_Yes 6 0.001077 OnlineBackup 1 0.000879 SeniorCitizen 12 0.000524 MultipleLines_No 0 0.000196 gender 24 0.000000 0_1.0

Gaussian Naive Bayes

In [770]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(priors=None)

telecom_churn_prediction_alg(gnb,train_X,test_X,train_Y,test_Y)
GaussianNB(priors=None, var_smoothing=1e-09) Classification report : precision recall f1-score support 0 0.91 0.74 0.82 1266 1 0.54 0.82 0.65 482 accuracy 0.76 1748 macro avg 0.73 0.78 0.74 1748 weighted avg 0.81 0.76 0.77 1748 Accuracy Score : 0.7608695652173914 Area under curve : 0.7790243390821552

support vector machines

In [771]:
from sklearn.svm import SVC

#Support vector classifier
#using linear hyper plane
svc_lin  = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
               decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
               max_iter=-1, probability=True, random_state=None, shrinking=True,
               tol=0.001, verbose=False)

cols = [i for i in telcom.columns if i not in Id_col + target_col]
df_churn_prediction(svc_lin,train_X,test_X,train_Y,test_Y,
                         cols,"coefficients",threshold_plot = False)


## tunning parameters
svc_rbf  = SVC(C=1.0, kernel='rbf', 
               degree= 3, gamma=1.0, 
               coef0=0.0, shrinking=True,
               probability=True,tol=0.001,
               cache_size=200, class_weight=None,
               verbose=False,max_iter= -1,
               random_state=None)

df_churn_prediction_alg(svc_rbf,train_X,test_X,train_Y,test_Y,threshold_plot = False)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False) Classification report : precision recall f1-score support 0 0.84 0.90 0.87 1266 1 0.68 0.56 0.62 482 accuracy 0.81 1748 macro avg 0.76 0.73 0.74 1748 weighted avg 0.80 0.81 0.80 1748 Accuracy Score : 0.8060640732265446 Area under curve : 0.7305674093593703 Coefficients: coefficients features 15 9.128103e-01 OnlineSecurity_Yes 10 2.854952e-01 InternetService_DSL 22 2.193375e-01 TechSupport_No 9 1.919003e-01 MultipleLines_Yes 11 1.869203e-01 InternetService_Fiber optic 1 1.693601e-01 SeniorCitizen 17 1.016844e-01 OnlineBackup_No internet service 13 6.722165e-02 OnlineSecurity_No 19 2.088902e-02 DeviceProtection_No 24 2.273737e-13 TechSupport_Yes 23 -1.413284e-02 TechSupport_No internet service 0 -2.735616e-02 gender 7 -3.302908e-02 MultipleLines_No 2 -3.519743e-02 Partner 26 -6.489857e-02 StreamingTV_No internet service 3 -6.561999e-02 Dependents 20 -9.151070e-02 DeviceProtection_No internet service 6 -1.001725e-01 MonthlyCharges 4 -1.097853e-01 PhoneService 21 -1.136940e-01 DeviceProtection_Yes 18 -1.225734e-01 OnlineBackup_Yes 12 -1.770069e-01 InternetService_No 5 -1.861484e-01 PaperlessBilling 14 -2.998883e-01 OnlineSecurity_No internet service 8 -3.010144e-01 MultipleLines_No phone service 25 -3.987188e-01 StreamingTV_No 27 -4.038551e-01 StreamingTV_Yes 16 -6.129220e-01 OnlineBackup_No SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False) Classification report : precision recall f1-score support 0 0.81 0.93 0.86 1266 1 0.68 0.41 0.51 482 accuracy 0.78 1748 macro avg 0.74 0.67 0.69 1748 weighted avg 0.77 0.78 0.77 1748 Accuracy Score : 0.7848970251716247 Area under curve : 0.6690592777592049

Light GBM

In [772]:
from lightgbm import LGBMClassifier

lgbm_c = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                        learning_rate=0.5, max_depth=7, min_child_samples=20,
                        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                        n_jobs=-1, num_leaves=500, objective='binary', random_state=None,
                        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                        subsample_for_bin=200000, subsample_freq=0)

cols = [i for i in telcom.columns if i not in Id_col + target_col]
df_churn_prediction(lgbm_c,train_X,test_X,train_Y,test_Y,
                         cols,"features",threshold_plot = True)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.5, max_depth=7, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=500, objective='binary', random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0) Classification report : precision recall f1-score support 0 0.82 0.86 0.84 1266 1 0.58 0.52 0.55 482 accuracy 0.76 1748 macro avg 0.70 0.69 0.69 1748 weighted avg 0.76 0.76 0.76 1748 Accuracy Score : 0.7637299771167048 Area under curve : 0.689135906865155 Coefficients: coefficients features 26 1115 StreamingTV_No internet service 27 919 StreamingTV_Yes 25 568 StreamingTV_No 0 163 gender 2 120 Partner 11 96 InternetService_Fiber optic 3 93 Dependents 6 73 MonthlyCharges 7 68 MultipleLines_No 20 67 DeviceProtection_No internet service 22 66 TechSupport_No 5 63 PaperlessBilling 1 61 SeniorCitizen 8 53 MultipleLines_No phone service 21 51 DeviceProtection_Yes 9 49 MultipleLines_Yes 10 49 InternetService_DSL 23 48 TechSupport_No internet service 18 45 OnlineBackup_Yes 12 45 InternetService_No 13 33 OnlineSecurity_No 19 31 DeviceProtection_No 17 30 OnlineBackup_No internet service 4 14 PhoneService 14 14 OnlineSecurity_No internet service 15 8 OnlineSecurity_Yes 16 8 OnlineBackup_No 24 0 TechSupport_Yes

XGBoost classifier

In [773]:
from xgboost import XGBClassifier

xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
                    max_depth = 7, min_child_weight=1, missing=None, n_estimators=100,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)


df_churn_prediction(xgc,train_X,test_X,train_Y,test_Y,
                         cols,"features",threshold_plot = True)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0, max_depth=7, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1, verbosity=1) Classification report : precision recall f1-score support 0 0.83 0.85 0.84 1266 1 0.58 0.53 0.55 482 accuracy 0.76 1748 macro avg 0.70 0.69 0.69 1748 weighted avg 0.76 0.76 0.76 1748 Accuracy Score : 0.7625858123569794 Area under curve : 0.6902732165214711 Coefficients: coefficients features 17 0.370301 OnlineBackup_No internet service 15 0.245801 OnlineSecurity_Yes 19 0.053852 DeviceProtection_No 4 0.029450 PhoneService 14 0.022781 OnlineSecurity_No internet service 25 0.020593 StreamingTV_No 13 0.018760 OnlineSecurity_No 18 0.018739 OnlineBackup_Yes 10 0.016788 InternetService_DSL 22 0.016625 TechSupport_No 8 0.015951 MultipleLines_No phone service 5 0.014648 PaperlessBilling 11 0.012840 InternetService_Fiber optic 27 0.012445 StreamingTV_Yes 26 0.011778 StreamingTV_No internet service 7 0.011411 MultipleLines_No 21 0.011255 DeviceProtection_Yes 3 0.011037 Dependents 1 0.010974 SeniorCitizen 6 0.010869 MonthlyCharges 9 0.010686 MultipleLines_Yes 12 0.010544 InternetService_No 2 0.009832 Partner 0 0.009778 gender 20 0.008037 DeviceProtection_No internet service 23 0.007887 TechSupport_No internet service 16 0.006339 OnlineBackup_No 24 0.000000 TechSupport_Yes

comaprision of model accuracy

In [813]:
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

#gives model report in dataframe
def model_report(model,training_x,testing_x,training_y,testing_y,name) :
    model.fit(training_x,training_y)
    predictions  = model.predict(testing_x)
    accuracy     = accuracy_score(testing_y,predictions)
    recallscore  = recall_score(testing_y,predictions)
    precision    = precision_score(testing_y,predictions)
    roc_auc      = roc_auc_score(testing_y,predictions)
    f1score      = f1_score(testing_y,predictions) 
    
    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recallscore],
                       "Precision"       : [precision],
                       "f1_score"        : [f1score],
                       "Area_under_curve": [roc_auc],
                      })
    return df

#outputs for every model
model1 = model_report(logit,train_X,test_X,train_Y,test_Y,
                      "Logistic Regression(Baseline_model)")
decision_tree = DecisionTreeClassifier(max_depth = 9,
                                       random_state = 123,
                                       splitter  = "best",
                                       criterion = "gini",
                                      )
model4 = model_report(decision_tree,train_X,test_X,train_Y,test_Y,
                      "Decision Tree")
model5 = model_report(knn,train_X,test_X,train_Y,test_Y,
                      "KNN Classifier")
rfc = RandomForestClassifier(n_estimators = 1000,
                             random_state = 123,
                             max_depth = 9,
                             criterion = "gini")
model6 = model_report(rfc,train_X,test_X,train_Y,test_Y,
                      "Random Forest Classifier")
model7 = model_report(gnb,train_X,test_X,train_Y,test_Y,
                      "Naive Bayes")
model8 = model_report(svc_lin,train_X,test_X,train_Y,test_Y,
                      "SVM Classifier Linear")
model9 = model_report(svc_rbf,train_X,test_X,train_Y,test_Y,
                      "SVM Classifier RBF")
model10 = model_report(lgbm_c,train_X,test_X,train_Y,test_Y,
                      "LGBM Classifier")
model11 = model_report(xgc,train_X,test_X,train_Y,test_Y,
                      "XGBoost Classifier")

# #concat all models
# model_performances = pd.concat([model1,
#                                 model4,model5,model6,
#                                 model7,model8,model9,
#                                 model10,model11],axis = 0).reset_index()

#concat all models
model_performances = pd.concat([model11,
                                model10,model9,model8,
                                model7,model6,model5,
                                model4,model1],axis = 0).reset_index()

model_performances = model_performances.drop(columns = "index",axis =1)

table  = ff.create_table(np.round(model_performances,4))

py.iplot(table)
In [816]:
model_performances
def output_tracer(metric,color) :
    tracer = go.Bar(y = model_performances["Model"] ,
                    x = model_performances[metric],
                    orientation = "h",name = metric ,
                    marker = dict(line = dict(width =.7),
                                  color = color)
                   )
    return tracer

layout = go.Layout(dict(title = "Model performances",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "metric",
                                     zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(l = 250),
                        height = 780
                       )
                  )


trace1  = output_tracer("Accuracy_score","#6699FF")
trace2  = output_tracer('Recall_score',"red")
trace3  = output_tracer('Precision',"#33CC99")
trace4  = output_tracer('f1_score',"lightgrey")

# data = [trace1,trace2,trace3,trace4]
data = [trace1,trace2]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)


In [812]:
model_performances
Out[812]:
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [309]:
train,test = train_test_split(df,test_size = .25 ,random_state = 111)
In [311]:
train.shape
Out[311]:
(5282, 21)
In [312]:
test.shape
Out[312]:
(1761, 21)
In [313]:
train.columns
Out[313]:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
In [314]:
train.to_csv("../data/raw/train_data.csv")
In [315]:
test.to_csv("../data/raw/test_data.csv")
In [ ]: