In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
try:
    import pandas_profiling as pp  # optional: only needed for the commented-out profiling reports below
except ImportError:
    pp = None
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

df = pd.read_csv('telecom_churn_data.csv')
In [ ]:
import warnings
warnings.filterwarnings('ignore')
In [ ]:
df.shape

The original dataset has 99,999 rows and 226 columns.

In [ ]:
df.head()
In [ ]:
df.describe()

Since the analysis focuses on high-value customers, we keep only those whose average recharge amount over the first two months (June and July, months 6 and 7) is at or above the 70th percentile.

In [ ]:
# To do:
# - derive a column flagging high-value customers based on the condition above
# - derive data-usage columns, separately for 2G and 3G
# - determine whether a customer uses mostly 2G or mostly 3G data
# - determine whether a customer spends mostly on call services or data services (done)

In [ ]:
df['avg_rech_good_month']= (df['total_rech_amt_6']+df['total_rech_amt_7'])/2
In [ ]:
df['avg_rech_good_month'].quantile(0.7)
In [ ]:
rech_cutoff_70 = df['avg_rech_good_month'].quantile(0.7)  # roughly 368.5 for this dataset
df_high_end_cus = df[df['avg_rech_good_month'] >= rech_cutoff_70]
In [ ]:
df_high_end_cus.shape
In [ ]:
df_high_end_cus.isna().sum()/len(df_high_end_cus) *100  # % of missing data /NA Values in the dataframe

As we can see, a number of columns have more than 50% missing values: date_of_last_rech_data_6/7/8/9, total_rech_data_6/7/8/9, max_rech_data_6/7/8/9, count_rech_2g_6/7/8/9, count_rech_3g_6/7/8/9, av_rech_amt_data_6/7/8/9, arpu_3g_6/7/8/9 and fb_user_6/7/8/9. We handle each group based on its significance: some are dropped outright, others are imputed.
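
A quick way to cross-check this, rather than reading the percentages off the printout above, is to list the offending columns programmatically; a minimal sketch, assuming the same 50% cutoff used in this analysis:

In [ ]:
# percentage of missing values per column
null_pct = df_high_end_cus.isna().sum() / len(df_high_end_cus) * 100

# columns above the 50% threshold discussed above, worst first
high_null_cols = null_pct[null_pct > 50].sort_values(ascending=False)
print(high_null_cols)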

We drop fb_user_6/7/8/9, since Facebook usage is unlikely to be a deciding factor in whether a customer churns.

In [ ]:
df_high_end_cus = df_high_end_cus.drop(columns=['fb_user_6','fb_user_7','fb_user_8','fb_user_9'],axis=1)

We retain count_rech_2g_6/7/8/9 and count_rech_3g_6/7/8/9, as recharge counts may help in predicting churn.

We also retain max_rech_data_6/7/8/9 and total_rech_data_6/7/8/9, since they capture data-recharge behaviour.

Similarly, we retain arpu_2g_6/7/8/9 and arpu_3g_6/7/8/9, as the average revenue from internet usage can also help in predicting churn.

The remaining columns with high null percentages (such as the night-pack user flags treated below) are dropped.

In [ ]:
cols_to_retain = ['count_rech_2g_6','count_rech_2g_7','count_rech_2g_8','count_rech_2g_9','count_rech_3g_6',
                 'count_rech_3g_7','count_rech_3g_8','count_rech_3g_9','max_rech_data_9','max_rech_data_8',
                 'max_rech_data_7','max_rech_data_6','total_rech_data_9','total_rech_data_8','total_rech_data_7',
                 'total_rech_data_6','arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','arpu_3g_9','arpu_3g_8','arpu_3g_7','arpu_3g_6']

df_high_end_cus[cols_to_retain] = df_high_end_cus[cols_to_retain].fillna(df_high_end_cus[cols_to_retain].median())
In [ ]:
# columns such as the night-pack user flags can be dropped, since recharge behaviour is already captured by other columns

df_high_end_cus = df_high_end_cus.drop(columns=['night_pck_user_6','night_pck_user_7','night_pck_user_8',
                                                'night_pck_user_9'],axis=1)

Since the remaining columns are numeric, we impute their missing values with the respective column medians.

In [ ]:
df_high_end_cus = df_high_end_cus.fillna(df_high_end_cus.median())
In [ ]:
df_high_end_cus.isna().sum() /len(df_high_end_cus) *100

Date columns need slightly different handling: date_of_last_rech_data_6/7/8/9, date_of_last_rech_6/7/8/9 and last_date_of_month_6/7/8/9. For each of these we extract the day of the month into a separate numeric column.

In [ ]:
date_cols = ['date_of_last_rech_data_9','date_of_last_rech_data_8','date_of_last_rech_data_7','date_of_last_rech_data_6',
               'last_date_of_month_6','last_date_of_month_7','last_date_of_month_8','last_date_of_month_9',
            'date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']
In [ ]:
#df_high_end_cus[date_cols] = df_high_end_cus[date_cols].astype('datetime64[ns]')
In [ ]:
df_high_end_cus[date_cols].isna().sum()
In [ ]:
#extracting days from the datetime format columns
df_high_end_cus['date_last_rech_data_9'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_9']).dt.day
df_high_end_cus['date_last_rech_data_8'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_8']).dt.day
df_high_end_cus['date_last_rech_data_7'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_7']).dt.day
df_high_end_cus['date_last_rech_data_6'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_data_6']).dt.day
df_high_end_cus['date_last_rech_6'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_6']).dt.day
df_high_end_cus['date_last_rech_7'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_7']).dt.day
df_high_end_cus['date_last_rech_8'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_8']).dt.day
df_high_end_cus['date_last_rech_9'] = pd.to_datetime(df_high_end_cus['date_of_last_rech_9']).dt.day
df_high_end_cus['last_date_@_month_6'] = pd.to_datetime(df_high_end_cus['last_date_of_month_6']).dt.day
df_high_end_cus['last_date_@_month_7'] = pd.to_datetime(df_high_end_cus['last_date_of_month_7']).dt.day
df_high_end_cus['last_date_@_month_8'] = pd.to_datetime(df_high_end_cus['last_date_of_month_8']).dt.day
df_high_end_cus['last_date_@_month_9'] = pd.to_datetime(df_high_end_cus['last_date_of_month_9']).dt.day

In [ ]:
df_high_end_cus = df_high_end_cus.drop(columns=date_cols) # dropping all the original date-time columns
In [ ]:
df_high_end_cus = df_high_end_cus.fillna(df_high_end_cus.median()) # impute the extracted day-of-month columns with their medians
In [ ]:
df_high_end_cus.isna().sum()
In [ ]:
df_high_end_cus.shape # final shape of dataset after null value treatment
In [ ]:
churn_attrs = ['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']


In [ ]:
df_high_end_cus['churn'] = df_high_end_cus.apply(lambda row:1 if ((row.total_ic_mou_9 == 0 or row.total_og_mou_9==0)
                                                                  and (row.vol_2g_mb_9 == 0 or row.vol_3g_mb_9 ==0))
                                                                else 0 , axis = 1)
    
In [ ]:
df_high_end_cus['churn'].value_counts()

So a customer is marked as churned if they have used neither call services (incoming/outgoing) nor data services (2G/3G) in the churn month. Based on this definition:

  1. 3047 churned customers
  2. 26964 non-churned customers
In [ ]:
# Now we drop all columns that carry data from the churn month, i.e. September (month 9)
In [ ]:
# collect every column whose name ends with '9' (churn-month attributes)
drop_churn_cols = [col for col in df_high_end_cus.columns if col.endswith('9')]
In [ ]:
drop_churn_cols 
In [ ]:
df_high_end_cus=df_high_end_cus.drop(drop_churn_cols,axis=1)
In [ ]:
df_high_end_cus.shape #final dataframe after removing all churn month attributes and null value treated
In [ ]:
#Get Correlation of "Churn" with other variables:
plt.figure(figsize=(30,10))
df_high_end_cus.corr()['churn'].sort_values(ascending = False).plot(kind='bar')
plt.show()
In [ ]:
ax = sns.distplot(df_high_end_cus['arpu_6'], hist=True, kde=False, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('Number of customers')
ax.set_xlabel('Average revenue per user in June (arpu_6)')
ax.set_title('Distribution of ARPU in June')
In [ ]:
ax = sns.distplot(df_high_end_cus['arpu_7'], hist=True, kde=False, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('Number of customers')
ax.set_xlabel('Average revenue per user in July (arpu_7)')
ax.set_title('Distribution of ARPU in July')
In [ ]:
df_high_end_cus=df_high_end_cus.drop(columns=['mobile_number','circle_id'])

We drop mobile_number and circle_id as they do not help in predicting churn.

In [ ]:
df_high_end_cus.corr()

In the correlation matrix above, some entries are blank (NaN) because the corresponding columns are constant: their standard deviation is 0, so the correlation is undefined.
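
To see exactly which columns are responsible, one can list the zero-variance columns directly; a minimal sketch (the resulting set depends on the data):

In [ ]:
# columns with zero standard deviation produce NaN entries in the correlation matrix
col_std = df_high_end_cus.std(numeric_only=True)
print(col_std[col_std == 0].index.tolist())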

Factor Analysis to check for multicollinearity
In [ ]:
from sklearn.decomposition import FactorAnalysis
FA = FactorAnalysis(n_components = 3).fit_transform(df_high_end_cus.values)
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA[:,0], FA[:,1])
plt.scatter(FA[:,1], FA[:,2])
plt.scatter(FA[:,2], FA[:,0])


We can see that the data is highly correlated, so we need to reduce multicollinearity. Each factor groups together a set of highly correlated variables/columns.

In [ ]:
#report = pp.ProfileReport(df_high_end_cus)
#report.to_file('output_report.html')

Dropping variables with high multicollinearity, as flagged by the pandas-profiling report (generated in the commented-out cell above).
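
Since the profiling report itself is not reproduced here, the sketch below shows one way such pairs could be surfaced directly from the correlation matrix; the 0.9 threshold is an assumption for illustration, not a value taken from the report:

In [ ]:
# rough stand-in for the profiling report: column pairs with |correlation| > 0.9 (assumed threshold)
corr = df_high_end_cus.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair only once
high_corr_pairs = upper.stack().sort_values(ascending=False)
print(high_corr_pairs[high_corr_pairs > 0.9].head(20))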

In [ ]:
df_high_end_cus = df_high_end_cus.drop(columns=['arpu_3g_6','arpu_3g_7','arpu_3g_8','isd_og_mou_7','isd_og_mou_8',
                                               'sachet_2g_6','sachet_2g_7','sachet_2g_8','total_rech_amt_6',
                                               'total_rech_amt_7','total_rech_amt_8'])

Dropping variables with constant values, as flagged by the pandas-profiling report.

In [ ]:
df_high_end_cus = df_high_end_cus.drop(columns=['last_date_@_month_6','last_date_@_month_7','last_date_@_month_8',
                                               'loc_ic_t2o_mou','loc_og_t2o_mou','std_ic_t2o_mou_6','std_ic_t2o_mou_7',
                                               'std_ic_t2o_mou_8','std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8',
                                               'std_og_t2o_mou'])
In [ ]:
df_high_end_cus.shape

Now we identify skewed columns and apply the necessary transformations.
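
Instead of inspecting every distribution plot by eye, columns can also be ranked by skewness; a minimal sketch (the cutoff of 3 is an arbitrary assumption for illustration):

In [ ]:
# skewness of every numeric column; large absolute values indicate heavy skew
skewness = df_high_end_cus.skew(numeric_only=True)
print(skewness[skewness.abs() > 3].sort_values(ascending=False))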

In [ ]:
sns.distplot(df_high_end_cus['avg_rech_good_month'])
plt.show()
In [ ]:
# since avg_rech_good_month is highly right-skewed, we apply a log transformation
In [ ]:
df_high_end_cus['avg_rech_good_month_trnsfrm'] = np.log(df_high_end_cus['avg_rech_good_month'])
df_high_end_cus = df_high_end_cus.drop(columns=['avg_rech_good_month'])
In [ ]:
sns.distplot(df_high_end_cus['avg_rech_good_month_trnsfrm'])
plt.show()
In [ ]:
sns.distplot(df_high_end_cus['ic_others_6'])
plt.show()
In [ ]:
df_high_end_cus['ic_others_6_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_6'])
df_high_end_cus['ic_others_8_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_8'])
df_high_end_cus['isd_ic_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['isd_ic_mou_7'])
df_high_end_cus['isd_ic_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['isd_ic_mou_6'])
df_high_end_cus['loc_og_t2c_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['loc_og_t2c_mou_7'])
df_high_end_cus['og_others_7_trnsfrm']=np.sqrt(df_high_end_cus['og_others_7'])
df_high_end_cus['og_others_8_trnsfrm']=np.sqrt(df_high_end_cus['og_others_8'])
df_high_end_cus['spl_ic_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['spl_ic_mou_6'])
df_high_end_cus['spl_ic_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['spl_ic_mou_7'])
df_high_end_cus['std_ic_t2f_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2f_mou_6'])
df_high_end_cus['std_ic_t2f_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2f_mou_7'])
df_high_end_cus['std_ic_t2f_mou_8_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2f_mou_8'])
df_high_end_cus['std_ic_t2t_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2t_mou_6'])
df_high_end_cus['std_ic_t2t_mou_7_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2t_mou_7'])
df_high_end_cus['std_ic_t2t_mou_8_trnsfrm']=np.sqrt(df_high_end_cus['std_ic_t2t_mou_8'])

cols_skewed_sqrt = ['ic_others_6','ic_others_8','isd_ic_mou_7','isd_ic_mou_6','loc_og_t2c_mou_7','og_others_7',
                   'og_others_8','spl_ic_mou_6','spl_ic_mou_7','std_ic_t2f_mou_6','std_ic_t2f_mou_7','std_ic_t2f_mou_8',
                   'std_ic_t2t_mou_6','std_ic_t2t_mou_7','std_ic_t2t_mou_8']


df_high_end_cus = df_high_end_cus.drop(columns=cols_skewed_sqrt,axis=1)
In [ ]:
#report_1 = pp.ProfileReport(df_high_end_cus)
#report_1.to_file('output_report_2.html')
In [ ]:
df_high_end_cus['ic_others_7_trnsfrm']=np.sqrt(df_high_end_cus['ic_others_7'])
df_high_end_cus['isd_og_mou_6_trnsfrm']=np.sqrt(df_high_end_cus['isd_og_mou_6'])

df_high_end_cus = df_high_end_cus.drop(columns=['ic_others_7','isd_og_mou_6'])

In [ ]:
FA_1 = FactorAnalysis(n_components = 3).fit_transform(df_high_end_cus.values)
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA_1[:,0],FA_1[:,1])
plt.scatter(FA_1[:,1], FA_1[:,2])
plt.scatter(FA_1[:,2],FA_1[:,0])
plt.show()

In [ ]:
#report_2 = pp.ProfileReport(df_high_end_cus)
#report_2.to_file('output_report_3.html')

As the factor plot above shows, the multicollinearity in the dataset has been reduced by removing the columns flagged by pandas profiling.

In [ ]:
df_high_end_cus.corr()['churn'].sort_values(ascending = False)
In [ ]:
sns.boxplot(df_high_end_cus['std_og_t2t_mou_6'])
In [ ]:
df_high_end_cus.std_og_t2t_mou_6.quantile(0.99)
In [ ]:
max(df_high_end_cus['std_og_t2t_mou_6'])

For the column std_og_t2t_mou_6 there is a large gap between the maximum value and the 99th percentile, which clearly indicates outliers.

As an outlier-treatment strategy, we keep only values up to the 99th percentile for the ten columns most strongly correlated with churn.

In [ ]:
churn_corr_cols = ['std_og_mou_6','std_og_t2m_mou_6','std_og_t2t_mou_6','roam_og_mou_7','roam_og_mou_8','total_og_mou_6',
                  'onnet_mou_6','roam_ic_mou_7','total_rech_num_6','roam_ic_mou_8']
In [ ]:
# keep only rows below the 99th percentile for each of the top correlated columns
# (the quantile is recomputed after each filter, matching the original sequential filtering)
for col in churn_corr_cols:
    df_high_end_cus = df_high_end_cus[df_high_end_cus[col] < df_high_end_cus[col].quantile(0.99)]



In [ ]:
df_high_end_cus.shape #after maximum possible outlier treatment

Splitting into train and test data

In [ ]:
X = df_high_end_cus.drop(columns=['churn'],axis=1)
y = df_high_end_cus['churn']
In [ ]:
# split into train and test with ratio of 80% and 20%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8,
                                                    test_size = 0.2, random_state=100)

Handling Class Imbalance

In [ ]:
y_train.value_counts()
In [ ]:
y_test.value_counts()

As we can see, there is a clear class imbalance between churned and non-churned customers. So we apply SMOTE (Synthetic Minority Oversampling Technique) to upsample the minority (churn) class in the training data.

In [ ]:
from imblearn.over_sampling import SMOTE
In [ ]:
sm = SMOTE(random_state=27, sampling_strategy=1.0)  # ratio= was renamed to sampling_strategy in newer imblearn
X_train, y_train = sm.fit_resample(X_train, y_train)  # fit_sample was renamed to fit_resample
In [ ]:
np.bincount(y_train) #19739 rows of each class for the column churn
In [ ]:
# Converting n-arrays to dataframe
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.DataFrame(y_train)
In [ ]:
X_train_df.columns = X.columns

Feature Scaling

In [ ]:
from sklearn.preprocessing import StandardScaler
In [ ]:
scaler = StandardScaler()

X_train_df_scaled = scaler.fit_transform(X_train_df)

Logistic Regression

We first fit a plain logistic regression using the SAGA solver (a variant of stochastic average gradient descent), which supports L1, L2 and elastic-net regularisation.
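
The cell that follows uses sklearn's default L2 penalty; if one wanted L1 and L2 combined, saga also accepts penalty='elasticnet'. A minimal sketch, assuming an arbitrary (untuned) l1_ratio of 0.5:

In [ ]:
# elastic-net logistic regression; saga is the only solver that supports penalty='elasticnet'
log_reg_enet = LogisticRegression(solver='saga', penalty='elasticnet',
                                  l1_ratio=0.5, max_iter=1000)  # l1_ratio=0.5 is an assumed value, not tuned
log_reg_enet.fit(X_train_df_scaled, y_train_df.values.ravel())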

In [ ]:
smote = LogisticRegression(solver='saga').fit(X_train_df_scaled, y_train_df.values.ravel())
In [ ]:
X_test_scaled = scaler.transform(X_test)  # keep the original (unscaled) X_test for the tree-based models further below
In [ ]:
smote_pred = smote.predict(X_test_scaled)

In [ ]:


print(accuracy_score(y_test, smote_pred))   # accuracy: 0.84
print(f1_score(y_test, smote_pred))         # F1 score: 0.51
print(recall_score(y_test, smote_pred))     # recall: 0.83
print(precision_score(y_test, smote_pred))  # precision: 0.37

Logistic Regression with PCA

In [ ]:
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import explained_variance_score
In [ ]:
pca = PCA(n_components=40,random_state=100,svd_solver='randomized')


In [ ]:
Xtrain_reduced = pca.fit_transform(X_train_df_scaled)
Xtest_reduced = pca.transform(X_test_scaled)

regrpca = LogisticRegression()

# Train the model using the principal components of the transformed training set
regrpca.fit(Xtrain_reduced, y_train_df.values.ravel())
# Make predictions using the principal components of the transformed test set
y_pred = regrpca.predict(Xtest_reduced)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# R2 score: 1 is perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pred))

In [ ]:
sum(pca.explained_variance_ratio_)
In [ ]:
#plotting a scree plot
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
In [ ]:
print(accuracy_score(y_test, y_pred))   # accuracy: 0.76
print(f1_score(y_test, y_pred))         # F1 score: 0.393
print(recall_score(y_test, y_pred))     # recall: 0.814
print(precision_score(y_test, y_pred))  # precision: 0.259

We can see that the accuracy, precision and recall scores are noticeably lower for the logistic regression model built on PCA components.

Random Forest Classifier

Default Hyperparameters

Let's first fit a random forest model with default hyperparameters.

In [ ]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default parameters.
rfc = RandomForestClassifier()
In [ ]:
# fit
rfc.fit(X_train,y_train)
In [ ]:
# Making predictions
predictions = rfc.predict(X_test)
In [ ]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
In [ ]:
# Let's check the report of our default model
print(classification_report(y_test,predictions))
In [ ]:
# Printing confusion matrix
print(confusion_matrix(y_test,predictions))
In [ ]:
print(accuracy_score(y_test,predictions))

Tuning max_depth

Let's try to find the optimum value for max_depth and understand how it impacts the overall accuracy of the ensemble.

In [ ]:
# GridSearchCV to find optimal max_depth
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'max_depth': range(2, 20, 5)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy",
                   return_train_score=True)  # needed for mean_train_score in the plot below
rf.fit(X_train, y_train)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with max_depth
plt.figure()
plt.plot(scores["param_max_depth"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_max_depth"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Tuning n_estimators

In [ ]:
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'n_estimators': range(100, 1500, 400)}

# instantiate the model (note we are specifying a max_depth)
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy",
                   return_train_score=True)  # needed for mean_train_score in the plot below
rf.fit(X_train, y_train)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with n_estimators
plt.figure()
plt.plot(scores["param_n_estimators"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_n_estimators"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Tuning max_features

In [ ]:
# GridSearchCV to find optimal max_features
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'max_features': [4, 8, 14, 20, 24]}

# instantiate the model
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy",
                   return_train_score=True)  # needed for mean_train_score in the plot below
rf.fit(X_train, y_train)
In [ ]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
In [ ]:
# plotting accuracies with max_features
plt.figure()
plt.plot(scores["param_max_features"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_max_features"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("max_features")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Tuning min_samples_leaf

In [ ]:
# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'min_samples_leaf': range(100, 400, 50)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)