
Titanic Disaster Analysis

In [108]:
import pandas as pd
import numpy as np
import seaborn as sn
import logging
import sys
import warnings

Reading the Given Dataset

In [2]:
titanic_train_df = pd.read_csv('data/train.csv')
titanic_test_df = pd.read_csv('data/test.csv')
titanic_train_df.shape, titanic_test_df.shape
Out[2]:
((891, 12), (418, 11))
In [3]:
titanic_train_df.columns
Out[3]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
In [4]:
titanic_test_df.columns
Out[4]:
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Consolidating Dataset

In [5]:
titanic_train_df1 = titanic_train_df.drop(columns=['Survived'])
titanic_train_df1.shape
Out[5]:
(891, 11)
In [6]:
titanic_df = pd.concat([titanic_train_df1, titanic_test_df])
titanic_df.shape
Out[6]:
(1309, 11)
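Note that pd.concat keeps each frame's original row labels, so the combined frame carries duplicate index values (0 to 890 from the train set and 0 to 417 from the test set). That does not affect the steps below, but here is a small optional sketch of how a unique index could be rebuilt if it were needed later:

# Optional: give the combined frame a fresh 0..1308 index.
titanic_df = pd.concat([titanic_train_df1, titanic_test_df], ignore_index=True)
titanic_df.index.is_unique  # -> True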
In [7]:
titanic_df.head(5)
Out[7]:

Converting the Sex feature to an integer, i.e. male to 1 and female to 0

In [8]:
sex_df = pd.crosstab(titanic_train_df.Survived, titanic_train_df.Sex,margins=True)
sex_df
Out[8]:
In [9]:
sex_df.drop(['All'],  axis=1).drop(['All'], axis=0).plot(kind='bar', stacked=True, color=['red','blue'], grid=False)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2000d0f0>
Notebook Image
In [10]:
titanic_df['Sex'] = titanic_df['Sex'].replace('male', 1)
titanic_df['Sex'] = titanic_df['Sex'].replace('female', 0)
titanic_df.head(5)
Out[10]:
In [11]:
# Train Dataset
titanic_train_df['Sex'] = titanic_train_df['Sex'].replace('male', 1)
titanic_train_df['Sex'] = titanic_train_df['Sex'].replace('female', 0)
In [12]:
# Test Dataset
titanic_test_df['Sex'] = titanic_test_df['Sex'].replace('male', 1)
titanic_test_df['Sex'] = titanic_test_df['Sex'].replace('female', 0)

Converting the Embarked feature to an integer, i.e. S to 0, C to 1, Q to 2

In [13]:
embarked_df = pd.crosstab(titanic_train_df.Survived, titanic_train_df.Embarked,margins=True)
embarked_df
Out[13]:
In [14]:
embarked_df.drop(['All'],  axis=1).drop(['All'], axis=0).plot(kind='bar', stacked=True, color=['green','blue', 'red'], grid=False)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a202c6ef0>
Notebook Image
In [15]:
titanic_df['Embarked'] = titanic_df['Embarked'].replace('S', 0)
titanic_df['Embarked'] = titanic_df['Embarked'].replace('C', 1)
titanic_df['Embarked'] = titanic_df['Embarked'].replace('Q', 2)
titanic_df.head(5)
Out[15]:
In [16]:
titanic_train_df['Embarked'] = titanic_train_df['Embarked'].replace('S', 0)
titanic_train_df['Embarked'] = titanic_train_df['Embarked'].replace('C', 1)
titanic_train_df['Embarked'] = titanic_train_df['Embarked'].replace('Q', 2)
In [17]:
titanic_test_df['Embarked'] = titanic_test_df['Embarked'].replace('S', 0)
titanic_test_df['Embarked'] = titanic_test_df['Embarked'].replace('C', 1)
titanic_test_df['Embarked'] = titanic_test_df['Embarked'].replace('Q', 2)
In [18]:
titanic_df.describe()
Out[18]:

Ignoring Warnings

In [109]:
warnings.filterwarnings("ignore")

Complete Titanic Data - NA values

In [110]:
titanic_df.isna().sum()
Out[110]:
PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
dtype: int64

Titanic Train Data - NA values

In [111]:
titanic_train_df.isna().sum()
Out[111]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

Titanic Test Data - NA values

In [112]:
titanic_test_df.isna().sum()
Out[112]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64
In [113]:
sn.pairplot(titanic_train_df)
Out[113]:
<seaborn.axisgrid.PairGrid at 0x1a250497b8>
Notebook Image

Crosstab for multiple variables

In [114]:
df1 = pd.crosstab([titanic_train_df.Pclass, titanic_train_df.Sex],titanic_train_df.Survived, margins=True)
df1
Out[114]:
In [115]:
df1.drop(['All'],  axis=1).drop(['All'], axis=0).plot(kind='bar', stacked=True, color=['green','blue'], grid=False)
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a26cd6518>
Notebook Image
In [116]:
df1 = pd.crosstab([titanic_train_df.SibSp, titanic_train_df.Sex],titanic_train_df.Survived, margins=True)
df1
Out[116]:
In [117]:
df1.drop(['All'],  axis=1).drop(['All'], axis=0).plot(kind='bar', stacked=True, color=['green','blue'], grid=False)
Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2838c1d0>
Notebook Image
In [118]:
df1 = pd.crosstab([titanic_train_df.Parch, titanic_train_df.Sex],titanic_train_df.Survived, margins=True)
df1
Out[118]:
In [119]:
df1.drop(['All'],  axis=1).drop(['All'], axis=0).plot(kind='bar', stacked=True, color=['green','blue'], grid=False)
Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c07bc50>
Notebook Image
In [120]:
df1 = pd.crosstab([titanic_train_df.Embarked, titanic_train_df.Sex],titanic_train_df.Survived, margins=True)
df1
Out[120]:
In [121]:
df1.drop(['All'],  axis=1).drop(['All'], axis=0).plot(kind='bar', stacked=True, color=['green','blue'], grid=False)
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a283aa6a0>
Notebook Image

Outliers

In [122]:
titanic_train_df.boxplot(by='Survived', column=['Age'], grid=False)
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a284296d8>
Notebook Image

Outliers = mean +/- 3 * standard deviation

In [123]:
outliers = titanic_train_df[titanic_train_df['Age'] > titanic_train_df['Age'].mean() + 3 * titanic_train_df['Age'].std()]
outliers
Out[123]:
In [124]:
outliers = titanic_train_df[titanic_train_df['Age'] < titanic_train_df['Age'].mean() - 3 * titanic_train_df['Age'].std()]
outliers
Out[124]:

From the business use-case perspective, people aged 70 to 80 can also survive, so these values are kept rather than treated as outliers.

Data Integration Document 1

| Column Name | Describe | Accuracy (Mean) | Accuracy (SD) | Accuracy (Missing Values) | Accuracy (Outliers) | Reliability | Domain Info (DI) | Cross Tabs (CT) | Latency (Monthly/Weekly) | Availability | Remarks |
|---|---|---|---|---|---|---|---|---|---|---|---|
| PassengerId |  | 655 | 378.02 | 0 | 0 | NA | NA | NA |  | Yes | NA |
| Pclass |  | 2.2948 | 0.8378 | 0 | 0 | NA | NA | NA |  | Yes | NA |
| Name | Describe | NA | NA | 0 | 0 | NA | NA | NA | NA | Yes | NA |
| Sex |  | 0.644 | 0.4789 | 0 | 0 | NA | Male - 1, Female - 0 | NA |  | Yes | NA |
| Age |  | 29.88 | 14.413 | 263 | 0 | NA | NA | NA |  | Yes | NA |
| SibSp |  | 0.4988 | 1.041 | 0 | 0 | NA | NA | NA |  | Yes | NA |
| Parch |  | 0.385 | 0.8655 | 0 | 0 | NA | NA | NA |  | Yes | NA |
| Ticket | Describe | NA | NA | 0 | 0 | NA | NA | NA | NA | Yes | NA |
| Fare |  | 33.2954 | 51.7586 | 1 | 0 | NA | NA | NA |  | Yes | NA |
| Cabin | Describe | NA | NA | 1014 | 0 | NA | NA | NA | NA | Yes | NA |
| Embarked |  | 0.3947 | 0.6538 | 2 | 0 | NA | S - 0, C - 1, Q - 2 | NA |  | Yes | NA |
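For reference, the mean, standard deviation and missing-value columns above can be recomputed directly from the consolidated frame; a minimal sketch (the quality and numeric_cols names are just illustrative):

# Recompute the per-column mean, standard deviation and missing-value counts.
numeric_cols = titanic_df.select_dtypes(include=[np.number]).columns

quality = pd.DataFrame(index=titanic_df.columns)
quality['mean'] = titanic_df[numeric_cols].mean().round(4)
quality['std'] = titanic_df[numeric_cols].std().round(4)
quality['missing'] = titanic_df.isna().sum()
quality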

Cleaning, Creating and Conversion

Converting Outliers, Completing Missing Values

There are no outliers in the dataset that need converting.
Completing missing Age values with the median
In [125]:
print('The median "Age" of the passengers is {0:.2f}'.format(titanic_df["Age"].median(skipna=True)))
titanic_age_median = titanic_df["Age"].median(skipna=True)
The median "Age" of the passengers is 28.00
In [126]:
titanic_df["Age"].fillna(titanic_age_median, inplace=True)
titanic_train_df['Age'].fillna(titanic_age_median, inplace=True)
titanic_test_df['Age'].fillna(titanic_age_median, inplace=True)
Completing missing Fare values with the median
In [127]:
print('The median "Fare" of the passengers is {0:.2f}'.format(titanic_df["Fare"].median(skipna=True)))
titanic_fare_median = titanic_df["Fare"].median(skipna=True)
The median "Fare" of the passengers is 14.45
In [128]:
titanic_df["Fare"].fillna(titanic_fare_median, inplace=True)
titanic_train_df['Fare'].fillna(titanic_fare_median, inplace=True)
titanic_test_df['Fare'].fillna(titanic_fare_median, inplace=True)
Completing missing Embarked values with the most frequent value
In [129]:
titanic_df['Embarked'].value_counts()
Out[129]:
0    916
1    270
2    123
Name: Embarked, dtype: int64
In [130]:
titanic_embarked_most_used = 0
titanic_df["Embarked"].fillna(titanic_embarked_most_used, inplace=True)
titanic_train_df['Embarked'].fillna(titanic_embarked_most_used, inplace=True)
titanic_test_df['Embarked'].fillna(titanic_embarked_most_used, inplace=True)
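Rather than hard-coding 0, the most frequent value can also be taken from the data itself; a small equivalent sketch (S, encoded as 0, is the most common port, as the value_counts() output above shows):

# Derive the most frequent Embarked value instead of hard-coding it.
titanic_embarked_most_used = titanic_df['Embarked'].value_counts().idxmax()  # -> 0, i.e. 'S'
titanic_df['Embarked'].fillna(titanic_embarked_most_used, inplace=True)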
In [131]:
titanic_df.isna().sum()
Out[131]:
PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
dtype: int64
Converting the Age and Embarked to Integers
In [132]:
titanic_df['Age'] = titanic_df['Age'].astype(int)
titanic_train_df['Age'] = titanic_train_df['Age'].astype(int)
titanic_test_df['Age'] = titanic_test_df['Age'].astype(int)
In [133]:
titanic_df['Embarked'] = titanic_df['Embarked'].astype(int)
titanic_train_df['Embarked'] = titanic_train_df['Embarked'].astype(int)
titanic_test_df['Embarked'] = titanic_test_df['Embarked'].astype(int)

Conversion

  • Removing the columns that do not play an important role in the prediction
In [134]:
# Removing PassengerId, Name, Ticket, Cabin
df = titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
train_df = titanic_train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_df = titanic_test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
In [135]:
train_df.head(5)
Out[135]:

Predicting survival without adding new features.

In [136]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
In [137]:
xv_df = train_df.drop(['Survived'], axis=1)
yv_df = train_df['Survived']
In [138]:
xv_df.head(5)
Out[138]:
In [139]:
yv_df.head(5)
Out[139]:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

Test Size: 25%

Random State: 5
In [140]:
xv_train, xv_test, yv_train, yv_test = train_test_split(xv_df,yv_df,test_size=0.25,random_state=5)

Common model code

In [141]:
class Model:
    """Holds one train/test split and fits/scores scikit-learn estimators on it."""

    def __init__(self):
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

    def set_data(self, x_train, x_test, y_train, y_test):
        # Store the split once so every model is evaluated on the same data.
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

    def set_logger_level(self, log_level):
        # Route log output to stdout; DEBUG prints the model, predictions and score.
        logging.basicConfig(stream=sys.stdout, level=log_level, format='', datefmt=None)
        logger = logging.getLogger()
        logger.setLevel(log_level)
        return logger

    def apply_model(self, model, log_level=logging.INFO):
        # Fit the given estimator on the training split and return its test accuracy.
        logger = self.set_logger_level(log_level)
        logger.debug('Applying: {}'.format(model))

        model.fit(self.x_train, self.y_train)
        y_score = model.predict(self.x_test)
        score = model.score(self.x_test, self.y_test)

        logger.debug('Predict: {}'.format(y_score))
        logger.debug('Score: {}'.format(score))
        return score

m = Model()
m.set_data(xv_train, xv_test, yv_train, yv_test)

Logistic Regression

In [142]:
logit = LogisticRegression()
logit_score = m.apply_model(logit, log_level=logging.DEBUG)
Applying: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False) Predict: [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1] Score: 0.8385650224215246

Decision Tree

In [143]:
decision_tree = DecisionTreeClassifier()
decision_tree_score = m.apply_model(decision_tree, log_level=logging.DEBUG)
Applying: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best') Predict: [0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1] Score: 0.7937219730941704

Random Forest

In [144]:
random_forest = RandomForestClassifier()
random_forest_score = m.apply_model(random_forest, log_level=logging.DEBUG)
Applying: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False) Predict: [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1] Score: 0.8385650224215246

SVM + Kernels

In [145]:
svc = SVC()
svc_score = m.apply_model(svc, log_level=logging.DEBUG)
Applying: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) Predict: [0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0] Score: 0.7309417040358744
  • Types of kernels - ‘linear’, ‘poly’, ‘rbf’ (default), ‘sigmoid’, ‘precomputed’; a quick comparison of the usable ones follows the linear-kernel run below.
In [146]:
# kernel = SVC(kernel='poly', degree=3)  # runs endlessly on this data
# kernel_score = m.apply_model(kernel)
In [147]:
kernel = SVC(kernel='linear')
kernel_score = m.apply_model(kernel, log_level=logging.DEBUG)
Applying: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) Predict: [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1] Score: 0.8116591928251121
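As mentioned in the bullet above, the other kernels can be compared with the same helper; a quick sketch (the 'poly' kernel is skipped for the runtime reason noted earlier, and 'precomputed' needs an explicit kernel matrix):

# Compare the remaining SVC kernels on the same train/test split.
for kernel_name in ['linear', 'rbf', 'sigmoid']:
    score = m.apply_model(SVC(kernel=kernel_name))
    print('SVC ({:>7}): {}'.format(kernel_name, score))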

Gradient Boosting + XGBoost

In [148]:
gb = GradientBoostingClassifier()
gb_score = m.apply_model(gb, log_level=logging.DEBUG)
Applying: GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='auto', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False) Predict: [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1] Score: 0.8295964125560538
In [149]:
!pip install xgboost
Requirement already satisfied: xgboost in /Users/kkotari/.pyenv/versions/anaconda3-2019.03/lib/python3.7/site-packages (0.90)
Requirement already satisfied: scipy in /Users/kkotari/.pyenv/versions/anaconda3-2019.03/lib/python3.7/site-packages (from xgboost) (1.2.1)
Requirement already satisfied: numpy in /Users/kkotari/.pyenv/versions/anaconda3-2019.03/lib/python3.7/site-packages (from xgboost) (1.16.2)
In [150]:
from xgboost import XGBClassifier
In [151]:
xgboost = XGBClassifier()
xgboost_score = m.apply_model(xgboost, log_level=logging.DEBUG)
Applying: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, subsample=1, verbosity=1) Predict: [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1] Score: 0.8385650224215246

Naive Bayes

In [152]:
gnb = GaussianNB()
gnb_score = m.apply_model(gnb, log_level=logging.DEBUG)
Applying: GaussianNB(priors=None, var_smoothing=1e-09) Predict: [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1] Score: 0.8430493273542601

KNN

In [153]:
knn = KNeighborsClassifier()
knn_score = m.apply_model(knn, log_level=logging.DEBUG)
Applying: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform') Predict: [0 0 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0] Score: 0.7354260089686099
In [154]:
print('Logistic Regression: {}'.format(logit_score))
print('Decision Tree:       {}'.format(decision_tree_score))
print('SVM:                 {}'.format(svc_score))
print('Naive Bayes:         {}'.format(gnb_score))
print('KNN:                 {}'.format(knn_score))
print('Random Forest:       {}'.format(random_forest_score))
print('GBM:                 {}'.format(gb_score))
print('XGBoost:             {}'.format(xgboost_score))
print('Kernel:              {}'.format(kernel_score))
Logistic Regression: 0.8385650224215246
Decision Tree:       0.7937219730941704
SVM:                 0.7309417040358744
Naive Bayes:         0.8430493273542601
KNN:                 0.7354260089686099
Random Forest:       0.8385650224215246
GBM:                 0.8295964125560538
XGBoost:             0.8385650224215246
Kernel:              0.8116591928251121
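The same figures can also be gathered into a pandas Series so the models are easy to rank; a small sketch using the scores computed above:

# Rank the models by held-out accuracy (highest first).
scores = pd.Series({
    'Logistic Regression': logit_score,
    'Decision Tree': decision_tree_score,
    'SVM (rbf)': svc_score,
    'SVM (linear kernel)': kernel_score,
    'Naive Bayes': gnb_score,
    'KNN': knn_score,
    'Random Forest': random_forest_score,
    'GBM': gb_score,
    'XGBoost': xgboost_score,
})
scores.sort_values(ascending=False)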
Random State: 522
In [155]:
def apply_models_with_random_state_test_size(random, test_size=0.25):
    m = Model()
    
    xv_train, xv_test, yv_train, yv_test = train_test_split(xv_df,yv_df,test_size=test_size,random_state=random)
    m.set_data(xv_train, xv_test, yv_train, yv_test)
    
    logit = LogisticRegression()
    decision_tree = DecisionTreeClassifier()
    random_forest = RandomForestClassifier()
    svc = SVC()
    kernel = SVC(kernel='linear')
    gb = GradientBoostingClassifier()
    xgboost = XGBClassifier()
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    
    logit_score = m.apply_model(logit)
    decision_tree_score = m.apply_model(decision_tree)
    random_forest_score = m.apply_model(random_forest)
    svc_score = m.apply_model(svc)
    kernel_score = m.apply_model(kernel)
    gb_score = m.apply_model(gb)
    xgboost_score = m.apply_model(xgboost)
    gnb_score = m.apply_model(gnb)
    knn_score = m.apply_model(knn)
    
    print('Logistic Regression: {}'.format(logit_score))
    print('Decision Tree:       {}'.format(decision_tree_score))
    print('SVM:                 {}'.format(svc_score))
    print('Naive Bayes:         {}'.format(gnb_score))
    print('KNN:                 {}'.format(knn_score))
    print('Random Forest:       {}'.format(random_forest_score))
    print('GBM:                 {}'.format(gb_score))
    print('XGBoost:             {}'.format(xgboost_score))
    print('Kernel:              {}'.format(kernel_score))
In [156]:
apply_models_with_random_state_test_size(522)
Logistic Regression: 0.7802690582959642
Decision Tree:       0.7757847533632287
SVM:                 0.7309417040358744
Naive Bayes:         0.7668161434977578
KNN:                 0.7040358744394619
Random Forest:       0.8026905829596412
GBM:                 0.8295964125560538
XGBoost:             0.8161434977578476
Kernel:              0.7847533632286996
The random state is affecting the prediction score
In [157]:
# Looping the random state over the range 1 to 10
for each in range(1, 11):
    print("Random Value: {}".format(each))
    apply_models_with_random_state_test_size(each)
Random Value: 1 Logistic Regression: 0.8116591928251121 Decision Tree: 0.7488789237668162 SVM: 0.6816143497757847 Naive Bayes: 0.7668161434977578 KNN: 0.7085201793721974 Random Forest: 0.7623318385650224 GBM: 0.7982062780269058 XGBoost: 0.8026905829596412 Kernel: 0.7847533632286996
Random Value: 2 Logistic Regression: 0.7937219730941704 Decision Tree: 0.7443946188340808 SVM: 0.6816143497757847 Naive Bayes: 0.7892376681614349 KNN: 0.6816143497757847 Random Forest: 0.8026905829596412 GBM: 0.7937219730941704 XGBoost: 0.7982062780269058 Kernel: 0.7713004484304933
Random Value: 3 Logistic Regression: 0.7757847533632287 Decision Tree: 0.7488789237668162 SVM: 0.7040358744394619 Naive Bayes: 0.7488789237668162 KNN: 0.6771300448430493 Random Forest: 0.7802690582959642 GBM: 0.8071748878923767 XGBoost: 0.7892376681614349 Kernel: 0.7847533632286996
Random Value: 4 Logistic Regression: 0.8161434977578476 Decision Tree: 0.820627802690583 SVM: 0.7443946188340808 Naive Bayes: 0.8071748878923767 KNN: 0.7219730941704036 Random Forest: 0.820627802690583 GBM: 0.8340807174887892 XGBoost: 0.8385650224215246 Kernel: 0.8026905829596412
Random Value: 5 Logistic Regression: 0.8385650224215246 Decision Tree: 0.7937219730941704 SVM: 0.7309417040358744 Naive Bayes: 0.8430493273542601 KNN: 0.7354260089686099 Random Forest: 0.820627802690583 GBM: 0.8295964125560538 XGBoost: 0.8385650224215246 Kernel: 0.8116591928251121
Random Value: 6 Logistic Regression: 0.8565022421524664 Decision Tree: 0.7847533632286996 SVM: 0.726457399103139 Naive Bayes: 0.8385650224215246 KNN: 0.726457399103139 Random Forest: 0.8251121076233184 GBM: 0.852017937219731 XGBoost: 0.8475336322869955 Kernel: 0.8251121076233184
Random Value: 7 Logistic Regression: 0.7533632286995515 Decision Tree: 0.7354260089686099 SVM: 0.6636771300448431 Naive Bayes: 0.7623318385650224 KNN: 0.6995515695067265 Random Forest: 0.7713004484304933 GBM: 0.7892376681614349 XGBoost: 0.7982062780269058 Kernel: 0.7309417040358744
Random Value: 8 Logistic Regression: 0.7892376681614349 Decision Tree: 0.7443946188340808 SVM: 0.7219730941704036 Naive Bayes: 0.7802690582959642 KNN: 0.7040358744394619 Random Forest: 0.8161434977578476 GBM: 0.8340807174887892 XGBoost: 0.8385650224215246 Kernel: 0.7668161434977578
Random Value: 9 Logistic Regression: 0.7802690582959642 Decision Tree: 0.7443946188340808 SVM: 0.6502242152466368 Naive Bayes: 0.7623318385650224 KNN: 0.6771300448430493 Random Forest: 0.757847533632287 GBM: 0.7892376681614349 XGBoost: 0.7982062780269058 Kernel: 0.7488789237668162
Random Value: 10 Logistic Regression: 0.8161434977578476 Decision Tree: 0.7982062780269058 SVM: 0.6905829596412556 Naive Bayes: 0.7937219730941704 KNN: 0.7130044843049327 Random Forest: 0.8251121076233184 GBM: 0.8385650224215246 XGBoost: 0.852017937219731 Kernel: 0.8161434977578476
Looking at the loop above, random state 6 gives the highest prediction scores.
In [158]:
random_state = 6
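Since the score clearly depends on which random_state happens to be chosen, a more split-independent check would be to average over several folds; a minimal cross-validation sketch (not part of the original run) using scikit-learn's cross_val_score:

# Hypothetical alternative: 5-fold cross-validation instead of one favourable split.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LogisticRegression(), xv_df, yv_df, cv=5)
print('Logistic Regression CV accuracy: {:.4f} +/- {:.4f}'.format(cv_scores.mean(), cv_scores.std()))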

Test Size: 30%

In [159]:
apply_models_with_random_state_test_size(random_state, test_size=.30)
Logistic Regression: 0.8507462686567164
Decision Tree:       0.7873134328358209
SVM:                 0.7313432835820896
Naive Bayes:         0.832089552238806
KNN:                 0.7201492537313433
Random Forest:       0.7985074626865671
GBM:                 0.8283582089552238
XGBoost:             0.832089552238806
Kernel:              0.8208955223880597

Test Size: 20%

In [160]:
apply_models_with_random_state_test_size(random_state, test_size=.20)
Logistic Regression: 0.8547486033519553
Decision Tree:       0.7988826815642458
SVM:                 0.7430167597765364
Naive Bayes:         0.8324022346368715
KNN:                 0.7430167597765364
Random Forest:       0.7821229050279329
GBM:                 0.8491620111731844
XGBoost:             0.8547486033519553
Kernel:              0.8212290502793296

Test Size: 25%

In [161]:
apply_models_with_random_state_test_size(random_state, test_size=.25)
Logistic Regression: 0.8565022421524664
Decision Tree:       0.7982062780269058
SVM:                 0.726457399103139
Naive Bayes:         0.8385650224215246
KNN:                 0.726457399103139
Random Forest:       0.8251121076233184
GBM:                 0.852017937219731
XGBoost:             0.8475336322869955
Kernel:              0.8251121076233184

Test Size: 35%

In [163]:
apply_models_with_random_state_test_size(random_state, test_size=.35)
Logistic Regression: 0.842948717948718
Decision Tree:       0.7756410256410257
SVM:                 0.7019230769230769
Naive Bayes:         0.8333333333333334
KNN:                 0.7147435897435898
Random Forest:       0.8237179487179487
GBM:                 0.8108974358974359
XGBoost:             0.8173076923076923
Kernel:              0.8108974358974359
In [ ]: