Jovian
⭐️
Sign In
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
In [2]:
df = pd.read_csv("titanic_dataset_modified.csv")
In [3]:
df.head(2)
Out[3]:
In [4]:
print(df.columns)
df.columns.size
Index(['Unnamed: 0', 'pclass', 'survived', 'name', 'gender', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'], dtype='object')
Out[4]:
15
In [5]:
df.describe().head(1)
Out[5]:

Visualize missing values

In [6]:
df.isnull().sum()
Out[6]:
Unnamed: 0       0
pclass           0
survived         0
name             0
gender           0
age            263
sibsp            0
parch            0
ticket           0
fare             1
cabin         1014
embarked         2
boat           823
body          1188
home.dest      564
dtype: int64
In [7]:
plt.figure(figsize=(6,4))
sns.heatmap(df.isnull(),yticklabels=False,cbar=False)
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f410b7c8>
Notebook Image

Data types

In [8]:
df.dtypes
Out[8]:
Unnamed: 0      int64
pclass          int64
survived        int64
name           object
gender         object
age           float64
sibsp           int64
parch           int64
ticket         object
fare           object
cabin          object
embarked       object
boat           object
body          float64
home.dest      object
dtype: object

Drop unwated columns

In [9]:
df.drop(["Unnamed: 0","name","ticket","cabin","boat","body","home.dest"],axis=1,inplace=True)
In [10]:
df.head(2)
Out[10]:

Get info

In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1309 entries, 0 to 1308 Data columns (total 8 columns): pclass 1309 non-null int64 survived 1309 non-null int64 gender 1309 non-null object age 1046 non-null float64 sibsp 1309 non-null int64 parch 1309 non-null int64 fare 1308 non-null object embarked 1307 non-null object dtypes: float64(1), int64(4), object(3) memory usage: 81.9+ KB

Reduce the memory, by typecasting int64 and float64 to int8 and float16

In [12]:
df.pclass = df.pclass.astype("int8")
df.survived = df.survived.astype("int8")
df.age = df.age.astype("float16")
df.sibsp = df.sibsp.astype("float16")
df.parch = df.parch.astype("float16")
In [13]:
df.info() # 50% memory reduction
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1309 entries, 0 to 1308 Data columns (total 8 columns): pclass 1309 non-null int8 survived 1309 non-null int8 gender 1309 non-null object age 1046 non-null float16 sibsp 1309 non-null float16 parch 1309 non-null float16 fare 1308 non-null object embarked 1307 non-null object dtypes: float16(3), int8(2), object(3) memory usage: 41.0+ KB

Fare columns to be fixed by removing "$" and changing its type

In [14]:
# first replace $
# change the type to float
df.fare.replace({"\$":""},regex=True,inplace=True)
print(df.fare[:5])
df.fare = df.fare.astype("float16")
print(df.fare[:5])
0 211.34 1 151.55 2 151.55 3 151.55 4 151.55 Name: fare, dtype: object 0 211.375 1 151.500 2 151.500 3 151.500 4 151.500 Name: fare, dtype: float16

Fill missing values with median (if type is non object), and fill with mode of type is object for Categorical value

In [15]:
for k in df.columns:
    if "int" in str(df[k].dtype) or "float" in str(df[k].dtype):
        df[k].fillna(df[k].median(),inplace=True)
    else:
        df[k].fillna(df[k].mode()[0],inplace=True)
In [16]:
df.dtypes
Out[16]:
pclass         int8
survived       int8
gender       object
age         float16
sibsp       float16
parch       float16
fare        float16
embarked     object
dtype: object
In [17]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1309 entries, 0 to 1308 Data columns (total 8 columns): pclass 1309 non-null int8 survived 1309 non-null int8 gender 1309 non-null object age 1309 non-null float16 sibsp 1309 non-null float16 parch 1309 non-null float16 fare 1309 non-null float16 embarked 1309 non-null object dtypes: float16(4), int8(2), object(2) memory usage: 33.4+ KB
In [18]:
df.isnull().sum() # no more missing values, see the heatmap also to visualize it
Out[18]:
pclass      0
survived    0
gender      0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64
In [19]:
sns.heatmap(df.isnull(),yticklabels=False,cbar=False) # clearly shows, no more missing values
plt.show()
Notebook Image

Lets Start visualization

In [20]:
df.head(2)
Out[20]:
In [21]:
plt.figure(figsize=(16,4))
sns.countplot(df.pclass)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5bf4588>
Notebook Image
In [22]:
plt.figure(figsize=(16,4))
sns.countplot(df.pclass,hue=df.survived)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5bb6a08>
Notebook Image
In [23]:
plt.figure(figsize=(16,4))
sns.countplot(df.gender,hue=df.survived)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5ce9a88>
Notebook Image
In [24]:
plt.figure(figsize=(16,4))
sns.set_style("whitegrid")
sns.countplot(df.gender)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5da00c8>
Notebook Image
In [25]:
df.head(2)
Out[25]:
In [26]:
plt.figure(figsize=(16,4))
sns.countplot(df.sibsp)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5df2dc8>
Notebook Image
In [27]:
plt.figure(figsize=(16,4))
sns.countplot(df.sibsp,hue=df.survived)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5ef9648>
Notebook Image
In [28]:
plt.figure(figsize=(16,4))
sns.countplot(df.embarked,hue=df.survived)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5f8eb08>
Notebook Image
In [29]:
df.head(2)
Out[29]:
In [30]:
plt.figure(figsize=(16,4))
sns.scatterplot(df.embarked,df.age,hue=df.survived)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f5fcc948>
Notebook Image
In [31]:
plt.figure(figsize=(16,4))
sns.scatterplot(df.pclass,df.age,hue=df.survived)
plt.xticks([1,2,3])
plt.show()
Notebook Image
In [32]:
plt.figure(figsize=(16,4))
sns.distplot(df.age,bins=20,color="r")
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f603e908>
Notebook Image
In [33]:
df.head(2)
Out[33]:
In [34]:
plt.figure(figsize=(16,4))
sns.scatterplot(df.age,df.sibsp,hue=df.survived,size=df.fare)
plt.show()
Notebook Image
In [35]:
plt.figure(figsize=(16,4))
sns.countplot(df.gender,hue=df.embarked)
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f61d1f88>
Notebook Image
In [36]:
plt.figure(figsize=(16,4))
sns.scatterplot(df.age,df.fare,hue=df.survived)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f7392e88>
Notebook Image

Lets Apply Machine learning and do some prediction

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import scikitplot as skp

Lets encode the categorical columns to numerical , i.e gender and embarked columns

In [38]:
df.head(2)
Out[38]:
In [39]:
le = LabelEncoder()
In [40]:
le.fit(df.gender)
df.gender = le.transform(df.gender)

le.fit(df.embarked)
df.embarked = le.transform(df.embarked)
In [41]:
df.head(2)
Out[41]:

Create Train and Test data using split method

In [42]:
X = df[["pclass","gender","age","sibsp","parch","fare","embarked"]]
y = df["survived"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=20)
In [43]:
# y_train = y_train.values.reshape(-1,1)
# y_test = y_test.values.reshape(-1,1)
In [44]:
logr = LogisticRegression()
In [45]:
logr.fit(X_train,y_train)
Out[45]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [46]:
y_pred = logr.predict(X_test)
y_pred
Out[46]:
array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0],
      dtype=int8)
In [47]:
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat
Out[47]:
array([[131,  23],
       [ 37,  71]], dtype=int64)
In [48]:
skp.metrics.plot_confusion_matrix(y_test,y_pred)
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f7a860c8>
Notebook Image

Confusion matrix plot

In [49]:
sns.heatmap(conf_mat,annot=True,fmt="d")
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f53b6cc8>
Notebook Image

ROC curve of Logistic regression

In [73]:
skp.classifiers.plot_roc_curve_with_cv(logr,X_train.values,y_train.values)
C:\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:85: DeprecationWarning: Function plot_roc_curve is deprecated; This will be removed in v0.4.0. Please use scikitplot.metrics.plot_roc_curve instead. warnings.warn(msg, category=DeprecationWarning)
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0x282f95dff48>
Notebook Image

Using Random Forest

In [50]:
rfc = RandomForestClassifier()
In [51]:
rfc.fit(X_train,y_train)
Out[51]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
In [52]:
y_pred = rfc.predict(X_test)
y_pred
Out[52]:
array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0],
      dtype=int8)
In [55]:
confusion_matrix(y_test,y_pred)
Out[55]:
array([[139,  15],
       [ 33,  75]], dtype=int64)

accurcay_score

In [57]:
accuracy_score(y_test,y_pred)*100
Out[57]:
81.67938931297711
In [64]:
y_prob = rfc.predict_proba(X_test)
roc_auc_score(y_test,y_prob[:,1])
Out[64]:
0.8407287157287155

ROC curve of Random Forest

In [75]:
skp.classifiers.plot_roc_curve_with_cv(rfc,X_train.values,y_train.values)
C:\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:85: DeprecationWarning: Function plot_roc_curve is deprecated; This will be removed in v0.4.0. Please use scikitplot.metrics.plot_roc_curve instead. warnings.warn(msg, category=DeprecationWarning)
Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x282fa4abec8>
Notebook Image

Grouping

In [76]:
df.head(2)
Out[76]:
In [77]:
grp1 = df.groupby(["pclass"])
In [96]:
grp1.parch.value_counts().plot(kind="bar",figsize=(16,4))
Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x28280cd8308>
Notebook Image
In [95]:
grp1.survived.value_counts().plot(kind="bar",figsize=(16,4))
Out[95]:
<matplotlib.axes._subplots.AxesSubplot at 0x282ff376c88>
Notebook Image
In [86]:
df.head(2)
Out[86]:
In [87]:
grp2 = df.groupby(["pclass","embarked"])
In [92]:
grp2.pclass.value_counts()
Out[92]:
pclass  embarked  pclass
1       0         1         141
        1         1           3
        2         1         179
2       0         2          28
        1         2           7
        2         2         242
3       0         3         101
        1         3         113
        2         3         495
Name: pclass, dtype: int64
In [94]:
grp2.pclass.value_counts().plot(kind="bar",figsize=(16,4))
Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x282812ced88>
Notebook Image
In [ ]:
 
In [ ]: