import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Read the modified Titanic passenger table and take a first look at it.
df = pd.read_csv("titanic_dataset_modified.csv")
df.head(2)
# List the column labels, then report how many columns there are.
print(df.columns)
len(df.columns)
Index(['Unnamed: 0', 'pclass', 'survived', 'name', 'gender', 'age', 'sibsp',
'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body',
'home.dest'],
dtype='object')
15
# Keep only the first row of the summary-statistics table.
df.describe().iloc[:1]
# Number of missing entries in each column.
df.isna().sum()
Unnamed: 0 0
pclass 0
survived 0
name 0
gender 0
age 263
sibsp 0
parch 0
ticket 0
fare 1
cabin 1014
embarked 2
boat 823
body 1188
home.dest 564
dtype: int64
# Visualise where values are missing: one heatmap cell per (row, column).
plt.figure(figsize=(6, 4))
missing_mask = df.isna()
sns.heatmap(missing_mask, yticklabels=False, cbar=False)
<matplotlib.axes._subplots.AxesSubplot at 0x282f410b7c8>
# Show the dtype pandas assigned to each column on load.
df.dtypes
Unnamed: 0 int64
pclass int64
survived int64
name object
gender object
age float64
sibsp int64
parch int64
ticket object
fare object
cabin object
embarked object
boat object
body float64
home.dest object
dtype: object
# Discard the columns listed here; the frame is modified in place.
unused_columns = ["Unnamed: 0", "name", "ticket", "cabin", "boat", "body", "home.dest"]
df.drop(columns=unused_columns, inplace=True)
df.head(2)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
pclass 1309 non-null int64
survived 1309 non-null int64
gender 1309 non-null object
age 1046 non-null float64
sibsp 1309 non-null int64
parch 1309 non-null int64
fare 1308 non-null object
embarked 1307 non-null object
dtypes: float64(1), int64(4), object(3)
memory usage: 81.9+ KB
# Downcast the numeric columns to narrower dtypes to shrink the frame.
compact_dtypes = {
    "pclass": "int8",
    "survived": "int8",
    "age": "float16",
    "sibsp": "float16",
    "parch": "float16",
}
for column_name, dtype_name in compact_dtypes.items():
    df[column_name] = df[column_name].astype(dtype_name)
df.info() # 50% memory reduction
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
pclass 1309 non-null int8
survived 1309 non-null int8
gender 1309 non-null object
age 1046 non-null float16
sibsp 1309 non-null float16
parch 1309 non-null float16
fare 1308 non-null object
embarked 1307 non-null object
dtypes: float16(3), int8(2), object(3)
memory usage: 41.0+ KB
# Fares are stored as text with a "$" sign: strip the sign first, then
# convert the column to a numeric dtype.
# The pattern is a raw string so that "\$" is a valid regex escape, and the
# result is assigned back to the column instead of relying on an in-place
# replace on `df.fare`, which is not guaranteed to write through to `df`.
df["fare"] = df["fare"].replace({r"\$": ""}, regex=True)
print(df.fare[:5])
# NOTE(review): float16 keeps only about three significant digits, so fares
# are rounded by this cast (e.g. 151.55 becomes 151.5). Use float32 if the
# exact amounts matter.
df["fare"] = df["fare"].astype("float16")
print(df.fare[:5])
0 211.34
1 151.55
2 151.55
3 151.55
4 151.55
Name: fare, dtype: object
0 211.375
1 151.500
2 151.500
3 151.500
4 151.500
Name: fare, dtype: float16
# Impute missing values column by column: the median for numeric columns,
# the most frequent value (mode) for every other column.
for k in df.columns:
    if not df[k].isnull().any():
        continue  # nothing to impute in this column
    # Assign the filled column back to the frame; calling fillna(inplace=True)
    # on `df[k]` may act on a temporary object and leave `df` unchanged.
    if pd.api.types.is_numeric_dtype(df[k]):
        df[k] = df[k].fillna(df[k].median())
    else:
        df[k] = df[k].fillna(df[k].mode()[0])
# Re-check the column dtypes after filling the missing values.
df.dtypes
pclass int8
survived int8
gender object
age float16
sibsp float16
parch float16
fare float16
embarked object
dtype: object
# Print the non-null counts, dtypes and memory usage of the frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
pclass 1309 non-null int8
survived 1309 non-null int8
gender 1309 non-null object
age 1309 non-null float16
sibsp 1309 non-null float16
parch 1309 non-null float16
fare 1309 non-null float16
embarked 1309 non-null object
dtypes: float16(4), int8(2), object(2)
memory usage: 33.4+ KB
# Per-column count of missing values; every count should now be zero.
df.isnull().sum() # no missing values remain; the heatmap shows the same thing visually
pclass 0
survived 0
gender 0
age 0
sibsp 0
parch 0
fare 0
embarked 0
dtype: int64
# Re-draw the missing-value map to confirm visually that no gaps remain.
sns.heatmap(df.isnull(),yticklabels=False,cbar=False) # clearly shows, no more missing values
plt.show()
df.head(2)
# Number of passengers in each ticket class.
# The variable is passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`, so a positional column is misinterpreted.
plt.figure(figsize=(16,4))
sns.countplot(x="pclass", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5bf4588>
# Passengers per ticket class, split by survival outcome.
# Variables are passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`.
plt.figure(figsize=(16,4))
sns.countplot(x="pclass", hue="survived", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5bb6a08>
# Passengers per gender, split by survival outcome.
# Variables are passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`.
plt.figure(figsize=(16,4))
sns.countplot(x="gender", hue="survived", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5ce9a88>
# Passengers per gender, drawn with the "whitegrid" seaborn style.
# The variable is passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`.
plt.figure(figsize=(16,4))
sns.set_style("whitegrid")
sns.countplot(x="gender", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5da00c8>
df.head(2)
# Passengers per value of `sibsp`.
# The variable is passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`.
plt.figure(figsize=(16,4))
sns.countplot(x="sibsp", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5df2dc8>
# Passengers per value of `sibsp`, split by survival outcome.
# Variables are passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`.
plt.figure(figsize=(16,4))
sns.countplot(x="sibsp", hue="survived", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5ef9648>
# Passengers per value of `embarked`, split by survival outcome.
# Variables are passed by keyword: since seaborn 0.12 the only argument
# accepted positionally is `data`.
plt.figure(figsize=(16,4))
sns.countplot(x="embarked", hue="survived", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5f8eb08>
df.head(2)
# Age against `embarked`, coloured by survival outcome.
# x and y are passed by keyword: seaborn 0.12+ raises a TypeError when two
# data vectors are given positionally.
plt.figure(figsize=(16,4))
sns.scatterplot(x="embarked", y="age", hue="survived", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f5fcc948>
# Age against ticket class, coloured by survival outcome.
# x and y are passed by keyword: seaborn 0.12+ raises a TypeError when two
# data vectors are given positionally.
plt.figure(figsize=(16,4))
sns.scatterplot(x="pclass", y="age", hue="survived", data=df)
plt.xticks([1,2,3])  # show only the three class labels on the x axis
plt.show()
# Distribution of passenger ages in 20 bins.
# NOTE(review): seaborn has deprecated distplot since 0.11 in favour of
# histplot/displot; it is kept here so the notebook still runs on older
# seaborn releases.
plt.figure(figsize=(16,4))
sns.distplot(df.age,bins=20,color="r")
<matplotlib.axes._subplots.AxesSubplot at 0x282f603e908>
df.head(2)
# `sibsp` against age; colour encodes survival and marker size encodes fare.
# All variables are passed by keyword: seaborn 0.12+ raises a TypeError when
# two data vectors are given positionally.
plt.figure(figsize=(16,4))
sns.scatterplot(x="age", y="sibsp", hue="survived", size="fare", data=df)
plt.show()
# Passengers per gender, split by `embarked`.
plt.figure(figsize=(16,4))
sns.countplot(x="gender", hue="embarked", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f61d1f88>
# Fare against age, coloured by survival outcome.
# x and y are passed by keyword: seaborn 0.12+ raises a TypeError when two
# data vectors are given positionally.
plt.figure(figsize=(16,4))
sns.scatterplot(x="age", y="fare", hue="survived", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x282f7392e88>
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import scikitplot as skp
df.head(2)
# Replace the two text columns with integer codes. The same encoder object
# is re-fitted for each column, so afterwards it holds the classes of the
# last column processed ("embarked").
le = LabelEncoder()
for text_column in ("gender", "embarked"):
    le.fit(df[text_column])
    df[text_column] = le.transform(df[text_column])
df.head(2)
# Predictor columns and the target column.
feature_columns = ["pclass", "gender", "age", "sibsp", "parch", "fare", "embarked"]
X = df[feature_columns]
y = df["survived"]
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=20
)
# Fit a logistic regression with the library's default settings.
logr = LogisticRegression()
logr.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
# Predict a label for every row of the test split and display the result.
y_pred = logr.predict(X_test)
y_pred
array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0],
dtype=int8)
# Confusion matrix of the true test labels (first argument) against the predictions.
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat
array([[131, 23],
[ 37, 71]], dtype=int64)
# Plot the true-vs-predicted comparison with scikit-plot.
skp.metrics.plot_confusion_matrix(y_test,y_pred)
<matplotlib.axes._subplots.AxesSubplot at 0x282f7a860c8>
# Draw the confusion matrix as a heatmap, annotating each cell with its integer count.
sns.heatmap(conf_mat,annot=True,fmt="d")
<matplotlib.axes._subplots.AxesSubplot at 0x282f53b6cc8>
# ROC curve for the logistic model, computed by scikit-plot from the training split.
# NOTE(review): this call emits a DeprecationWarning that names
# scikitplot.metrics.plot_roc_curve as the replacement and announces removal
# in v0.4.0 — migrate before upgrading scikit-plot.
skp.classifiers.plot_roc_curve_with_cv(logr,X_train.values,y_train.values)
C:\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:85: DeprecationWarning: Function plot_roc_curve is deprecated; This will be removed in v0.4.0. Please use scikitplot.metrics.plot_roc_curve instead.
warnings.warn(msg, category=DeprecationWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x282f95dff48>
# Fit a random forest with the library's default settings.
# NOTE(review): no random_state is passed here although the train/test split
# is seeded, so the scores that follow may vary between runs — consider seeding.
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# Overwrite y_pred with the random forest's predictions for the test split.
y_pred = rfc.predict(X_test)
y_pred
array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0],
dtype=int8)
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(y_test,y_pred)
array([[139, 15],
[ 33, 75]], dtype=int64)
# Accuracy on the test split, expressed as a percentage.
accuracy_score(y_test,y_pred)*100
81.67938931297711
# ROC AUC on the test split, scored from the second predict_proba column.
# NOTE(review): column 1 is presumably the probability of class 1 (survived);
# confirm against rfc.classes_.
y_prob = rfc.predict_proba(X_test)
roc_auc_score(y_test,y_prob[:,1])
0.8407287157287155
# ROC curve for the random forest, computed by scikit-plot from the training split.
# NOTE(review): this call emits a DeprecationWarning that names
# scikitplot.metrics.plot_roc_curve as the replacement and announces removal
# in v0.4.0 — migrate before upgrading scikit-plot.
skp.classifiers.plot_roc_curve_with_cv(rfc,X_train.values,y_train.values)
C:\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:85: DeprecationWarning: Function plot_roc_curve is deprecated; This will be removed in v0.4.0. Please use scikitplot.metrics.plot_roc_curve instead.
warnings.warn(msg, category=DeprecationWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x282fa4abec8>
df.head(2)
# Group the rows by ticket class, then bar-plot how often each `parch`
# value occurs within each class.
grp1 = df.groupby(["pclass"])
grp1.parch.value_counts().plot(kind="bar",figsize=(16,4))
<matplotlib.axes._subplots.AxesSubplot at 0x28280cd8308>
# Bar-plot the count of each `survived` value within each ticket class.
grp1.survived.value_counts().plot(kind="bar",figsize=(16,4))
<matplotlib.axes._subplots.AxesSubplot at 0x282ff376c88>
df.head(2)
# Group by (pclass, embarked) and count the rows in each group. `pclass` is
# both a grouping key and the counted column, so it appears twice in the
# resulting index.
grp2 = df.groupby(["pclass","embarked"])
grp2.pclass.value_counts()
pclass embarked pclass
1 0 1 141
1 1 3
2 1 179
2 0 2 28
1 2 7
2 2 242
3 0 3 101
1 3 113
2 3 495
Name: pclass, dtype: int64
# Bar-plot the number of passengers in each (pclass, embarked) group.
grp2.pclass.value_counts().plot(kind="bar",figsize=(16,4))
<matplotlib.axes._subplots.AxesSubplot at 0x282812ced88>