Jovian
⭐️
Sign In
In [1]:
# libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
In [2]:
# Remote CSV holding the cancer dataset used throughout this notebook
CANCER_CSV_URL = "https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv"
cancer_df = pd.read_csv(CANCER_CSV_URL)
# Show the first rows as the cell's output
cancer_df.head()
Out[2]:
In [3]:
# Names of the nine predictor columns
feature_cols = [
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
]
# Feature matrix (X) and label vector (y)
X = cancer_df[feature_cols]
y = cancer_df['Malignant_Cancer']
# Preview the first rows of the features, then of the label
for preview in (X, y):
    print(preview.head())
Clump_Thickness Uniformity_of_Cell_Size Uniformity_of_Cell_Shape \ 0 5 1 1 1 5 4 4 2 3 1 1 3 6 8 8 4 4 1 1 Marginal_Adhesion Single_Epithelial_Cell_Size Bare_Nuclei \ 0 1 2 1 1 5 7 10 2 1 2 2 3 1 3 4 4 3 2 1 Bland_Chromatin Normal_Nucleoli Mitoses 0 3 1 1 1 3 2 1 2 3 1 1 3 3 7 1 4 3 1 1 0 0 1 0 2 0 3 0 4 0 Name: Malignant_Cancer, dtype: int64
In [4]:
# Hold out 35% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=3
)
# Report the shape of each piece: train features, train labels, test features, test labels
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)
(97, 9) (97,) (53, 9) (53,)
In [5]:
# Baseline model: a single decision tree fitted on the whole training split
my_decisiontree = DecisionTreeClassifier(random_state=3)
# fit() returns the estimator itself, so training and prediction can be chained
y_predict_dt = my_decisiontree.fit(X_train, y_train).predict(X_test)
# Per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_predict_dt))
# Raw predicted labels for the test split
print(y_predict_dt)
precision recall f1-score support 0 0.74 0.91 0.82 22 1 0.92 0.77 0.84 31 accuracy 0.83 53 macro avg 0.83 0.84 0.83 53 weighted avg 0.85 0.83 0.83 53 [0 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1]

Resampling and Bagging

In [6]:
# Manual bagging: draw 19 bootstrap resamples of the training split, fit one
# decision tree per resample, and evaluate each tree on the test split.
# NOTE: despite its name, `accuracy` collects each tree's test-set predictions
# (one array per tree), not accuracy scores.
accuracy = []
estimators = []
# Each bootstrap sample has 80% as many rows as the training split
X_train_Sample_size = int(0.8 * len(X_train))
for i in range(19):
    # Sample with replacement; seeding with the loop index makes every
    # resample reproducible while still differing from the others
    X_train_i, y_train_i = resample(X_train, y_train, n_samples=X_train_Sample_size, random_state=i, replace=True)
    # Fit a fresh tree per resample instead of refitting `my_decisiontree`,
    # so the baseline tree trained on the full training split is not overwritten
    tree_i = DecisionTreeClassifier(random_state=3)
    tree_i.fit(X_train_i, y_train_i)
    y_predict_dt_i = tree_i.predict(X_test)
    accuracy.append(y_predict_dt_i)
    # Per-class precision / recall / F1 for this tree
    print(classification_report(y_test, y_predict_dt_i))
    # Raw predicted labels for this tree
    print(y_predict_dt_i)
# Flatten every tree's predictions into one plain Python list.
# NOTE: this is a list of predicted labels, not of estimator objects.
estimators = np.concatenate(accuracy).ravel().tolist()
precision recall f1-score support 0 0.90 0.82 0.86 22 1 0.88 0.94 0.91 31 accuracy 0.89 53 macro avg 0.89 0.88 0.88 53 weighted avg 0.89 0.89 0.89 53 [1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.90 0.86 0.88 22 1 0.91 0.94 0.92 31 accuracy 0.91 53 macro avg 0.91 0.90 0.90 53 weighted avg 0.91 0.91 0.91 53 [1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1] precision recall f1-score support 0 0.77 0.91 0.83 22 1 0.93 0.81 0.86 31 accuracy 0.85 53 macro avg 0.85 0.86 0.85 53 weighted avg 0.86 0.85 0.85 53 [1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.75 0.95 0.84 22 1 0.96 0.77 0.86 31 accuracy 0.85 53 macro avg 0.85 0.86 0.85 53 weighted avg 0.87 0.85 0.85 53 [0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1] precision recall f1-score support 0 0.86 0.86 0.86 22 1 0.90 0.90 0.90 31 accuracy 0.89 53 macro avg 0.88 0.88 0.88 53 weighted avg 0.89 0.89 0.89 53 [1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.83 0.91 0.87 22 1 0.93 0.87 0.90 31 accuracy 0.89 53 macro avg 0.88 0.89 0.88 53 weighted avg 0.89 0.89 0.89 53 [1 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1] precision recall f1-score support 0 0.86 0.86 0.86 22 1 0.90 0.90 0.90 31 accuracy 0.89 53 macro avg 0.88 0.88 0.88 53 weighted avg 0.89 0.89 0.89 53 [1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.80 0.91 0.85 22 1 0.93 0.84 0.88 31 accuracy 0.87 53 macro avg 0.86 0.87 0.87 53 weighted avg 0.88 0.87 0.87 53 [0 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 
1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1] precision recall f1-score support 0 0.71 0.91 0.80 22 1 0.92 0.74 0.82 31 accuracy 0.81 53 macro avg 0.82 0.83 0.81 53 weighted avg 0.83 0.81 0.81 53 [1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.77 0.91 0.83 22 1 0.93 0.81 0.86 31 accuracy 0.85 53 macro avg 0.85 0.86 0.85 53 weighted avg 0.86 0.85 0.85 53 [1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 1 1] precision recall f1-score support 0 0.90 0.86 0.88 22 1 0.91 0.94 0.92 31 accuracy 0.91 53 macro avg 0.91 0.90 0.90 53 weighted avg 0.91 0.91 0.91 53 [1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 1 1 1 1 1] precision recall f1-score support 0 0.83 0.86 0.84 22 1 0.90 0.87 0.89 31 accuracy 0.87 53 macro avg 0.86 0.87 0.86 53 weighted avg 0.87 0.87 0.87 53 [0 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.73 0.86 0.79 22 1 0.89 0.77 0.83 31 accuracy 0.81 53 macro avg 0.81 0.82 0.81 53 weighted avg 0.82 0.81 0.81 53 [0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1] precision recall f1-score support 0 0.67 0.91 0.77 22 1 0.91 0.68 0.78 31 accuracy 0.77 53 macro avg 0.79 0.79 0.77 53 weighted avg 0.81 0.77 0.77 53 [0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0] precision recall f1-score support 0 0.83 0.91 0.87 22 1 0.93 0.87 0.90 31 accuracy 0.89 53 macro avg 0.88 0.89 0.88 53 weighted avg 0.89 0.89 0.89 53 [1 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1] precision recall f1-score support 0 0.83 0.91 0.87 22 1 0.93 0.87 0.90 31 accuracy 0.89 53 macro avg 0.88 0.89 0.88 53 weighted 
avg 0.89 0.89 0.89 53 [1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1] precision recall f1-score support 0 0.87 0.91 0.89 22 1 0.93 0.90 0.92 31 accuracy 0.91 53 macro avg 0.90 0.91 0.90 53 weighted avg 0.91 0.91 0.91 53 [1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1] precision recall f1-score support 0 0.83 0.86 0.84 22 1 0.90 0.87 0.89 31 accuracy 0.87 53 macro avg 0.86 0.87 0.86 53 weighted avg 0.87 0.87 0.87 53 [1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1] precision recall f1-score support 0 0.87 0.91 0.89 22 1 0.93 0.90 0.92 31 accuracy 0.91 53 macro avg 0.90 0.91 0.90 53 weighted avg 0.91 0.91 0.91 53 [1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]

Performing voting - Does not work???!

In [7]:
# Majority vote across the bootstrap trees.
# sklearn's VotingClassifier expects a list of (name, estimator) pairs, but
# `estimators` is a flat list of predicted labels, so it cannot be passed in.
# Instead, vote directly on the stored predictions: `accuracy` holds one array
# of test-set predictions per tree.
votes = np.array(accuracy)  # one row per tree, one column per test sample
# For every test sample keep the label predicted by the most trees.
# np.bincount needs non-negative integer labels (they are 0/1 in this dataset);
# on a tie the smallest label wins.
y_predict_vote = np.array([np.bincount(sample_votes).argmax() for sample_votes in votes.T])
# Per-class precision / recall / F1 of the voted ensemble on the test split
print(classification_report(y_test, y_predict_vote))

Random Forest

In [8]:
# Random forest with 19 bootstrapped trees, the same number of trees as the
# manual bagging loop (RandomForestClassifier is imported in the import cell)
RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=3)
RandomForest.fit(X_train, y_train)
R_predict = RandomForest.predict(X_test)

# Per-class precision / recall / F1 of the forest on the test split
print(classification_report(y_test, R_predict))
precision recall f1-score support 0 0.91 0.91 0.91 22 1 0.94 0.94 0.94 31 accuracy 0.92 53 macro avg 0.92 0.92 0.92 53 weighted avg 0.92 0.92 0.92 53