# libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn import metrics
from sklearn.utils import resample
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Read the cancer dataset directly from the course repository.
cancer_df = pd.read_csv(
    "https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv"
)
# Notebook-style preview of the first rows; the result is not printed
# when this file runs as a plain script.
cancer_df.head()

# The nine columns used as model inputs.
feature_cols = [
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
]

# Feature matrix and target column.
X = cancer_df[feature_cols]
y = cancer_df['Malignant_Cancer']

# Show the first rows of the features, then of the labels.
for preview in (X.head(), y.head()):
    print(preview)
Clump_Thickness Uniformity_of_Cell_Size Uniformity_of_Cell_Shape \
0 5 1 1
1 5 4 4
2 3 1 1
3 6 8 8
4 4 1 1
Marginal_Adhesion Single_Epithelial_Cell_Size Bare_Nuclei \
0 1 2 1
1 5 7 10
2 1 2 2
3 1 3 4
4 3 2 1
Bland_Chromatin Normal_Nucleoli Mitoses
0 3 1 1
1 3 2 1
2 3 1 1
3 3 7 1
4 3 1 1
0 0
1 0
2 0
3 0
4 0
Name: Malignant_Cancer, dtype: int64
# Hold out 35% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=3,
)

# Print the shape of each partition: training features and labels first,
# then test features and labels.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)
(97, 9)
(97,)
(53, 9)
(53,)
# Baseline model: one decision tree trained on the full training set.
# fit() returns the estimator itself, so construction and training chain.
my_decisiontree = DecisionTreeClassifier(random_state=3).fit(X_train, y_train)

# Predict the held-out test rows.
y_predict_dt = my_decisiontree.predict(X_test)

# Per-class precision/recall/F1 plus overall accuracy on the test set.
baseline_report = classification_report(y_test, y_predict_dt)
print(baseline_report)

# Raw predicted labels.
print(y_predict_dt)
precision recall f1-score support
0 0.74 0.91 0.82 22
1 0.92 0.77 0.84 31
accuracy 0.83 53
macro avg 0.83 0.84 0.83 53
weighted avg 0.85 0.83 0.83 53
[0 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1]
# Bagging: train 19 decision trees, each on its own bootstrap sample of the
# training set, then combine their test-set predictions by majority vote.
n_models = 19

# Each bootstrap sample holds 80% as many rows as the training set.
X_train_Sample_size = int(0.8 * len(X_train))

accuracy = []     # test-set accuracy score of each individual tree
estimators = []   # (name, fitted tree) pairs, the shape VotingClassifier takes
predictions = []  # test-set predictions of each individual tree

for i in range(n_models):
    # Sample with replacement; seeding with the loop index gives every tree
    # a different but reproducible sample.
    X_train_i, y_train_i = resample(
        X_train,
        y_train,
        n_samples=X_train_Sample_size,
        random_state=i,
        replace=True,
    )

    # Fit a fresh tree per sample so that the baseline `my_decisiontree`
    # is not overwritten and every fitted model is kept.
    tree_i = DecisionTreeClassifier(random_state=3)
    tree_i.fit(X_train_i, y_train_i)
    y_predict_dt_i = tree_i.predict(X_test)

    estimators.append((f"tree_{i}", tree_i))
    predictions.append(y_predict_dt_i)
    accuracy.append(accuracy_score(y_test, y_predict_dt_i))

    # Per-model report and raw predictions.
    print(classification_report(y_test, y_predict_dt_i))
    print(y_predict_dt_i)

# Stack into an (n_models, n_test_samples) array: one row per tree.
predictions = np.asarray(predictions)

# Majority vote per test sample: the most frequent label in each column.
# np.bincount needs non-negative integer labels (0/1 in this dataset);
# on a tie argmax picks the smallest label, which cannot happen with an
# odd number of voters and two classes.
y_predict_bagging = np.apply_along_axis(
    lambda votes: np.bincount(votes).argmax(), 0, predictions
)

# Compare the individual trees with the voted ensemble.
print("Mean accuracy of individual trees:", np.mean(accuracy))
print("Bagging (majority vote) accuracy:", accuracy_score(y_test, y_predict_bagging))
print(classification_report(y_test, y_predict_bagging))
precision recall f1-score support
0 0.90 0.82 0.86 22
1 0.88 0.94 0.91 31
accuracy 0.89 53
macro avg 0.89 0.88 0.88 53
weighted avg 0.89 0.89 0.89 53
[1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 1 0 0 0
0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.90 0.86 0.88 22
1 0.91 0.94 0.92 31
accuracy 0.91 53
macro avg 0.91 0.90 0.90 53
weighted avg 0.91 0.91 0.91 53
[1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.77 0.91 0.83 22
1 0.93 0.81 0.86 31
accuracy 0.85 53
macro avg 0.85 0.86 0.85 53
weighted avg 0.86 0.85 0.85 53
[1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.75 0.95 0.84 22
1 0.96 0.77 0.86 31
accuracy 0.85 53
macro avg 0.85 0.86 0.85 53
weighted avg 0.87 0.85 0.85 53
[0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1]
precision recall f1-score support
0 0.86 0.86 0.86 22
1 0.90 0.90 0.90 31
accuracy 0.89 53
macro avg 0.88 0.88 0.88 53
weighted avg 0.89 0.89 0.89 53
[1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.83 0.91 0.87 22
1 0.93 0.87 0.90 31
accuracy 0.89 53
macro avg 0.88 0.89 0.88 53
weighted avg 0.89 0.89 0.89 53
[1 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.86 0.86 0.86 22
1 0.90 0.90 0.90 31
accuracy 0.89 53
macro avg 0.88 0.88 0.88 53
weighted avg 0.89 0.89 0.89 53
[1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.80 0.91 0.85 22
1 0.93 0.84 0.88 31
accuracy 0.87 53
macro avg 0.86 0.87 0.87 53
weighted avg 0.88 0.87 0.87 53
[0 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.71 0.91 0.80 22
1 0.92 0.74 0.82 31
accuracy 0.81 53
macro avg 0.82 0.83 0.81 53
weighted avg 0.83 0.81 0.81 53
[1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0
0 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.77 0.91 0.83 22
1 0.93 0.81 0.86 31
accuracy 0.85 53
macro avg 0.85 0.86 0.85 53
weighted avg 0.86 0.85 0.85 53
[1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 1 0 0 0 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.90 0.86 0.88 22
1 0.91 0.94 0.92 31
accuracy 0.91 53
macro avg 0.91 0.90 0.90 53
weighted avg 0.91 0.91 0.91 53
[1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0
0 1 0 0 0 1 1 0 1 1 0 1 1 1 1 1]
precision recall f1-score support
0 0.83 0.86 0.84 22
1 0.90 0.87 0.89 31
accuracy 0.87 53
macro avg 0.86 0.87 0.86 53
weighted avg 0.87 0.87 0.87 53
[0 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.73 0.86 0.79 22
1 0.89 0.77 0.83 31
accuracy 0.81 53
macro avg 0.81 0.82 0.81 53
weighted avg 0.82 0.81 0.81 53
[0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.67 0.91 0.77 22
1 0.91 0.68 0.78 31
accuracy 0.77 53
macro avg 0.79 0.79 0.77 53
weighted avg 0.81 0.77 0.77 53
[0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0]
precision recall f1-score support
0 0.83 0.91 0.87 22
1 0.93 0.87 0.90 31
accuracy 0.89 53
macro avg 0.88 0.89 0.88 53
weighted avg 0.89 0.89 0.89 53
[1 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.83 0.91 0.87 22
1 0.93 0.87 0.90 31
accuracy 0.89 53
macro avg 0.88 0.89 0.88 53
weighted avg 0.89 0.89 0.89 53
[1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0
0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
precision recall f1-score support
0 0.87 0.91 0.89 22
1 0.93 0.90 0.92 31
accuracy 0.91 53
macro avg 0.90 0.91 0.90 53
weighted avg 0.91 0.91 0.91 53
[1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1]
precision recall f1-score support
0 0.83 0.86 0.84 22
1 0.90 0.87 0.89 31
accuracy 0.87 53
macro avg 0.86 0.87 0.86 53
weighted avg 0.87 0.87 0.87 53
[1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0
0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1]
precision recall f1-score support
0 0.87 0.91 0.89 22
1 0.93 0.90 0.92 31
accuracy 0.91 53
macro avg 0.90 0.91 0.90 53
weighted avg 0.91 0.91 0.91 53
[1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
# # Create an ensemble model and verify it with cross_val_score using KFold
# kfold = KFold(n_splits=19, random_state=3)
# ensemble = VotingClassifier(estimators)
# results = cross_val_score(ensemble, X_train, y_train, cv=kfold)
# print(results.mean())
from sklearn.ensemble import RandomForestClassifier

# Random forest with 19 bootstrapped trees, trained on the full training
# set. fit() returns the estimator itself, so construction and training
# chain.
RandomForest = RandomForestClassifier(
    n_estimators=19,
    bootstrap=True,
    random_state=3,
).fit(X_train, y_train)

# Predict the held-out test rows and print the per-class report.
R_predict = RandomForest.predict(X_test)
forest_report = classification_report(y_test, R_predict)
print(forest_report)
precision recall f1-score support
0 0.91 0.91 0.91 22
1 0.94 0.94 0.94 31
accuracy 0.92 53
macro avg 0.92 0.92 0.92 53
weighted avg 0.92 0.92 0.92 53