Cancer Prediction - Notebook by Mayank (saboomayank)

Learn practical skills, build real-world projects, and advance your career

Updated 4 years ago

# libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn import metrics
from sklearn.utils import resample
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the cancer dataset directly from the course repository on GitHub.
cancer_df = pd.read_csv(
    "https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv"
)
# Trailing expression: the notebook renders the first rows as the cell output.
cancer_df.head()

# Names of the nine input columns used as model features.
feature_cols = [
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
]

# Feature matrix: only the columns listed above.
X = cancer_df[feature_cols]
# Target vector: the 'Malignant_Cancer' column.
y = cancer_df['Malignant_Cancer']

# Show the first rows of the features, then of the label, one after the other.
print(X.head(), y.head(), sep="\n")

   Clump_Thickness  Uniformity_of_Cell_Size  Uniformity_of_Cell_Shape  \
0                5                        1                         1   
1                5                        4                         4   
2                3                        1                         1   
3                6                        8                         8   
4                4                        1                         1   

   Marginal_Adhesion  Single_Epithelial_Cell_Size  Bare_Nuclei  \
0                  1                            2            1   
1                  5                            7           10   
2                  1                            2            2   
3                  1                            3            4   
4                  3                            2            1   

   Bland_Chromatin  Normal_Nucleoli  Mitoses  
0                3                1        1  
1                3                2        1  
2                3                1        1  
3                3                7        1  
4                3                1        1  
0    0
1    0
2    0
3    0
4    0
Name: Malignant_Cancer, dtype: int64

# Hold out 35% of the rows for testing; random_state=3 makes the shuffle
# (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=3,
)

# Report the shape of each partition, one per line:
# training features, training labels, testing features, testing labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(97, 9)
(97,)
(53, 9)
(53,)

# Build a decision-tree classifier (seeded for reproducibility) and train it
# on the training partition; fit() returns the fitted estimator itself.
my_decisiontree = DecisionTreeClassifier(random_state=3).fit(X_train, y_train)

# Predict labels for the held-out test partition.
y_predict_dt = my_decisiontree.predict(X_test)

# Per-class precision / recall / F1 plus overall accuracy on the test set.
print(classification_report(y_test, y_predict_dt))
# Raw predicted labels for the test rows.
print(y_predict_dt)

              precision    recall  f1-score   support

           0       0.74      0.91      0.82        22
           1       0.92      0.77      0.84        31

    accuracy                           0.83        53
   macro avg       0.83      0.84      0.83        53
weighted avg       0.85      0.83      0.83        53

[0 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1]