Jovian
⭐️
Sign In
In [93]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_validate,KFold,cross_val_score,cross_val_predict,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,roc_auc_score
from sklearn.datasets import load_iris,load_diabetes
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.simplefilter("ignore")
Get data
In [81]:
data = load_iris()
d = data.data
t = data.target
In [82]:
print(data.target_names)
['setosa' 'versicolor' 'virginica']
get shape
In [83]:
d.shape
Out[83]:
(150, 4)
In [84]:
t.shape
Out[84]:
(150,)
Train and test split
In [85]:
X_train, X_test, y_train, y_test = train_test_split(d,t,test_size=0.2, random_state=5)
Fit model
In [86]:
lr = LogisticRegression()
lr
Out[86]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [87]:
lr.fit(X_train,y_train)
Out[87]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
Predict
In [88]:
y_pred = lr.predict(X_test)
y_pred
Out[88]:
array([1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
       0, 1, 1, 2, 1, 1, 1, 2])
In [89]:
y_test
Out[89]:
array([1, 2, 2, 0, 2, 1, 0, 1, 0, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
       0, 1, 1, 2, 1, 1, 1, 2])
Check accuracy, usign direct function
In [11]:
accuracy_score(y_test,y_pred)
Out[11]:
0.9
manually checking the score
True rate
In [12]:
sum([True if y_test[i]==v else False for i,v in enumerate(y_pred)])/len(y_test)
Out[12]:
0.9
False rate
In [13]:
sum([True if y_test[i]!=v else False for i,v in enumerate(y_pred)])/len(y_test)
Out[13]:
0.1
In [14]:
# or 1-True rate
1-sum([True if y_test[i]==v else False for i,v in enumerate(y_pred)])/len(y_test)
# ~= .1
Out[14]:
0.09999999999999998
confusion matrix
In [15]:
data.target_names
Out[15]:
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
In [16]:
cm = confusion_matrix(y_test,y_pred)
cm
Out[16]:
array([[ 8,  0,  0],
       [ 0,  8,  3],
       [ 0,  0, 11]], dtype=int64)
In [17]:
df = pd.DataFrame(d)
df.plot(kind="kde")
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x27c582665c8>
Notebook Image
In [18]:
df["target"] = data.target
df
Out[18]:
In [19]:
sns.scatterplot(df.iloc[:,0],df.iloc[:,1],hue=df.target,palette="winter")
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x27c587b3048>
Notebook Image
In [20]:
# sns.distplot(df.iloc[:,0],hist=False,label=df.columns[0])
# sns.distplot(df.iloc[:,1],hist=False,label=df.columns[1])
# sns.distplot(df.iloc[:,2],hist=False,label=df.columns[2])
# sns.distplot(df.iloc[:,3],hist=False,label=df.columns[3])
In [21]:
sns.heatmap(cm,annot=True,cbar=False,cmap="winter",xticklabels=data.target_names,yticklabels=data.target_names,linewidths=.1)
plt.title("Confusion Matrix")
plt.ylabel("True_Labels")
plt.xlabel("Predicted_Labels")
Out[21]:
Text(0.5, 15.0, 'Predicted_Labels')
Notebook Image
Classification report
In [22]:
print(classification_report(y_test,y_pred))
precision recall f1-score support 0 1.00 1.00 1.00 8 1 1.00 0.73 0.84 11 2 0.79 1.00 0.88 11 accuracy 0.90 30 macro avg 0.93 0.91 0.91 30 weighted avg 0.92 0.90 0.90 30
precision_score: to check the precision
In [23]:
precision_score(y_test,y_pred,average="macro")
Out[23]:
0.9285714285714285

Naive bayes algo

In [95]:
data = load_iris()
d = data.data
t = data.target
X_train, X_test, y_train, y_test = train_test_split(d,t,test_size=0.2, random_state=5)
In [96]:
nb = MultinomialNB()
In [97]:
nb.fit(X_train,y_train)
Out[97]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [100]:
y_pred = nb.predict(X_test)
y_pred
Out[100]:
array([1, 2, 2, 0, 2, 1, 0, 2, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
       0, 1, 1, 2, 1, 1, 1, 2])
In [101]:
confusion_matrix(y_test,y_pred)
Out[101]:
array([[ 8,  0,  0],
       [ 0, 10,  1],
       [ 0,  1, 10]], dtype=int64)
In [102]:
accuracy_score(y_test,y_pred)
Out[102]:
0.9333333333333333

Cross_Validate

In [24]:
cv = cross_validate(lr,X_test,y_test)
In [25]:
cv
Out[25]:
{'fit_time': array([0.00399613, 0.        , 0.00399637]),
 'score_time': array([0., 0., 0.]),
 'test_score': array([0.81818182, 0.81818182, 0.875     ])}
In [26]:
data = load_iris()
d = data.data
t = data.target
In [27]:
kf = KFold(n_splits=3)
In [28]:
# d = np.random.randint(-10,30,10)
# t = np.random.randint(-2,0,10)
gnr = kf.split(d,t)
In [29]:
# lst = list((gnr))
# lst
In [30]:
lr1 = LogisticRegression()
In [31]:
for train_idx,test_idx in gnr:
    X_train = d[train_idx]
    y_train = t[train_idx]
    X_test = d[train_idx]
    y_test = t[train_idx]
    lr1.fit(X_train,y_train)
    y_pred = lr1.predict(X_test)
    print("Accuracy score is::",accuracy_score(y_test,y_pred))
Accuracy score is:: 0.97 Accuracy score is:: 1.0 Accuracy score is:: 1.0

using Above test we can not say whether split 3 is good K value. Lets use K from 0 to 30 and see the score for each K fold

For Linear Regression

In [32]:
lr2 = LogisticRegression()
lr2_score = []
for kFold in range(2,30):
    kf = KFold(n_splits=kFold)
    gnr = kf.split(d,t)
    tmp_score = []
    for train_idx,test_idx in gnr:
        X_train = d[train_idx]
        y_train = t[train_idx]
        X_test = d[train_idx]
        y_test = t[train_idx]
        lr1.fit(X_train,y_train)
        y_pred = lr1.predict(X_test)
        tmp_score.append(accuracy_score(y_test,y_pred))
    lr2_score.append(sum(tmp_score)/len(tmp_score))
print("Final score values are::",lr2_score)
Final score values are:: [0.98, 0.9899999999999999, 0.9155736409608091, 0.9316666666666666, 0.9359999999999999, 0.9444732834994463, 0.9523623640990053, 0.9508098604720758, 0.9555555555555557, 0.9559848159569071, 0.9557680101555063, 0.9561127018117948, 0.955894875935986, 0.9552380952380951, 0.9573423252279638, 0.9587483180266415, 0.9592254076071877, 0.9603720963987827, 0.9610583078892937, 0.9603377842814462, 0.9609512961785689, 0.9612175264349176, 0.9611641252266252, 0.9613888888888888, 0.9615992484526968, 0.9620530012771396, 0.9629635331143953, 0.9630978332672745]
In [33]:
plt.figure(figsize=(16,4))
sns.lineplot(range(len(lr2_score)),lr2_score)
for idx in range(len(lr2_score)):
    plt.text(idx,lr2_score[idx],round(lr2_score[idx],2))
plt.xticks(range(2,30))
plt.show()
Notebook Image

For KNeighborsClassifier

In [34]:
knn = KNeighborsClassifier()
knn_score = []
for kFold in range(2,30):
    kf = KFold(n_splits=kFold)
    gnr = kf.split(d,t)

    tmp_score = []
    for train_idx,test_idx in gnr:
        X_train = d[train_idx]
        y_train = t[train_idx]
        X_test = d[train_idx]
        y_test = t[train_idx]
        knn.fit(X_train,y_train)
        y_pred = knn.predict(X_test)
        tmp_score.append(accuracy_score(y_test,y_pred))
    knn_score.append(sum(tmp_score)/len(tmp_score))
# print("Final score values are::",knn_score)
In [35]:
plt.figure(figsize=(16,4))
sns.lineplot(range(len(knn_score)),knn_score)
for idx in range(len(knn_score)):
    plt.text(idx,knn_score[idx],round(knn_score[idx],2))
plt.xticks(range(2,30))
plt.show()
Notebook Image

For diabetes dataset

In [36]:
data = load_diabetes()
d = data.data
t = data.target
In [37]:
knn = KNeighborsClassifier()
knn_score = []
for kFold in range(2,30):
    kf = KFold(n_splits=kFold)
    gnr = kf.split(d,t)

    tmp_score = []
    for train_idx,test_idx in gnr:
        X_train = d[train_idx]
        y_train = t[train_idx]
        X_test = d[train_idx]
        y_test = t[train_idx]
        knn.fit(X_train,y_train)
        y_pred = knn.predict(X_test)
        tmp_score.append(accuracy_score(y_test,y_pred))
    knn_score.append(sum(tmp_score)/len(tmp_score))
# print("Final score values are::",knn_score)
In [38]:
plt.figure(figsize=(16,4))
sns.lineplot(range(len(knn_score)),knn_score)
for idx in range(len(knn_score)):
    plt.text(idx,knn_score[idx],round(knn_score[idx],2))
plt.xticks(range(2,30))
plt.show()
Notebook Image

Cross validation score method

In [39]:
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
In [40]:
knn1 = KNeighborsClassifier() # For default 5 neighbors
cvs = cross_val_score(knn1,d,t,cv=10,scoring="accuracy")
cvs.mean() # Mean score
Out[40]:
0.9666666666666668

Lets change various neighors numbers and evaluate the model by choosing optimal value ok K

In [41]:
k_range = range(1,40)
k_scores = []
for k in k_range:
    knn1 = KNeighborsClassifier(n_neighbors=k) # For default 5 neighbors
    cvs = cross_val_score(knn1,d,t,cv=10,scoring="accuracy")
    k_scores.append(cvs.mean())
print(k_scores)
[0.96, 0.9533333333333334, 0.9666666666666666, 0.9666666666666666, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9733333333333334, 0.9666666666666668, 0.9666666666666668, 0.9733333333333334, 0.9800000000000001, 0.9733333333333334, 0.9733333333333334, 0.9733333333333334, 0.9733333333333334, 0.9800000000000001, 0.9733333333333334, 0.9800000000000001, 0.9666666666666666, 0.9666666666666666, 0.9733333333333334, 0.96, 0.9666666666666666, 0.96, 0.9666666666666666, 0.9533333333333334, 0.9533333333333334, 0.9533333333333334, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9533333333333334]
In [42]:
# plt.figure(figsize=(16,4))
sns.lineplot(range(len(k_scores)),k_scores)
plt.xlabel("Values k for KNN")
plt.ylabel("Cross validated accuracy")
plt.show()
Notebook Image
In [43]:
np.mean(k_scores)
Out[43]:
0.9627350427350425

Using fixed value Knn = 20, from the aboev graph, where accuracy was maximum

In [44]:
knn_final = KNeighborsClassifier(n_neighbors=20)
cross_val_score(knn_final,d,t,cv=10,scoring="accuracy").mean()

Out[44]:
0.9800000000000001
In [45]:
import warnings
warnings.filterwarnings("ignore")
In [46]:
lr_final = LogisticRegression()
cross_val_score(lr_final,d,t,cv=10,scoring="accuracy").mean()

Out[46]:
0.9533333333333334

So Conclusion, is Knn (98% accuracy) is better choice over Logisctic regression (95% accuracy)

In [ ]:
 

Finding best parameter while tuning

In [47]:
param_grid = dict(n_neighbors = list(range(1,31)))
print(param_grid)
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}
In [48]:
knn2= KNeighborsClassifier()
lr3 = LogisticRegression()
gcv = GridSearchCV(knn2,param_grid=param_grid,cv=10,scoring="accuracy")
In [49]:
gcv
Out[49]:
GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [50]:
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
In [51]:
gcv.fit(d,t)
Out[51]:
GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

Best Score

In [52]:
print(gcv.best_score_)
print(gcv.best_estimator_)
print(gcv.best_params_)
0.98 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=13, p=2, weights='uniform') {'n_neighbors': 13}
In [53]:
print(gcv.cv_results_)
{'mean_fit_time': array([0.00080111, 0. , 0. , 0.00199997, 0.00080051, 0. , 0.00040014, 0.00120015, 0. , 0. , 0.00119622, 0.00080009, 0.00039961, 0. , 0.00040092, 0.00040004, 0.00039995, 0.00159965, 0.00039966, 0.00039978, 0.00120366, 0.00040033, 0.00079939, 0. , 0.00040004, 0. , 0.0003999 , 0.00080357, 0. , 0.00040002]), 'std_fit_time': array([0.00160222, 0. , 0. , 0.00199998, 0.00160103, 0. , 0.00120041, 0.00183326, 0. , 0. , 0.00182727, 0.00160017, 0.00119884, 0. , 0.00120277, 0.00120013, 0.00119984, 0.00195917, 0.00119898, 0.00119934, 0.00183863, 0.00120099, 0.00159879, 0. , 0.00120013, 0. , 0.0011997 , 0.00160716, 0. , 0.00120006]), 'mean_score_time': array([0.00239923, 0.00079975, 0.00240109, 0.00039973, 0.00199904, 0.00240006, 0.00159993, 0.00080061, 0.00160332, 0.00160172, 0.00120373, 0.0008014 , 0.00079951, 0.00199943, 0.00120029, 0.00159862, 0.00120001, 0.00040009, 0.00160019, 0.00080001, 0.00079947, 0.00120113, 0.00119901, 0.00120006, 0.00159991, 0.00159743, 0.00119932, 0.00120139, 0.00119915, 0.00159607]), 'std_score_time': array([0.001959 , 0.0015995 , 0.00196049, 0.0011992 , 0.00199905, 0.00195965, 0.00195951, 0.00160122, 0.00196368, 0.00196176, 0.00183874, 0.00160284, 0.00159903, 0.00199948, 0.00183354, 0.0019579 , 0.00183305, 0.00120027, 0.00195983, 0.00160003, 0.00159893, 0.00183477, 0.00183151, 0.0018332 , 0.00195955, 0.00195648, 0.00183199, 0.00183516, 0.00183174, 0.0019548 ]), 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], mask=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 2}, {'n_neighbors': 3}, {'n_neighbors': 4}, {'n_neighbors': 5}, {'n_neighbors': 6}, {'n_neighbors': 7}, {'n_neighbors': 8}, {'n_neighbors': 9}, {'n_neighbors': 10}, {'n_neighbors': 11}, {'n_neighbors': 12}, {'n_neighbors': 13}, {'n_neighbors': 14}, {'n_neighbors': 15}, {'n_neighbors': 16}, {'n_neighbors': 17}, {'n_neighbors': 18}, {'n_neighbors': 19}, {'n_neighbors': 20}, {'n_neighbors': 21}, {'n_neighbors': 22}, {'n_neighbors': 23}, {'n_neighbors': 24}, {'n_neighbors': 25}, {'n_neighbors': 26}, {'n_neighbors': 27}, {'n_neighbors': 28}, {'n_neighbors': 29}, {'n_neighbors': 30}], 'split0_test_score': array([1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.93333333, 1. , 0.93333333, 1. , 0.93333333, 1. , 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333]), 'split1_test_score': array([0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333]), 'split2_test_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'split3_test_score': array([0.93333333, 0.93333333, 0.93333333, 0.93333333, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.93333333, 0.93333333, 0.93333333]), 'split4_test_score': array([0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.93333333, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ]), 'split5_test_score': array([1. , 1. , 1. , 1. , 0.93333333, 0.93333333, 0.93333333, 0.86666667, 0.93333333, 0.86666667, 0.86666667, 0.93333333, 0.93333333, 0.86666667, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.86666667, 0.93333333, 0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667]), 'split6_test_score': array([0.86666667, 0.86666667, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333]), 'split7_test_score': array([1. , 0.93333333, 1. , 1. , 1. , 1. , 1. , 0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333, 1. , 1. , 1. , 0.93333333, 1. , 1. , 1. , 1. , 1. , 0.93333333, 1. , 0.93333333, 0.93333333, 0.93333333, 1. , 0.93333333, 0.93333333, 0.93333333]), 'split8_test_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'split9_test_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'mean_test_score': array([0.96 , 0.95333333, 0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.97333333, 0.96666667, 0.96666667, 0.97333333, 0.98 , 0.97333333, 0.97333333, 0.97333333, 0.97333333, 0.98 , 0.97333333, 0.98 , 0.96666667, 0.96666667, 0.97333333, 0.96 , 0.96666667, 0.96 , 0.96666667, 0.95333333, 0.95333333, 0.95333333]), 'std_test_score': array([0.05333333, 0.05206833, 0.04472136, 0.04472136, 0.04472136, 0.04472136, 0.04472136, 0.04472136, 0.03265986, 0.04472136, 0.04472136, 0.03265986, 0.0305505 , 0.04422166, 0.03265986, 0.03265986, 0.03265986, 0.0305505 , 0.03265986, 0.0305505 , 0.03333333, 0.03333333, 0.03265986, 0.04422166, 0.03333333, 0.04422166, 0.04472136, 0.04268749, 0.04268749, 0.04268749]), 'rank_test_score': array([24, 27, 12, 12, 12, 12, 12, 12, 4, 12, 12, 4, 1, 4, 4, 4, 4, 1, 4, 1, 12, 12, 4, 24, 12, 24, 12, 27, 27, 27])}
In [54]:
print(gcv.cv_results_["mean_test_score"])
[0.96 0.95333333 0.96666667 0.96666667 0.96666667 0.96666667 0.96666667 0.96666667 0.97333333 0.96666667 0.96666667 0.97333333 0.98 0.97333333 0.97333333 0.97333333 0.97333333 0.98 0.97333333 0.98 0.96666667 0.96666667 0.97333333 0.96 0.96666667 0.96 0.96666667 0.95333333 0.95333333 0.95333333]
mean score
In [55]:
print(gcv.cv_results_["mean_test_score"].mean())
0.9673333333333332
In [56]:
sns.lineplot(range(gcv.cv_results_["mean_test_score"].size),gcv.cv_results_["mean_test_score"])
Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x27c58b8d988>
Notebook Image

Finding best parameter while tuning, by addign some more parameter for grid

In [57]:
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
In [58]:
weight = ["uniform","distance"]
param_grid = dict(n_neighbors = list(range(1,31)),weights=weight)
print(param_grid)
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}
In [59]:
knn2= KNeighborsClassifier()
gcv = GridSearchCV(knn2,param_grid=param_grid,cv=10,scoring="accuracy")
In [60]:
gcv.fit(d,t)
Out[60]:
GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [61]:
print(gcv.best_score_)
print(gcv.best_estimator_)
print(gcv.best_params_)
0.98 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=13, p=2, weights='uniform') {'n_neighbors': 13, 'weights': 'uniform'}
In [62]:
gcv.best_index_
Out[62]:
24
In [63]:
# print(gcv.cv_results_)

Use best params derived from grid search to predict the test input

In [64]:
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
In [65]:
X_train, X_test, y_train, y_test = train_test_split(d,t,test_size=0.2, random_state=5)
In [66]:
knn_final = KNeighborsClassifier(n_neighbors=30,weights="uniform")
knn_final
Out[66]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=30, p=2,
                     weights='uniform')
In [67]:
knn_final.fit(d,t)
Out[67]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=30, p=2,
                     weights='uniform')
In [68]:
y_pred = knn_final.predict(X_test)
y_pred
Out[68]:
array([1, 2, 2, 0, 2, 1, 0, 1, 0, 1, 1, 2, 2, 2, 0, 0, 1, 2, 0, 0, 1, 2,
       0, 1, 1, 2, 1, 1, 1, 2])
In [69]:
accuracy_score(y_test,y_pred)
Out[69]:
0.9666666666666667
In [ ]:
 

RandomizedSearchCV

In [70]:
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
In [71]:
weight = ["uniform","distance"]
param_grid = dict(n_neighbors = list(range(1,31)),weights=weight)
print(param_grid)
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}
In [72]:
knn3= KNeighborsClassifier()
rcv = RandomizedSearchCV(knn3,param_distributions=param_grid,cv=10,n_iter=10,scoring="accuracy",random_state=5)
In [73]:
rcv.fit(d,t)
Out[73]:
RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19, 20,
                                                        21, 22, 23, 24, 25, 26,
                                                        27, 28, 29, 30],
                                        'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=5, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)
In [74]:
print(rcv.best_score_)
print(rcv.best_estimator_)
print(rcv.best_params_)
0.98 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=18, p=2, weights='uniform') {'weights': 'uniform', 'n_neighbors': 18}
In [75]:
# rcv.cv_results_

Finding best ML algorithm for training and testing using tpot classifier (AutoML)

In [1]:
from tpot import TPOTClassifier
In [2]:
tpc = TPOTClassifier(verbosity=2,max_time_mins=5,generations=5)
tpc
Out[2]:
TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=5, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)
In [740]:
tpc.fit(X_train,y_train)
Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.
HBox(children=(IntProgress(value=0, description='Optimization Progress', style=ProgressStyle(description_width…
Generation 1 - Current best internal CV score: 0.9916666666666668 Generation 2 - Current best internal CV score: 1.0 Generation 3 - Current best internal CV score: 1.0 Generation 4 - Current best internal CV score: 1.0 Generation 5 - Current best internal CV score: 1.0 Generation 6 - Current best internal CV score: 1.0 Generation 7 - Current best internal CV score: 1.0 5.01072235 minutes have elapsed. TPOT will close down. TPOT closed during evaluation in one generation. WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation. TPOT closed prematurely. Will use the current best pipeline. Best pipeline: ExtraTreesClassifier(ZeroCount(FastICA(input_matrix, tol=0.75)), bootstrap=True, criterion=entropy, max_features=0.45, min_samples_leaf=5, min_samples_split=3, n_estimators=100)
Out[740]:
TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=1000000,
               max_eval_time_mins=5, max_time_mins=5, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: