import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, roc_auc_score
from sklearn.datasets import load_iris,load_diabetes
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter("ignore")
data = load_iris()
d = data.data
t = data.target
print(data.target_names)
['setosa' 'versicolor' 'virginica']
d.shape
(150, 4)
t.shape
(150,)
X_train, X_test, y_train, y_test = train_test_split(d,t,test_size=0.2, random_state=5)
lr = LogisticRegression()
lr
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
lr.fit(X_train,y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
y_pred = lr.predict(X_test)
y_pred
array([1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
0, 1, 1, 2, 1, 1, 1, 2])
y_test
array([1, 2, 2, 0, 2, 1, 0, 1, 0, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
0, 1, 1, 2, 1, 1, 1, 2])
accuracy_score(y_test,y_pred)
0.9
# accuracy by hand: fraction of matching labels
sum(y_test[i] == v for i, v in enumerate(y_pred)) / len(y_test)
0.9
# error rate: fraction of mismatches
sum(y_test[i] != v for i, v in enumerate(y_pred)) / len(y_test)
0.1
# or 1 - accuracy (floating-point rounding gives ~0.1)
1 - sum(y_test[i] == v for i, v in enumerate(y_pred)) / len(y_test)
0.09999999999999998
data.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
cm = confusion_matrix(y_test,y_pred)
cm
array([[ 8, 0, 0],
[ 0, 8, 3],
[ 0, 0, 11]], dtype=int64)
df = pd.DataFrame(d)
df.plot(kind="kde")
<matplotlib.axes._subplots.AxesSubplot at 0x27c582665c8>
df["target"] = data.target
df
sns.scatterplot(df.iloc[:,0],df.iloc[:,1],hue=df.target,palette="winter")
<matplotlib.axes._subplots.AxesSubplot at 0x27c587b3048>
# sns.distplot(df.iloc[:,0],hist=False,label=df.columns[0])
# sns.distplot(df.iloc[:,1],hist=False,label=df.columns[1])
# sns.distplot(df.iloc[:,2],hist=False,label=df.columns[2])
# sns.distplot(df.iloc[:,3],hist=False,label=df.columns[3])
sns.heatmap(cm,annot=True,cbar=False,cmap="winter",xticklabels=data.target_names,yticklabels=data.target_names,linewidths=.1)
plt.title("Confusion Matrix")
plt.ylabel("True_Labels")
plt.xlabel("Predicted_Labels")
Text(0.5, 15.0, 'Predicted_Labels')
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      0.73      0.84        11
           2       0.79      1.00      0.88        11

    accuracy                           0.90        30
   macro avg       0.93      0.91      0.91        30
weighted avg       0.92      0.90      0.90        30
precision_score(y_test,y_pred,average="macro")
0.9285714285714285
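The macro precision can be recovered by hand from the confusion matrix: each column sum is the number of predictions made for that class, so per-class precision is the diagonal divided by the column sums, and the macro average is their unweighted mean. A quick check using the cm computed above:
per_class_precision = cm.diagonal() / cm.sum(axis=0)   # [8/8, 8/8, 11/14]
per_class_precision.mean()                             # 0.9285714285714286, matching precision_score above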
data = load_iris()
d = data.data
t = data.target
X_train, X_test, y_train, y_test = train_test_split(d,t,test_size=0.2, random_state=5)
nb = MultinomialNB()
nb.fit(X_train,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_pred = nb.predict(X_test)
y_pred
array([1, 2, 2, 0, 2, 1, 0, 2, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
0, 1, 1, 2, 1, 1, 1, 2])
confusion_matrix(y_test,y_pred)
array([[ 8, 0, 0],
[ 0, 10, 1],
[ 0, 1, 10]], dtype=int64)
accuracy_score(y_test,y_pred)
0.9333333333333333
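MultinomialNB is really intended for count features such as word counts; for continuous measurements like the iris columns, GaussianNB is the more natural choice. A minimal sketch on the same split (the exact accuracy will depend on the split):
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
accuracy_score(y_test, gnb.predict(X_test))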
# note: this cross-validates only the 30-row test split (hence the old sklearn default of 3 small folds)
cv = cross_validate(lr,X_test,y_test)
cv
{'fit_time': array([0.00399613, 0. , 0.00399637]),
'score_time': array([0., 0., 0.]),
'test_score': array([0.81818182, 0.81818182, 0.875 ])}
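Cross-validation is usually run on the full dataset rather than the held-out split. A sketch with an explicit fold count and multiple metrics (metric names as accepted by sklearn's scoring API):
cv_full = cross_validate(lr, d, t, cv=5, scoring=["accuracy", "f1_macro"])
cv_full["test_accuracy"].mean(), cv_full["test_f1_macro"].mean()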
data = load_iris()
d = data.data
t = data.target
kf = KFold(n_splits=3)
# d = np.random.randint(-10,30,10)
# t = np.random.randint(-2,0,10)
gnr = kf.split(d,t)
# lst = list((gnr))
# lst
lr1 = LogisticRegression()
for train_idx,test_idx in gnr:
    X_train = d[train_idx]
    y_train = t[train_idx]
    X_test = d[train_idx]   # note: indexed with train_idx, so the scores below are measured on the training folds
    y_test = t[train_idx]
    lr1.fit(X_train,y_train)
    y_pred = lr1.predict(X_test)
    print("Accuracy score is::",accuracy_score(y_test,y_pred))
Accuracy score is:: 0.97
Accuracy score is:: 1.0
Accuracy score is:: 1.0
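As flagged in the comment, the loop above scores on the training folds, which is why the numbers are so high. A corrected sketch indexes with test_idx and shuffles the folds, since the iris rows are sorted by class and an unshuffled 3-way split leaves an entire class out of each training set:
kf = KFold(n_splits=3, shuffle=True, random_state=5)
for train_idx, test_idx in kf.split(d, t):
    lr1.fit(d[train_idx], t[train_idx])
    y_pred = lr1.predict(d[test_idx])    # evaluate on the held-out fold
    print("Accuracy score is::", accuracy_score(t[test_idx], y_pred))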
K fold
lr2 = LogisticRegression()
lr2_score = []
for kFold in range(2,30):
    kf = KFold(n_splits=kFold)
    gnr = kf.split(d,t)
    tmp_score = []
    for train_idx,test_idx in gnr:
        X_train = d[train_idx]
        y_train = t[train_idx]
        X_test = d[train_idx]   # same caveat as above: scored on the training folds
        y_test = t[train_idx]
        lr2.fit(X_train,y_train)
        y_pred = lr2.predict(X_test)
        tmp_score.append(accuracy_score(y_test,y_pred))
    lr2_score.append(sum(tmp_score)/len(tmp_score))
print("Final score values are::",lr2_score)
Final score values are:: [0.98, 0.9899999999999999, 0.9155736409608091, 0.9316666666666666, 0.9359999999999999, 0.9444732834994463, 0.9523623640990053, 0.9508098604720758, 0.9555555555555557, 0.9559848159569071, 0.9557680101555063, 0.9561127018117948, 0.955894875935986, 0.9552380952380951, 0.9573423252279638, 0.9587483180266415, 0.9592254076071877, 0.9603720963987827, 0.9610583078892937, 0.9603377842814462, 0.9609512961785689, 0.9612175264349176, 0.9611641252266252, 0.9613888888888888, 0.9615992484526968, 0.9620530012771396, 0.9629635331143953, 0.9630978332672745]
plt.figure(figsize=(16,4))
sns.lineplot(range(len(lr2_score)),lr2_score)
for idx in range(len(lr2_score)):
    plt.text(idx,lr2_score[idx],round(lr2_score[idx],2))
plt.xticks(range(2,30))
plt.show()
knn = KNeighborsClassifier()
knn_score = []
for kFold in range(2,30):
    kf = KFold(n_splits=kFold)
    gnr = kf.split(d,t)
    tmp_score = []
    for train_idx,test_idx in gnr:
        X_train = d[train_idx]
        y_train = t[train_idx]
        X_test = d[train_idx]   # same caveat: scored on the training folds
        y_test = t[train_idx]
        knn.fit(X_train,y_train)
        y_pred = knn.predict(X_test)
        tmp_score.append(accuracy_score(y_test,y_pred))
    knn_score.append(sum(tmp_score)/len(tmp_score))
# print("Final score values are::",knn_score)
plt.figure(figsize=(16,4))
sns.lineplot(range(len(knn_score)),knn_score)
for idx in range(len(knn_score)):
    plt.text(idx,knn_score[idx],round(knn_score[idx],2))
plt.xticks(range(2,30))
plt.show()
data = load_diabetes()   # note: the diabetes target is continuous, so this is really a regression problem
d = data.data
t = data.target
knn = KNeighborsClassifier()
knn_score = []
for kFold in range(2,30):
    kf = KFold(n_splits=kFold)
    gnr = kf.split(d,t)
    tmp_score = []
    for train_idx,test_idx in gnr:
        X_train = d[train_idx]
        y_train = t[train_idx]
        X_test = d[train_idx]   # same caveat: scored on the training folds
        y_test = t[train_idx]
        knn.fit(X_train,y_train)
        y_pred = knn.predict(X_test)
        tmp_score.append(accuracy_score(y_test,y_pred))
    knn_score.append(sum(tmp_score)/len(tmp_score))
# print("Final score values are::",knn_score)
plt.figure(figsize=(16,4))
sns.lineplot(range(len(knn_score)),knn_score)
for idx in range(len(knn_score)):
    plt.text(idx,knn_score[idx],round(knn_score[idx],2))
plt.xticks(range(2,30))
plt.show()
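Because load_diabetes has a continuous target, classification accuracy is close to meaningless here: predicted and true values almost never match exactly. A regression treatment fits better; a minimal sketch with KNeighborsRegressor scored by R² instead of accuracy:
from sklearn.neighbors import KNeighborsRegressor

knr = KNeighborsRegressor()
cross_val_score(knr, d, t, cv=5, scoring="r2").mean()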
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
knn1 = KNeighborsClassifier() # For default 5 neighbors
cvs = cross_val_score(knn1,d,t,cv=10,scoring="accuracy")
cvs.mean() # Mean score
0.9666666666666668
Choosing K
k_range = range(1,40)
k_scores = []
for k in k_range:
    knn1 = KNeighborsClassifier(n_neighbors=k)   # k neighbors instead of the default 5
    cvs = cross_val_score(knn1,d,t,cv=10,scoring="accuracy")
    k_scores.append(cvs.mean())
print(k_scores)
[0.96, 0.9533333333333334, 0.9666666666666666, 0.9666666666666666, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9733333333333334, 0.9666666666666668, 0.9666666666666668, 0.9733333333333334, 0.9800000000000001, 0.9733333333333334, 0.9733333333333334, 0.9733333333333334, 0.9733333333333334, 0.9800000000000001, 0.9733333333333334, 0.9800000000000001, 0.9666666666666666, 0.9666666666666666, 0.9733333333333334, 0.96, 0.9666666666666666, 0.96, 0.9666666666666666, 0.9533333333333334, 0.9533333333333334, 0.9533333333333334, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9466666666666667, 0.9533333333333334]
# plt.figure(figsize=(16,4))
sns.lineplot(range(len(k_scores)),k_scores)
plt.xlabel("Values k for KNN")
plt.ylabel("Cross validated accuracy")
plt.show()
np.mean(k_scores)
0.9627350427350425
knn_final = KNeighborsClassifier(n_neighbors=20)
cross_val_score(knn_final,d,t,cv=10,scoring="accuracy").mean()
0.9800000000000001
import warnings
warnings.filterwarnings("ignore")
lr_final = LogisticRegression()
cross_val_score(lr_final,d,t,cv=10,scoring="accuracy").mean()
0.9533333333333334
KNN (98% accuracy) is the better choice over logistic regression (95% accuracy).
param_grid = dict(n_neighbors = list(range(1,31)))
print(param_grid)
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}
knn2= KNeighborsClassifier()
lr3 = LogisticRegression()
gcv = GridSearchCV(knn2,param_grid=param_grid,cv=10,scoring="accuracy")
gcv
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='warn', n_jobs=None,
param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=0)
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
gcv.fit(d,t)
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='warn', n_jobs=None,
param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=0)
print(gcv.best_score_)
print(gcv.best_estimator_)
print(gcv.best_params_)
0.98
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=13, p=2,
weights='uniform')
{'n_neighbors': 13}
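Because refit=True by default, the fitted GridSearchCV object itself acts as the best estimator and can predict directly. For example (the sample measurement here is just illustrative):
gcv.predict([[5.1, 3.5, 1.4, 0.2]])   # class predicted by the refitted best model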
print(gcv.cv_results_)
{'mean_fit_time': array([0.00080111, 0. , 0. , 0.00199997, 0.00080051,
0. , 0.00040014, 0.00120015, 0. , 0. ,
0.00119622, 0.00080009, 0.00039961, 0. , 0.00040092,
0.00040004, 0.00039995, 0.00159965, 0.00039966, 0.00039978,
0.00120366, 0.00040033, 0.00079939, 0. , 0.00040004,
0. , 0.0003999 , 0.00080357, 0. , 0.00040002]), 'std_fit_time': array([0.00160222, 0. , 0. , 0.00199998, 0.00160103,
0. , 0.00120041, 0.00183326, 0. , 0. ,
0.00182727, 0.00160017, 0.00119884, 0. , 0.00120277,
0.00120013, 0.00119984, 0.00195917, 0.00119898, 0.00119934,
0.00183863, 0.00120099, 0.00159879, 0. , 0.00120013,
0. , 0.0011997 , 0.00160716, 0. , 0.00120006]), 'mean_score_time': array([0.00239923, 0.00079975, 0.00240109, 0.00039973, 0.00199904,
0.00240006, 0.00159993, 0.00080061, 0.00160332, 0.00160172,
0.00120373, 0.0008014 , 0.00079951, 0.00199943, 0.00120029,
0.00159862, 0.00120001, 0.00040009, 0.00160019, 0.00080001,
0.00079947, 0.00120113, 0.00119901, 0.00120006, 0.00159991,
0.00159743, 0.00119932, 0.00120139, 0.00119915, 0.00159607]), 'std_score_time': array([0.001959 , 0.0015995 , 0.00196049, 0.0011992 , 0.00199905,
0.00195965, 0.00195951, 0.00160122, 0.00196368, 0.00196176,
0.00183874, 0.00160284, 0.00159903, 0.00199948, 0.00183354,
0.0019579 , 0.00183305, 0.00120027, 0.00195983, 0.00160003,
0.00159893, 0.00183477, 0.00183151, 0.0018332 , 0.00195955,
0.00195648, 0.00183199, 0.00183516, 0.00183174, 0.0019548 ]), 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 2}, {'n_neighbors': 3}, {'n_neighbors': 4}, {'n_neighbors': 5}, {'n_neighbors': 6}, {'n_neighbors': 7}, {'n_neighbors': 8}, {'n_neighbors': 9}, {'n_neighbors': 10}, {'n_neighbors': 11}, {'n_neighbors': 12}, {'n_neighbors': 13}, {'n_neighbors': 14}, {'n_neighbors': 15}, {'n_neighbors': 16}, {'n_neighbors': 17}, {'n_neighbors': 18}, {'n_neighbors': 19}, {'n_neighbors': 20}, {'n_neighbors': 21}, {'n_neighbors': 22}, {'n_neighbors': 23}, {'n_neighbors': 24}, {'n_neighbors': 25}, {'n_neighbors': 26}, {'n_neighbors': 27}, {'n_neighbors': 28}, {'n_neighbors': 29}, {'n_neighbors': 30}], 'split0_test_score': array([1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 0.93333333,
1. , 0.93333333, 1. , 0.93333333, 1. ,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333]), 'split1_test_score': array([0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333]), 'split2_test_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'split3_test_score': array([0.93333333, 0.93333333, 0.93333333, 0.93333333, 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 0.93333333, 0.93333333, 0.93333333]), 'split4_test_score': array([0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667,
0.86666667, 0.86666667, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
0.93333333, 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]), 'split5_test_score': array([1. , 1. , 1. , 1. , 0.93333333,
0.93333333, 0.93333333, 0.86666667, 0.93333333, 0.86666667,
0.86666667, 0.93333333, 0.93333333, 0.86666667, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.86666667, 0.93333333,
0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667]), 'split6_test_score': array([0.86666667, 0.86666667, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 0.93333333, 0.93333333, 0.93333333]), 'split7_test_score': array([1. , 0.93333333, 1. , 1. , 1. ,
1. , 1. , 0.93333333, 0.93333333, 0.93333333,
0.93333333, 0.93333333, 1. , 1. , 1. ,
0.93333333, 1. , 1. , 1. , 1. ,
1. , 0.93333333, 1. , 0.93333333, 0.93333333,
0.93333333, 1. , 0.93333333, 0.93333333, 0.93333333]), 'split8_test_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'split9_test_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'mean_test_score': array([0.96 , 0.95333333, 0.96666667, 0.96666667, 0.96666667,
0.96666667, 0.96666667, 0.96666667, 0.97333333, 0.96666667,
0.96666667, 0.97333333, 0.98 , 0.97333333, 0.97333333,
0.97333333, 0.97333333, 0.98 , 0.97333333, 0.98 ,
0.96666667, 0.96666667, 0.97333333, 0.96 , 0.96666667,
0.96 , 0.96666667, 0.95333333, 0.95333333, 0.95333333]), 'std_test_score': array([0.05333333, 0.05206833, 0.04472136, 0.04472136, 0.04472136,
0.04472136, 0.04472136, 0.04472136, 0.03265986, 0.04472136,
0.04472136, 0.03265986, 0.0305505 , 0.04422166, 0.03265986,
0.03265986, 0.03265986, 0.0305505 , 0.03265986, 0.0305505 ,
0.03333333, 0.03333333, 0.03265986, 0.04422166, 0.03333333,
0.04422166, 0.04472136, 0.04268749, 0.04268749, 0.04268749]), 'rank_test_score': array([24, 27, 12, 12, 12, 12, 12, 12, 4, 12, 12, 4, 1, 4, 4, 4, 4,
1, 4, 1, 12, 12, 4, 24, 12, 24, 12, 27, 27, 27])}
print(gcv.cv_results_["mean_test_score"])
[0.96 0.95333333 0.96666667 0.96666667 0.96666667 0.96666667
0.96666667 0.96666667 0.97333333 0.96666667 0.96666667 0.97333333
0.98 0.97333333 0.97333333 0.97333333 0.97333333 0.98
0.97333333 0.98 0.96666667 0.96666667 0.97333333 0.96
0.96666667 0.96 0.96666667 0.95333333 0.95333333 0.95333333]
print(gcv.cv_results_["mean_test_score"].mean())
0.9673333333333332
sns.lineplot(range(gcv.cv_results_["mean_test_score"].size),gcv.cv_results_["mean_test_score"])
<matplotlib.axes._subplots.AxesSubplot at 0x27c58b8d988>
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
weight = ["uniform","distance"]
param_grid = dict(n_neighbors = list(range(1,31)),weights=weight)
print(param_grid)
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}
knn2= KNeighborsClassifier()
gcv = GridSearchCV(knn2,param_grid=param_grid,cv=10,scoring="accuracy")
gcv.fit(d,t)
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='warn', n_jobs=None,
param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30],
'weights': ['uniform', 'distance']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=0)
print(gcv.best_score_)
print(gcv.best_estimator_)
print(gcv.best_params_)
0.98
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=13, p=2,
weights='uniform')
{'n_neighbors': 13, 'weights': 'uniform'}
gcv.best_index_
24
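best_index_ is the row of cv_results_ belonging to the winning candidate, so it can be used to pull out any per-candidate detail:
gcv.cv_results_["params"][gcv.best_index_]           # -> {'n_neighbors': 13, 'weights': 'uniform'}
gcv.cv_results_["std_test_score"][gcv.best_index_]   # spread of the winning score across folds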
# print(gcv.cv_results_)
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
X_train, X_test, y_train, y_test = train_test_split(d,t,test_size=0.2, random_state=5)
knn_final = KNeighborsClassifier(n_neighbors=30,weights="uniform")
knn_final
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=30, p=2,
weights='uniform')
knn_final.fit(d,t)   # note: fitted on the full dataset, so the test rows below were already seen in training
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=30, p=2,
weights='uniform')
y_pred = knn_final.predict(X_test)
y_pred
array([1, 2, 2, 0, 2, 1, 0, 1, 0, 1, 1, 2, 2, 2, 0, 0, 1, 2, 0, 0, 1, 2,
0, 1, 1, 2, 1, 1, 1, 2])
accuracy_score(y_test,y_pred)
0.9666666666666667
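Because the model above was fitted on the full dataset, the 0.967 here is optimistic: every test row was also a training row. A cleaner sketch fits on the training split only, using the best parameters found by the grid search (the resulting accuracy will differ from the number above):
knn_clean = KNeighborsClassifier(n_neighbors=13, weights="uniform")
knn_clean.fit(X_train, y_train)
accuracy_score(y_test, knn_clean.predict(X_test))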
# Iris dataSet
data = load_iris()
d = data.data
t = data.target
weight = ["uniform","distance"]
param_grid = dict(n_neighbors = list(range(1,31)),weights=weight)
print(param_grid)
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}
knn3= KNeighborsClassifier()
rcv = RandomizedSearchCV(knn3,param_distributions=param_grid,cv=10,n_iter=10,scoring="accuracy",random_state=5)
rcv.fit(d,t)
RandomizedSearchCV(cv=10, error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto',
leaf_size=30,
metric='minkowski',
metric_params=None,
n_jobs=None, n_neighbors=5,
p=2, weights='uniform'),
iid='warn', n_iter=10, n_jobs=None,
param_distributions={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26,
27, 28, 29, 30],
'weights': ['uniform', 'distance']},
pre_dispatch='2*n_jobs', random_state=5, refit=True,
return_train_score=False, scoring='accuracy', verbose=0)
print(rcv.best_score_)
print(rcv.best_estimator_)
print(rcv.best_params_)
0.98
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=18, p=2,
weights='uniform')
{'weights': 'uniform', 'n_neighbors': 18}
# rcv.cv_results_
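RandomizedSearchCV samples n_iter parameter settings rather than trying every combination, so it also accepts scipy distributions in place of exhaustive lists, which is where it pays off on large search spaces. A sketch, assuming scipy is available:
from scipy.stats import randint

rcv2 = RandomizedSearchCV(KNeighborsClassifier(),
                          param_distributions={"n_neighbors": randint(1, 31),
                                               "weights": ["uniform", "distance"]},
                          n_iter=10, cv=10, scoring="accuracy", random_state=5)
rcv2.fit(d, t)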
from tpot import TPOTClassifier
tpc = TPOTClassifier(verbosity=2,max_time_mins=5,generations=5)
tpc
TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
disable_update_check=False, early_stop=None, generations=5,
max_eval_time_mins=5, max_time_mins=5, memory=None,
mutation_rate=0.9, n_jobs=1, offspring_size=None,
periodic_checkpoint_folder=None, population_size=100,
random_state=None, scoring=None, subsample=1.0, template=None,
use_dask=False, verbosity=2, warm_start=False)
tpc.fit(X_train,y_train)
Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.
Generation 1 - Current best internal CV score: 0.9916666666666668
Generation 2 - Current best internal CV score: 1.0
Generation 3 - Current best internal CV score: 1.0
Generation 4 - Current best internal CV score: 1.0
Generation 5 - Current best internal CV score: 1.0
Generation 6 - Current best internal CV score: 1.0
Generation 7 - Current best internal CV score: 1.0
5.01072235 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.
TPOT closed prematurely. Will use the current best pipeline.
Best pipeline: ExtraTreesClassifier(ZeroCount(FastICA(input_matrix, tol=0.75)), bootstrap=True, criterion=entropy, max_features=0.45, min_samples_leaf=5, min_samples_split=3, n_estimators=100)
TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
disable_update_check=False, early_stop=None, generations=1000000,
max_eval_time_mins=5, max_time_mins=5, memory=None,
mutation_rate=0.9, n_jobs=1, offspring_size=None,
periodic_checkpoint_folder=None, population_size=100,
random_state=None, scoring=None, subsample=1.0, template=None,
use_dask=False, verbosity=2, warm_start=False)
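The fitted TPOT object scores like any sklearn estimator and can export its winning pipeline as a standalone script; a minimal sketch (the filename is just a placeholder):
print(tpc.score(X_test, y_test))      # accuracy of the best pipeline on the held-out split
tpc.export("tpot_best_pipeline.py")   # writes the pipeline code to a file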