import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
%matplotlib inline
# Load the shot data set from the project CSV. The previous placeholder
# `df = pd.DataFrame()` was a dead store: read_csv builds the frame itself.
df = pd.read_csv('../project-kobe/data.csv')
# Render every column when a DataFrame is displayed instead of eliding some.
pd.options.display.max_columns = None
# `display` is supplied by the IPython/Jupyter session this script runs in.
display(df)
# Printing all the features with their values.
# Keep only the rows whose 'shot_made_flag' is present.
# NOTE(review): despite its name, `test_set` holds the rows that HAVE a
# recorded outcome, not a held-out evaluation split.
test_set = df.loc[df['shot_made_flag'].notna()]
test_set.shape
# Recorded output: (25697, 25)
# Count the rows in which 'shot_made_flag' is missing.
df['shot_made_flag'].isna().sum()
# The recorded run reported 5000 such rows; 'shot_made_flag' is the column
# that holds the end result of each shot.
# Recorded output: 5000
# combined_shot_type vs. shot_made_flag:
# mean of 'shot_made_flag' per combined shot type, highest first.
(
    test_set[['combined_shot_type', 'shot_made_flag']]
    .groupby(['combined_shot_type'], as_index=False)
    .mean()
    .sort_values(by='shot_made_flag', ascending=False)
)
# Same breakdown at the finer 'action_type' granularity.
(
    test_set[['action_type', 'shot_made_flag']]
    .groupby(['action_type'], as_index=False)
    .mean()
    .sort_values(by='shot_made_flag', ascending=False)
)
# Drop every row that has a missing value in any column.
# NOTE(review): an earlier draft limited this to subset=['shot_made_flag'];
# the unrestricted form used here also removes rows with gaps elsewhere.
DataWithoutNull = df.dropna(axis=0, how='any')
DataWithoutNull
# Select the rows that have a 'shot_made_flag' value and preview the
# identifier/label pair for them.
flagShot = df['shot_made_flag']
no_missing_values = df.loc[flagShot.notna()]
no_missing_values.filter(items=['shot_id', 'shot_made_flag'], axis=1)
# First rows of the full (unfiltered) frame.
df.head()
from sklearn import preprocessing
# Columns fed to the models as inputs.
# NOTE(review): 'shot_id' looks like a row identifier rather than a
# descriptive feature -- confirm it is meant to be a model input.
feature_updated = [
    'shot_id',
    'loc_x',
    'loc_y',
    'shot_distance',
    'seconds_remaining',
    'lon',
    'lat',
]
# Feature matrix and target taken from the null-free frame. Feature scaling
# (preprocessing.scale) is currently not applied.
X = DataWithoutNull.loc[:, feature_updated]
y = DataWithoutNull.loc[:, 'shot_made_flag']
y.head()
# Recorded output of y.head():
# 1 0.0
# 2 1.0
# 3 0.0
# 4 1.0
# 5 0.0
# Name: shot_made_flag, dtype: float64
# Hold out 5% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=6
)

# Name the solver explicitly. Leaving it unset raises a FutureWarning on
# scikit-learn < 0.22 and silently switches to 'lbfgs' from 0.22 onwards,
# which would change the fitted model. 'liblinear' is the pre-0.22 default,
# so the behaviour of the original run is preserved.
my_logreg = LogisticRegression(solver='liblinear', random_state=5)
my_decisiontree = DecisionTreeClassifier(random_state=5)

# Fit both classifiers on the training split.
my_logreg.fit(X_train, y_train)
my_decisiontree.fit(X_train, y_train)
# Recorded output (warning from LogisticRegression, then the repr of the fitted tree):
# //anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
# FutureWarning)
# DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
# max_features=None, max_leaf_nodes=None,
# min_impurity_decrease=0.0, min_impurity_split=None,
# min_samples_leaf=1, min_samples_split=2,
# min_weight_fraction_leaf=0.0, presort=False,
# random_state=5, splitter='best')
# testing
from sklearn.metrics import accuracy_score
# Accuracy on the held-out split for the two models fitted so far:
# "lr" = logistic regression, "dt" = decision tree.
y_predict_lr = my_logreg.predict(X_test)
score_lr = accuracy_score(y_test, y_predict_lr)

y_predict_dt = my_decisiontree.predict(X_test)
score_dt = accuracy_score(y_test, y_predict_dt)

# Logistic regression first, decision tree second.
print(score_lr)
print(score_dt)
# Recorded output:
# 0.6280155642023346
# 0.5455252918287937
# Two side-by-side scatter plots of the shot position columns.
plt.figure(figsize=(10, 10))

# Left panel: loc_x against loc_y for every row of df.
plt.subplot(1, 2, 1)
plt.scatter(df['loc_x'], df['loc_y'], color='red', alpha=0.01)
plt.title('loc_x and loc_y')

# Right panel: lon against lat, restricted to rows with a known outcome.
plt.subplot(1, 2, 2)
plt.scatter(no_missing_values['lon'], no_missing_values['lat'], color='green', alpha=0.01)
plt.title('lat and lon')
# Recorded output: Text(0.5, 1.0, 'lat and lon')
# Compare the positions of made and missed shots in two panels that share
# the same marker transparency.
alpha = 0.3
plt.figure(figsize=(15, 10))

# Left panel: rows where shot_made_flag == 1.
plt.subplot(1, 2, 1)
h = df[df['shot_made_flag'] == 1]
plt.scatter(h['loc_x'], h['loc_y'], color='green', alpha=alpha)
plt.title('Shots Made')

# Right panel: rows where shot_made_flag == 0.
plt.subplot(1, 2, 2)
h = df[df['shot_made_flag'] == 0]
plt.scatter(h['loc_x'], h['loc_y'], color='red', alpha=alpha)
plt.title('Shots missed')
# Recorded output: Text(0.5, 1.0, 'Shots missed')
#court_scale, alpha = 7, 0.05
#plt.figure(figsize=(2 * court_scale, court_scale*(84.0/50.0)))
# hit
#plt.subplot(121)
#h = df.loc[df.shot_made_flag == 1]
#plt.scatter(h.loc_x, h.loc_y, color='green', alpha=alpha)
#plt.title('Shots Made')
#ax = plt.gca()
#ax.set_ylim([-50, 900])
# miss
#plt.subplot(122)
#h = df.loc[df.shot_made_flag == 0]
#plt.scatter(h.loc_x, h.loc_y, color='red', alpha=alpha)
#plt.title('Shots missed')
#ax = plt.gca()
#ax.set_ylim([-50, 900])
#plt.savefig('shots_made_and_missed.png')
from sklearn.ensemble import RandomForestClassifier
# Random forest with 500 bootstrapped trees and a fixed seed, trained and
# scored on the same split as the other models.
my_RandomForest = RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    random_state=5,
)
my_RandomForest.fit(X_train, y_train)

y_predict_rf = my_RandomForest.predict(X_test)
score_rf = accuracy_score(y_test, y_predict_rf)
print(score_rf)
# Recorded output: 0.5945525291828794
from sklearn.neighbors import KNeighborsClassifier
# Build a k-nearest-neighbours classifier with k neighbours, fit it on the
# training split, and report its accuracy on the held-out split.
# NOTE(review): k is even and the target appears to be binary, so neighbour
# votes can tie -- confirm this choice is intended.
k = 4
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

y_predict_kn = knn.predict(X_test)
score_kn = accuracy_score(y_test, y_predict_kn)
print(score_kn)
# Recorded output: 0.601556420233463