
Kobe Bryant Shot Selection

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
%matplotlib inline 
In [23]:

# load the shot data and display every column
df = pd.read_csv('../project-kobe/data.csv')
pd.options.display.max_columns = None
display(df)


In [24]:
# keep the rows where shot_made_flag is labeled
# (named test_set here, though it is really the labeled subset of the data)
test_set = df[df['shot_made_flag'].notnull()]
test_set.shape
Out[24]:
(25697, 25)
In [25]:
# count the rows with a missing shot_made_flag
df['shot_made_flag'].isnull().sum()

# 5,000 rows have no value for shot_made_flag, the target column we want to predict

Out[25]:
5000
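These 5,000 unlabeled shots are the ones whose outcome the model would ultimately predict (in the Kaggle competition this dataset comes from, they form the submission set). A minimal sketch of separating them out; submission_set is a name introduced here:
In [ ]:
# the rows with no label: the shots whose outcome we would predict
submission_set = df[df['shot_made_flag'].isnull()]
submission_set.shape  # expected: (5000, 25)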
In [26]:
# combined_shot_type vs. shot_made_flag:
# shooting percentage per shot type, via the mean of the 0/1 flag
test_set[['combined_shot_type', 'shot_made_flag']].groupby(['combined_shot_type'],
    as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)
Out[26]:
In [27]:
# the same breakdown for the finer-grained action_type
test_set[['action_type', 'shot_made_flag']].groupby(['action_type'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)
Out[27]:
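The grouped means above are easier to compare as a chart. A minimal sketch plotting the same combined_shot_type breakdown; shot_eff is a name introduced here:
In [ ]:
# shooting percentage by combined shot type, as a bar chart
shot_eff = (test_set.groupby('combined_shot_type')['shot_made_flag']
                    .mean()
                    .sort_values(ascending=False))
shot_eff.plot(kind='bar', figsize=(8, 4), title='Accuracy by combined_shot_type')
plt.ylabel('shooting percentage')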
In [28]:

# drop the rows that contain missing values, i.e. the unlabeled shots
# (alternative: df.dropna(subset=['shot_made_flag']) to target only the label column)

DataWithoutNull = df.dropna()
DataWithoutNull
Out[28]:
In [29]:
# another view of the labeled rows: shot_id alongside shot_made_flag
flagShot = df['shot_made_flag']
no_missing_values = df[pd.notnull(flagShot)]
no_missing_values.filter(items=['shot_id', 'shot_made_flag'])
Out[29]:
In [30]:
df.head()
Out[30]:
In [58]:
from sklearn import preprocessing

# features used for modeling; shot_made_flag is the target
# (note: shot_id is just a row identifier and carries little real signal)
feature_updated = ['shot_id', 'loc_x', 'loc_y', 'shot_distance', 'seconds_remaining', 'lon', 'lat']
X = DataWithoutNull[feature_updated]
y = DataWithoutNull['shot_made_flag']

# X = preprocessing.scale(X)  # optional standardization, sketched below
y.head()
Out[58]:
1    0.0
2    1.0
3    0.0
4    1.0
5    0.0
Name: shot_made_flag, dtype: float64
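The scaling step is left commented out above. Distance-based models such as KNN (used below) are sensitive to feature magnitude, so here is a sketch of what applying it would look like; X_scaled is a name introduced here:
In [ ]:
# standardize each feature to zero mean / unit variance;
# this mainly helps KNN and logistic regression, tree models are unaffected
X_scaled = preprocessing.scale(X)
# X_scaled could then replace X in the train_test_split below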
In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=6)
In [72]:

# liblinear was the default solver at the time; naming it explicitly silences the FutureWarning
my_logreg = LogisticRegression(solver='liblinear', random_state=5)

my_decisiontree = DecisionTreeClassifier(random_state=5)


In [73]:
my_logreg.fit(X_train, y_train)

my_decisiontree.fit(X_train, y_train)
Out[73]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=5, splitter='best')
In [74]:
# testing


from sklearn.metrics import accuracy_score

y_predict_lr = my_logreg.predict(X_test)        # lr: logistic regression
y_predict_dt = my_decisiontree.predict(X_test)  # dt: decision tree

score_lr = accuracy_score(y_test, y_predict_lr)
score_dt = accuracy_score(y_test, y_predict_dt)

print(score_lr)
print(score_dt)   # accuracies of logistic regression and the decision tree (KNN follows below)

0.6280155642023346
0.5455252918287937
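A 5% hold-out is only about 1,300 shots, so the scores above are noisy. cross_val_score was imported at the top but never used; a sketch of a steadier 5-fold estimate on the same models:
In [ ]:
# 5-fold cross-validation averages over several splits instead of one small hold-out
cv_lr = cross_val_score(my_logreg, X, y, cv=5, scoring='accuracy')
cv_dt = cross_val_score(my_decisiontree, X, y, cv=5, scoring='accuracy')
print(cv_lr.mean(), cv_dt.mean())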
In [75]:
plt.figure(figsize=(10,10))

# plot shot locations twice: court coordinates (loc_x, loc_y) and geographic (lon, lat)
plt.subplot(121)
plt.scatter(df.loc_x, df.loc_y, color='red', alpha=0.01)
plt.title('loc_x and loc_y')

plt.subplot(122)
plt.scatter(no_missing_values.lon, no_missing_values.lat, color='green', alpha=0.01)
plt.title('lat and lon')


Out[75]:
Text(0.5, 1.0, 'lat and lon')
[Figure: side-by-side scatter plots of shot locations in loc_x/loc_y and lon/lat coordinates]
In [76]:
alpha = 0.3
plt.figure(figsize=(15,10))
# hit
plt.subplot(121)
h = df.loc[df.shot_made_flag == 1]
plt.scatter(h.loc_x, h.loc_y, color='green', alpha=alpha)
plt.title('Shots Made')
#ax = plt.gca()
#ax.set_ylim([-50, 900])


plt.subplot(122)
h = df.loc[df.shot_made_flag == 0]
plt.scatter(h.loc_x, h.loc_y, color='red', alpha=alpha)
plt.title('Shots missed')
Out[76]:
Text(0.5, 1.0, 'Shots missed')
[Figure: scatter of made shots (green, left) vs. missed shots (red, right)]
The missed-shots plot is darker because Kobe missed more shots than he made.
The missed shots also spread over a wider area, since misses tend to come from longer distances.
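Both claims can be checked numerically on the labeled rows:
In [ ]:
# overall make rate; a value below 0.5 means more misses than makes
print(test_set['shot_made_flag'].mean())
# average shot distance for misses (0.0) vs. makes (1.0)
print(test_set.groupby('shot_made_flag')['shot_distance'].mean())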
In [78]:
from sklearn.ensemble import RandomForestClassifier
# random forest with 500 bootstrapped trees
my_RandomForest = RandomForestClassifier(n_estimators=500, bootstrap=True, random_state=5)
my_RandomForest.fit(X_train, y_train)
y_predict_rf = my_RandomForest.predict(X_test)
In [79]:
score_rf = accuracy_score(y_test, y_predict_rf)
print(score_rf)
0.5945525291828794
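Random forests expose per-feature importances, which show how much each of the chosen columns contributes to the splits; a quick sketch using the fitted model:
In [ ]:
# relative importance of each feature in the fitted forest
for name, imp in zip(feature_updated, my_RandomForest.feature_importances_):
    print(f'{name}: {imp:.3f}')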
In [80]:
from sklearn.neighbors import KNeighborsClassifier

# instantiate a KNeighborsClassifier with k = 4 neighbors
k = 4
knn = KNeighborsClassifier(n_neighbors=k)

knn.fit(X_train, y_train)
y_predict_kn = knn.predict(X_test)
score_kn = accuracy_score(y_test, y_predict_kn)
print(score_kn)

0.601556420233463
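k = 4 was picked by hand; a sketch of scanning several odd values of k on the same hold-out to see whether another choice does better (model is a name introduced here):
In [ ]:
# compare hold-out accuracy across a range of k values
for k in range(1, 22, 2):
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(k, accuracy_score(y_test, model.predict(X_test)))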