Learn practical skills, build real-world projects, and advance your career
Updated 3 years ago
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from matplotlib.colors import LogNorm
from sklearn.metrics import precision_recall_fscore_support
# Load the table and keep only rows without missing values.
input_file = '/home/adrian/Tug/Data/Table-Port-Results/FIRAU_Table.parquet.gzip'
df = pd.read_parquet(input_file).dropna()
# Discard rows with ship_type 90 and renumber the remaining rows from zero.
df = df[df.ship_type != 90].reset_index(drop=True)
# Simplify the problem: tug counts of 2 and 3 are relabelled as 1.
df.loc[df.tug_amount.isin([2, 3]), "tug_amount"] = 1
# Preprocessing
# Encode the hour as a point on the unit circle (hour_sin, hour_cos).
# The period is 24: dividing by 23 would map hour 0 and hour 23 onto the
# same point, making them indistinguishable.
# assumes hour takes integer values 0..23 — TODO confirm
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)
# Replace weekday names with numbers, Monday = 1 ... Sunday = 7.
for day_number, day_name in enumerate(
    ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
    start=1,
):
    df.loc[df['weekday'] == day_name, 'weekday'] = day_number
# Same circular encoding for the weekday (period 7).
df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'].astype(float) / 7.0)
df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'].astype(float) / 7.0)
# Circular encoding of wd_10min with period 360.
# presumably a direction in degrees — verify against the data source
df['wd_10min_sin'] = np.sin(2 * np.pi * df['wd_10min'].astype(float) / 360.0)
df['wd_10min_cos'] = np.cos(2 * np.pi * df['wd_10min'].astype(float) / 360.0)
# Build the model inputs: one-hot ship_type columns first, tug_amount as target.
X = pd.get_dummies(df.ship_type)
y = df["tug_amount"].values
# Append the continuous features, then the one-hot voyage_type and port columns.
X = X.join(df[["ws_10min", "wg_10min", "length", "breadth", "draught",
               "hour_sin", "hour_cos", "weekday_sin", "weekday_cos",
               "wd_10min_sin", "wd_10min_cos"]])
X = X.join(pd.get_dummies(df.voyage_type))
X = X.join(pd.get_dummies(df.port))
# Convert to a single float array. Plain `.values` on a frame that mixes
# boolean dummy columns with float columns yields an object-dtype array on
# pandas versions where get_dummies returns bool.
X = X.to_numpy(dtype=float)
# Hyper-parameter grid passed to the grid search; only tree depth is tuned.
# Further candidates that are currently switched off:
#   min_child_weight: [0.1, 0.2]
#   learning_rate:    [0.05, 0.10]
#   n_estimators:     [70, 110, 150, 190]
#   subsample:        [0.5, 0.7, 0.8]
parameters = dict(max_depth=[2, 3, 4, 5])
def nested_cv_for_xgboost(X, y):
    """Run nested cross-validation of an XGBoost classifier.

    The outer loop is a stratified 10-fold split. Inside every outer fold a
    stratified 3-fold grid search over the module-level ``parameters`` grid
    (scored by F1) is fitted on the training part, and the best estimator of
    that search predicts the held-out part.

    Args:
        X: Feature matrix that can be indexed with integer index arrays.
        y: Label array containing the classes 0 and 1, indexable the same way.

    Returns:
        A tuple ``(y_preds, y_tested)`` of two lists holding one array per
        outer fold: the predictions and the matching true labels, in that
        order.
    """
    y_preds = []
    y_tested = []
    cv = StratifiedKFold(n_splits=10)
    # Outer 10-fold CV data split; folds are reported as 1..10.
    for fold, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
        print("Calculating fold", fold, "out of 10", end="\r")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # The class-imbalance weight is derived from the training labels only,
        # so no information about the held-out fold reaches the model.
        negatives = np.count_nonzero(y_train == 0)
        positives = np.count_nonzero(y_train == 1)
        # NOTE(review): the recorded run output warns that scale_pos_weight
        # "might not be used" with this configuration — confirm whether the
        # 'binary:hinge' objective honours it.
        clf = XGBClassifier(
            use_label_encoder=False,
            objective='binary:hinge',
            eval_metric='aucpr',
            nthread=10,
            scale_pos_weight=negatives / positives,
            max_delta_step=1,
            n_estimators=140,
        )
        # Inner 3-fold grid search selects the hyper-parameters for this fold.
        gsearch = GridSearchCV(
            estimator=clf,
            param_grid=parameters,
            scoring="f1",
            n_jobs=-1,
            cv=StratifiedKFold(n_splits=3),
            verbose=True,
        )
        gsearch.fit(X_train, y_train)
        model = gsearch.best_estimator_
        y_preds.append(model.predict(X_test))
        y_tested.append(y_test)
    return y_preds, y_tested
# nested_cv_for_xgboost returns (predictions, true labels) in that order;
# unpacking them the other way round swaps the roles of the two arrays in
# every metric below.
y_preds, y_tested = nested_cv_for_xgboost(X, y)
y_preds = np.concatenate(y_preds)
y_tested = np.concatenate(y_tested)
# confusion_matrix(y_true, y_pred): rows are true labels, columns predictions.
cm = confusion_matrix(y_tested, y_preds)
print("Accuracy of classification is ", accuracy_score(y_tested, y_preds)*100, "%")
# confusion matrix
df_cm = pd.DataFrame(cm)
plt.figure(figsize=(8, 7))
# Logarithmic colour scale; vmin is shifted by one so that an empty cell
# does not produce a lower bound of zero.
ax = sns.heatmap(df_cm, annot=True,
                 cmap='PuBu_r', shading='auto', fmt='g',
                 norm=LogNorm(vmin=(df_cm.to_numpy().min()+1), vmax=df_cm.to_numpy().max()))
# Widen the y-limits by half a cell on each side.
# presumably a workaround for clipped heatmap rows — TODO confirm still needed
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")
ax.set_title("Confusion matrix")
print(precision_recall_fscore_support(y_tested, y_preds, average='weighted'))
# Per-label score: correctly predicted samples of a label divided by all
# samples that truly carry that label (row-wise over the confusion matrix).
print("Label 0 accuracy: ", cm[0][0] / (cm[0][0] + cm[0][1]) * 100, "%")
print("Label 1 accuracy: ", cm[1][1] / (cm[1][0] + cm[1][1]) * 100, "%")
print("Tug help = 0 :", len(df[df.tug_amount == 0]))
print("Tug help = 1 :", len(df[df.tug_amount == 1]))
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:36] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:38] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:39] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:41] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:43] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:45] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:47] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:49] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:51] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[13:48:53] WARNING: ../src/learner.cc:541:
Parameters: { scale_pos_weight } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Accuracy of classification is 95.37331701346389 %
(0.9547548008622274, 0.9537331701346389, 0.9541913077997374, None)
Label 0 accuracy: 97.67441860465115 %
Label 1 accuracy: 77.80126849894292 %
Tug help = 0 : 3612
Tug help = 1 : 473
# Save the notebook through the jovian package.
import jovian
jovian.commit()
[jovian] Attempting to save notebook..