Jovian
⭐️
Sign In
In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
pd.options.mode.chained_assignment = None

import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn import metrics
In [37]:
def printInfo(data):
    print(type(data))
    print(data.shape)
    print("---------------------")
    print(data[:5])
In [38]:
df = pd.read_csv('api_call_hist_normalize/api_call_hist_normalize.csv')
printInfo(df)
<class 'pandas.core.frame.DataFrame'> (20330, 310) --------------------- Unnamed: 0 sha1 \ 0 0 0005e3eba4ccabb43cffcd101a1c2424ab8b08fb 1 1 0006a13f031d83a5a7b577ebe88b1f7e51aed1be 2 2 000b8ecf81f3b917b0dc243fb66c6b3e0b8f2ec2 3 3 000befa6a9ebb2d63e9fe9aa3812e1512a2f1321 4 4 000deee4291dcb91cee866eaa298f5a6abb26e5b label SetUnhandledExceptionFilter \ 0 Trojan:Win32/Fuerboos.C!cl 0.0 1 Trojan:Win32/Emotet!rfn 13.0 2 Trojan:Win32/Fuery.A!cl 5.0 3 VirTool:Win32/VBInject 0.0 4 Trojan:Win32/Emotet.AC!bit 0.0 UnhookWindowsHookEx accept bind closesocket \ 0 0.0 0.0 0.0 0.0 1 0.0 0.0 12.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 CoInternetSetFeatureEnabled connect ... CryptEnumProvidersW \ 0 0.0 0.0 ... 0.0 1 0.0 0.0 ... 0.0 2 0.0 0.0 ... 0.0 3 0.0 0.0 ... 0.0 4 0.0 0.0 ... 0.0 CryptExportKey CryptHashData CryptGenKey CryptImportPublicKeyInfo \ 0 0.0 0.0 0.0 0.0 1 0.0 5.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 HTTPSCertificateTrust HTTPSFinalProv CDocument_write \ 0 0.0 0.0 0.0 1 0.0 0.0 0.0 2 0.0 0.0 0.0 3 0.0 0.0 0.0 4 0.0 0.0 0.0 COleScript_ParseScriptText JsEval 0 0.0 0.0 1 0.0 0.0 2 0.0 0.0 3 0.0 0.0 4 0.0 0.0 [5 rows x 310 columns]
In [34]:
# get all label numbers on top 20
top_20 = df['label'].value_counts()[:20]
printInfo(top_20)
<class 'pandas.core.series.Series'> (20,) --------------------- Trojan:Win32/Emotet.AC!bit 4067 Trojan:Win32/Fuerboos.C!cl 1614 Trojan:Win32/Emotet!rfn 960 Trojan:Win32/Emotet.PA!MTB 873 Trojan:Win32/Emotet.LK!ml 576 Trojan:Win32/Occamy.C 519 Trojan:Win32/Dynamer!rfn 445 Trojan:Win32/Tiggre!plock 331 Trojan:Win32/Skeeyah.A!rfn 305 VirTool:Win32/VBInject 300 Trojan:Win32/Sonbokli.A!cl 279 Trojan:HTML/Brocoiner.A!lib 279 Trojan:Win32/Azden.A!cl 277 Trojan:Win32/Fuery.C!cl 270 Trojan:Win32/Fuerboos.A!cl 264 Trojan:Win32/MereTam.A 259 Trojan:Win32/Casdet!rfn 245 VirTool:Win32/VBInject.OX!bit 242 VirTool:Win32/VBInject.ACS!bit 218 Trojan:Win32/Emali.A!cl 218 Name: label, dtype: int64
In [29]:
print(top_20.index)
Index(['Trojan:Win32/Emotet.AC!bit', 'Trojan:Win32/Fuerboos.C!cl', 'Trojan:Win32/Emotet!rfn', 'Trojan:Win32/Emotet.PA!MTB', 'Trojan:Win32/Emotet.LK!ml', 'Trojan:Win32/Occamy.C', 'Trojan:Win32/Dynamer!rfn', 'Trojan:Win32/Tiggre!plock', 'Trojan:Win32/Skeeyah.A!rfn', 'VirTool:Win32/VBInject', 'Trojan:Win32/Sonbokli.A!cl', 'Trojan:HTML/Brocoiner.A!lib', 'Trojan:Win32/Azden.A!cl', 'Trojan:Win32/Fuery.C!cl', 'Trojan:Win32/Fuerboos.A!cl', 'Trojan:Win32/MereTam.A', 'Trojan:Win32/Casdet!rfn', 'VirTool:Win32/VBInject.OX!bit', 'VirTool:Win32/VBInject.ACS!bit', 'Trojan:Win32/Emali.A!cl'], dtype='object')
In [40]:
df_new = df[df['label'].isin(top_20.index)]
printInfo(df_new)
<class 'pandas.core.frame.DataFrame'> (12541, 310) --------------------- Unnamed: 0 sha1 \ 0 0 0005e3eba4ccabb43cffcd101a1c2424ab8b08fb 1 1 0006a13f031d83a5a7b577ebe88b1f7e51aed1be 3 3 000befa6a9ebb2d63e9fe9aa3812e1512a2f1321 4 4 000deee4291dcb91cee866eaa298f5a6abb26e5b 5 5 000e95c211a72cffc0f00879fe4170411e754a9d label SetUnhandledExceptionFilter \ 0 Trojan:Win32/Fuerboos.C!cl 0.0 1 Trojan:Win32/Emotet!rfn 13.0 3 VirTool:Win32/VBInject 0.0 4 Trojan:Win32/Emotet.AC!bit 0.0 5 Trojan:Win32/Fuerboos.C!cl 13.0 UnhookWindowsHookEx accept bind closesocket \ 0 0.0 0.0 0.0 0.0 1 0.0 0.0 12.0 0.0 3 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 5 0.0 0.0 13.0 1.0 CoInternetSetFeatureEnabled connect ... CryptEnumProvidersW \ 0 0.0 0.0 ... 0.0 1 0.0 0.0 ... 0.0 3 0.0 0.0 ... 0.0 4 0.0 0.0 ... 0.0 5 0.0 0.0 ... 0.0 CryptExportKey CryptHashData CryptGenKey CryptImportPublicKeyInfo \ 0 0.0 0.0 0.0 0.0 1 0.0 5.0 0.0 0.0 3 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 5 0.0 5.0 0.0 0.0 HTTPSCertificateTrust HTTPSFinalProv CDocument_write \ 0 0.0 0.0 0.0 1 0.0 0.0 0.0 3 0.0 0.0 0.0 4 0.0 0.0 0.0 5 0.0 0.0 0.0 COleScript_ParseScriptText JsEval 0 0.0 0.0 1 0.0 0.0 3 0.0 0.0 4 0.0 0.0 5 0.0 0.0 [5 rows x 310 columns]
In [41]:
X = df_new.loc[:, 'SetUnhandledExceptionFilter':'JsEval'].values
y_str = df_new.loc[:, 'label'].values
y_str_cleaned = [re.split('[\.\!]', labels.split(',')[-1])[0] for labels in y_str ]
In [64]:
print(type(y_str_cleaned))
print([[x,y_str_cleaned.count(x)] for x in set(y_str_cleaned)])
<class 'list'> [['Trojan:Win32/Tiggre', 331], ['Trojan:Win32/MereTam', 259], ['Trojan:Win32/Casdet', 245], ['Trojan:Win32/Fuery', 270], ['Trojan:Win32/Skeeyah', 305], ['Trojan:HTML/Brocoiner', 279], ['Trojan:Win32/Fuerboos', 1878], ['Trojan:Win32/Sonbokli', 279], ['Trojan:Win32/Dynamer', 445], ['Trojan:Win32/Azden', 277], ['Trojan:Win32/Occamy', 519], ['Trojan:Win32/Emali', 218], ['Trojan:Win32/Emotet', 6476], ['VirTool:Win32/VBInject', 760]]
In [42]:
# transfer string into index
label_encoder = LabelEncoder()
label_encoder.fit(y_str_cleaned)
y = label_encoder.transform(y_str_cleaned)
In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25)
In [45]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(9405, 307) (9405,) (3136, 307) (3136,)
In [48]:
cat_features = [0]
train_dataset = Pool(data = X_train, label = y_train, cat_features = cat_features)
test_dataset  = Pool(data = X_test,  label = y_test,  cat_features = cat_features)

model = CatBoostClassifier(iterations = 20, learning_rate=0.1, depth=5, loss_function='MultiClassOneVsAll', task_type = "GPU")

model.fit(train_dataset)
0: learn: -0.6116996 total: 23.4ms remaining: 444ms 1: learn: -0.5447600 total: 43.5ms remaining: 391ms 2: learn: -0.4893654 total: 66.4ms remaining: 377ms 3: learn: -0.4423707 total: 89ms remaining: 356ms 4: learn: -0.4020795 total: 110ms remaining: 331ms 5: learn: -0.3675314 total: 135ms remaining: 314ms 6: learn: -0.3376214 total: 159ms remaining: 295ms 7: learn: -0.3115849 total: 180ms remaining: 270ms 8: learn: -0.2889378 total: 204ms remaining: 249ms 9: learn: -0.2692023 total: 227ms remaining: 227ms 10: learn: -0.2517499 total: 252ms remaining: 206ms 11: learn: -0.2364758 total: 276ms remaining: 184ms 12: learn: -0.2229535 total: 298ms remaining: 160ms 13: learn: -0.2110398 total: 319ms remaining: 137ms 14: learn: -0.2005216 total: 340ms remaining: 113ms 15: learn: -0.1912187 total: 362ms remaining: 90.6ms 16: learn: -0.1828540 total: 384ms remaining: 67.8ms 17: learn: -0.1754283 total: 406ms remaining: 45.1ms 18: learn: -0.1690622 total: 431ms remaining: 22.7ms 19: learn: -0.1631846 total: 452ms remaining: 0us
Out[48]:
<catboost.core.CatBoostClassifier at 0x25cca66bc50>
In [49]:
print(model.get_best_score())

# https://stackoverflow.com/questions/51230062/mlogloss-value-in-catboost-starts-negative-and-increases
# https://catboost.ai/docs/concepts/loss-functions-multiclassification.html
{'learn': {'MultiClassOneVsAll': -0.16318460613536684}}
In [69]:
preds_raw = model.predict(test_dataset, prediction_type='Class')

print("data >>")
print(preds_raw)
print("-----------------------")
print("length >>",len(preds_raw))
print("data type >>", type(preds_raw))
print("shape >>", preds_raw.shape)


# confusion_matrix
# http://rasbt.github.io/mlxtend/user_guide/plotting/plot_confusion_matrix/
data >> [[5.] [5.] [6.] ... [5.] [6.] [6.]] ----------------------- length >> 3136 data type >> <class 'numpy.ndarray'> shape >> (3136, 1)
In [79]:
from sklearn.metrics import confusion_matrix

new_labels = [x for x in set(y_str_cleaned)]


arr_confusion_matrix = confusion_matrix(y_true = y_test, y_pred = preds_raw, labels=range(0,14))
print(arr_confusion_matrix)
[[ 66 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 1 0 0 0 0 13 48 0 0 6 0 0 0 0] [ 1 0 0 0 0 45 16 0 1 1 0 0 0 0] [ 0 0 0 2 0 70 31 0 0 7 0 0 0 4] [ 0 0 0 0 0 0 43 0 0 0 0 0 0 0] [ 1 0 0 0 0 1620 1 0 0 0 0 0 0 0] [ 1 0 0 0 0 107 369 0 0 21 0 0 0 0] [ 0 0 0 0 0 39 22 0 0 3 0 0 0 0] [ 0 0 0 0 0 0 0 0 63 0 0 0 0 0] [ 1 0 0 0 0 17 88 0 5 22 1 0 0 4] [ 0 0 0 0 0 49 7 0 0 1 6 2 0 0] [ 0 0 0 0 0 0 55 0 0 18 0 2 0 0] [ 0 0 0 0 0 8 47 0 1 6 2 0 2 1] [ 0 0 0 0 0 0 123 0 0 2 0 0 0 64]]
In [80]:
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

fig, ax = plot_confusion_matrix(conf_mat=arr_confusion_matrix,
                                colorbar=True,
                                show_absolute=False,
                                show_normed=True,
                               figsize=(20,20))
plt.show()
Notebook Image
In [77]:
plt.hist(preds_raw, bins='auto')
plt.show()
Notebook Image
In [ ]:
import jovian
jovian.commit()
[jovian] Saving notebook..