import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
pd.options.mode.chained_assignment = None
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn import metrics
def printInfo(data):
    """Print a short summary of *data*.

    Writes four things to stdout, in order: the object's type, its
    ``shape`` attribute, a separator line, and the first five items
    obtained with ``data[:5]``.

    Args:
        data: Any object that exposes ``.shape`` and supports slicing;
            elsewhere in this file it is called with a pandas DataFrame
            and a pandas Series.
    """
    print(type(data))
    print(data.shape)
    print("---------------------")
    print(data[:5])
# Load the API-call histogram table from disk and show a quick summary of it.
csv_path = 'api_call_hist_normalize/api_call_hist_normalize.csv'
df = pd.read_csv(csv_path)
printInfo(df)
<class 'pandas.core.frame.DataFrame'>
(20330, 310)
---------------------
Unnamed: 0 sha1 \
0 0 0005e3eba4ccabb43cffcd101a1c2424ab8b08fb
1 1 0006a13f031d83a5a7b577ebe88b1f7e51aed1be
2 2 000b8ecf81f3b917b0dc243fb66c6b3e0b8f2ec2
3 3 000befa6a9ebb2d63e9fe9aa3812e1512a2f1321
4 4 000deee4291dcb91cee866eaa298f5a6abb26e5b
label SetUnhandledExceptionFilter \
0 Trojan:Win32/Fuerboos.C!cl 0.0
1 Trojan:Win32/Emotet!rfn 13.0
2 Trojan:Win32/Fuery.A!cl 5.0
3 VirTool:Win32/VBInject 0.0
4 Trojan:Win32/Emotet.AC!bit 0.0
UnhookWindowsHookEx accept bind closesocket \
0 0.0 0.0 0.0 0.0
1 0.0 0.0 12.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
CoInternetSetFeatureEnabled connect ... CryptEnumProvidersW \
0 0.0 0.0 ... 0.0
1 0.0 0.0 ... 0.0
2 0.0 0.0 ... 0.0
3 0.0 0.0 ... 0.0
4 0.0 0.0 ... 0.0
CryptExportKey CryptHashData CryptGenKey CryptImportPublicKeyInfo \
0 0.0 0.0 0.0 0.0
1 0.0 5.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
HTTPSCertificateTrust HTTPSFinalProv CDocument_write \
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
COleScript_ParseScriptText JsEval
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
[5 rows x 310 columns]
# Count how many samples carry each label, then keep the 20 most frequent
# labels (value_counts() returns the counts in descending order).
label_counts = df['label'].value_counts()
top_20 = label_counts[:20]
printInfo(top_20)
<class 'pandas.core.series.Series'>
(20,)
---------------------
Trojan:Win32/Emotet.AC!bit 4067
Trojan:Win32/Fuerboos.C!cl 1614
Trojan:Win32/Emotet!rfn 960
Trojan:Win32/Emotet.PA!MTB 873
Trojan:Win32/Emotet.LK!ml 576
Trojan:Win32/Occamy.C 519
Trojan:Win32/Dynamer!rfn 445
Trojan:Win32/Tiggre!plock 331
Trojan:Win32/Skeeyah.A!rfn 305
VirTool:Win32/VBInject 300
Trojan:Win32/Sonbokli.A!cl 279
Trojan:HTML/Brocoiner.A!lib 279
Trojan:Win32/Azden.A!cl 277
Trojan:Win32/Fuery.C!cl 270
Trojan:Win32/Fuerboos.A!cl 264
Trojan:Win32/MereTam.A 259
Trojan:Win32/Casdet!rfn 245
VirTool:Win32/VBInject.OX!bit 242
VirTool:Win32/VBInject.ACS!bit 218
Trojan:Win32/Emali.A!cl 218
Name: label, dtype: int64
# Show the label strings themselves (the index of the top-20 count Series).
print(top_20.index)
Index(['Trojan:Win32/Emotet.AC!bit', 'Trojan:Win32/Fuerboos.C!cl',
'Trojan:Win32/Emotet!rfn', 'Trojan:Win32/Emotet.PA!MTB',
'Trojan:Win32/Emotet.LK!ml', 'Trojan:Win32/Occamy.C',
'Trojan:Win32/Dynamer!rfn', 'Trojan:Win32/Tiggre!plock',
'Trojan:Win32/Skeeyah.A!rfn', 'VirTool:Win32/VBInject',
'Trojan:Win32/Sonbokli.A!cl', 'Trojan:HTML/Brocoiner.A!lib',
'Trojan:Win32/Azden.A!cl', 'Trojan:Win32/Fuery.C!cl',
'Trojan:Win32/Fuerboos.A!cl', 'Trojan:Win32/MereTam.A',
'Trojan:Win32/Casdet!rfn', 'VirTool:Win32/VBInject.OX!bit',
'VirTool:Win32/VBInject.ACS!bit', 'Trojan:Win32/Emali.A!cl'],
dtype='object')
# Keep only the rows whose label is one of the 20 most frequent labels.
is_top_20_label = df['label'].isin(top_20.index)
df_new = df[is_top_20_label]
printInfo(df_new)
<class 'pandas.core.frame.DataFrame'>
(12541, 310)
---------------------
Unnamed: 0 sha1 \
0 0 0005e3eba4ccabb43cffcd101a1c2424ab8b08fb
1 1 0006a13f031d83a5a7b577ebe88b1f7e51aed1be
3 3 000befa6a9ebb2d63e9fe9aa3812e1512a2f1321
4 4 000deee4291dcb91cee866eaa298f5a6abb26e5b
5 5 000e95c211a72cffc0f00879fe4170411e754a9d
label SetUnhandledExceptionFilter \
0 Trojan:Win32/Fuerboos.C!cl 0.0
1 Trojan:Win32/Emotet!rfn 13.0
3 VirTool:Win32/VBInject 0.0
4 Trojan:Win32/Emotet.AC!bit 0.0
5 Trojan:Win32/Fuerboos.C!cl 13.0
UnhookWindowsHookEx accept bind closesocket \
0 0.0 0.0 0.0 0.0
1 0.0 0.0 12.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
5 0.0 0.0 13.0 1.0
CoInternetSetFeatureEnabled connect ... CryptEnumProvidersW \
0 0.0 0.0 ... 0.0
1 0.0 0.0 ... 0.0
3 0.0 0.0 ... 0.0
4 0.0 0.0 ... 0.0
5 0.0 0.0 ... 0.0
CryptExportKey CryptHashData CryptGenKey CryptImportPublicKeyInfo \
0 0.0 0.0 0.0 0.0
1 0.0 5.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
5 0.0 5.0 0.0 0.0
HTTPSCertificateTrust HTTPSFinalProv CDocument_write \
0 0.0 0.0 0.0
1 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
5 0.0 0.0 0.0
COleScript_ParseScriptText JsEval
0 0.0 0.0
1 0.0 0.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
[5 rows x 310 columns]
from collections import Counter

# Feature matrix: every column from 'SetUnhandledExceptionFilter' through
# 'JsEval' (label-based slice, both ends inclusive), as a NumPy array.
X = df_new.loc[:, 'SetUnhandledExceptionFilter':'JsEval'].values
# Raw label strings, e.g. 'Trojan:Win32/Emotet.AC!bit'.
y_str = df_new.loc[:, 'label'].values

# Reduce each label to its family name: take the part after the last comma
# (if any) and cut it at the first '.' or '!'.  The pattern is a raw string
# because '\.' and '\!' are invalid escape sequences in a normal string
# literal and trigger a warning on current Python versions.
_variant_separator = re.compile(r'[.!]')
y_str_cleaned = [
    _variant_separator.split(raw_label.split(',')[-1])[0]
    for raw_label in y_str
]
print(type(y_str_cleaned))
# Tally samples per family in a single pass instead of calling list.count()
# once per distinct family.  Pairs are listed in order of first appearance.
print([[family, count] for family, count in Counter(y_str_cleaned).items()])
<class 'list'>
[['Trojan:Win32/Tiggre', 331], ['Trojan:Win32/MereTam', 259], ['Trojan:Win32/Casdet', 245], ['Trojan:Win32/Fuery', 270], ['Trojan:Win32/Skeeyah', 305], ['Trojan:HTML/Brocoiner', 279], ['Trojan:Win32/Fuerboos', 1878], ['Trojan:Win32/Sonbokli', 279], ['Trojan:Win32/Dynamer', 445], ['Trojan:Win32/Azden', 277], ['Trojan:Win32/Occamy', 519], ['Trojan:Win32/Emali', 218], ['Trojan:Win32/Emotet', 6476], ['VirTool:Win32/VBInject', 760]]
# Map every cleaned family name to an integer class index.
# fit() returns the encoder itself, so construction and fitting are chained.
label_encoder = LabelEncoder().fit(y_str_cleaned)
y = label_encoder.transform(y_str_cleaned)
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing.
#  - random_state pins the shuffle so reruns produce the same split and
#    therefore comparable metrics (42 is an arbitrary fixed seed).
#  - stratify=y keeps the class proportions equal in both parts; the classes
#    are heavily imbalanced, so a plain random split can under-represent the
#    small ones in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(9405, 307)
(9405,)
(3136, 307)
(3136,)
# Column indices that CatBoost is told to treat as categorical.
# NOTE(review): index 0 appears to be the first column of X, which looks like
# a numeric API-call count rather than a category - confirm this is intended.
cat_features = [0]

# Wrap the train and test splits in CatBoost Pool objects.
train_dataset = Pool(X_train, label=y_train, cat_features=cat_features)
test_dataset = Pool(X_test, label=y_test, cat_features=cat_features)

# 20 boosting iterations with the one-vs-all multiclass loss, run on the GPU.
model = CatBoostClassifier(
    task_type="GPU",
    loss_function='MultiClassOneVsAll',
    depth=5,
    learning_rate=0.1,
    iterations=20,
)
model.fit(train_dataset)
0: learn: -0.6116996 total: 23.4ms remaining: 444ms
1: learn: -0.5447600 total: 43.5ms remaining: 391ms
2: learn: -0.4893654 total: 66.4ms remaining: 377ms
3: learn: -0.4423707 total: 89ms remaining: 356ms
4: learn: -0.4020795 total: 110ms remaining: 331ms
5: learn: -0.3675314 total: 135ms remaining: 314ms
6: learn: -0.3376214 total: 159ms remaining: 295ms
7: learn: -0.3115849 total: 180ms remaining: 270ms
8: learn: -0.2889378 total: 204ms remaining: 249ms
9: learn: -0.2692023 total: 227ms remaining: 227ms
10: learn: -0.2517499 total: 252ms remaining: 206ms
11: learn: -0.2364758 total: 276ms remaining: 184ms
12: learn: -0.2229535 total: 298ms remaining: 160ms
13: learn: -0.2110398 total: 319ms remaining: 137ms
14: learn: -0.2005216 total: 340ms remaining: 113ms
15: learn: -0.1912187 total: 362ms remaining: 90.6ms
16: learn: -0.1828540 total: 384ms remaining: 67.8ms
17: learn: -0.1754283 total: 406ms remaining: 45.1ms
18: learn: -0.1690622 total: 431ms remaining: 22.7ms
19: learn: -0.1631846 total: 452ms remaining: 0us
<catboost.core.CatBoostClassifier at 0x25cca66bc50>
# Print the best metric value(s) recorded for the fitted model.
print(model.get_best_score())
# Background reading on the sign of the reported multiclass loss value
# (presumably why the number printed above is negative - verify in the links):
# https://stackoverflow.com/questions/51230062/mlogloss-value-in-catboost-starts-negative-and-increases
# https://catboost.ai/docs/concepts/loss-functions-multiclassification.html
{'learn': {'MultiClassOneVsAll': -0.16318460613536684}}
# Ask the model for a predicted class for every sample in the test pool.
preds_raw = model.predict(test_dataset, prediction_type='Class')
print("data >>")
print(preds_raw)
print("-----------------------")
# Report the number of predictions, their container type and the array shape.
for caption, detail in (
    ("length >>", len(preds_raw)),
    ("data type >>", type(preds_raw)),
    ("shape >>", preds_raw.shape),
):
    print(caption, detail)
# confusion_matrix
# http://rasbt.github.io/mlxtend/user_guide/plotting/plot_confusion_matrix/
data >>
[[5.]
[5.]
[6.]
...
[5.]
[6.]
[6.]]
-----------------------
length >> 3136
data type >> <class 'numpy.ndarray'>
shape >> (3136, 1)
from sklearn.metrics import confusion_matrix

# Family names in encoder order, so new_labels[i] is the name of encoded
# class i.  (Iterating a set, as before, yields an arbitrary order that does
# not line up with the encoded indices.)
new_labels = list(label_encoder.classes_)

# Rows are true classes, columns are predicted classes.  The label range is
# derived from the encoder instead of a hard-coded class count, so it stays
# correct if the set of families changes.
arr_confusion_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=preds_raw,
    labels=range(len(new_labels)),
)
print(arr_confusion_matrix)
[[ 66 0 0 0 0 0 0 0 0 0 0 0 0 0]
[ 1 0 0 0 0 13 48 0 0 6 0 0 0 0]
[ 1 0 0 0 0 45 16 0 1 1 0 0 0 0]
[ 0 0 0 2 0 70 31 0 0 7 0 0 0 4]
[ 0 0 0 0 0 0 43 0 0 0 0 0 0 0]
[ 1 0 0 0 0 1620 1 0 0 0 0 0 0 0]
[ 1 0 0 0 0 107 369 0 0 21 0 0 0 0]
[ 0 0 0 0 0 39 22 0 0 3 0 0 0 0]
[ 0 0 0 0 0 0 0 0 63 0 0 0 0 0]
[ 1 0 0 0 0 17 88 0 5 22 1 0 0 4]
[ 0 0 0 0 0 49 7 0 0 1 6 2 0 0]
[ 0 0 0 0 0 0 55 0 0 18 0 2 0 0]
[ 0 0 0 0 0 8 47 0 1 6 2 0 2 1]
[ 0 0 0 0 0 0 123 0 0 2 0 0 0 64]]
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

# Draw the confusion matrix on a 20x20 figure with a colour bar; absolute
# counts are switched off and normalized values are switched on.
confusion_plot_options = dict(
    colorbar=True,
    show_absolute=False,
    show_normed=True,
    figsize=(20, 20),
)
fig, ax = plot_confusion_matrix(
    conf_mat=arr_confusion_matrix,
    **confusion_plot_options
)
plt.show()

# Histogram of the predicted class values, with automatic bin selection.
plt.hist(preds_raw, bins='auto')
plt.show()
# Commit this notebook through the jovian package (the captured output below
# reads "Saving notebook..").
import jovian
jovian.commit()
[jovian] Saving notebook..