Learn practical skills, build real-world projects, and advance your career
Updated 5 years ago
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
pd.options.mode.chained_assignment = None
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn import metrics
def printInfo(data):
    """Print a short summary of *data* to stdout.

    Emits, in order: the object's type, its ``shape`` attribute, a dashed
    separator line, and the first five items obtained with ``data[:5]``.

    Args:
        data: Any object that exposes ``.shape`` and supports slicing,
            e.g. a pandas DataFrame or Series.
    """
    print(type(data))
    print(data.shape)
    print("---------------------")
    # Slicing (rather than .head()) keeps this usable for any sliceable
    # object that has a .shape, not just pandas containers.
    print(data[:5])
# Load the normalized API-call histogram CSV (path is relative to the current
# working directory) and print a quick summary of the resulting frame.
df = pd.read_csv(
    filepath_or_buffer='api_call_hist_normalize/api_call_hist_normalize.csv',
)
printInfo(df)
<class 'pandas.core.frame.DataFrame'>
(20330, 310)
---------------------
Unnamed: 0 sha1 \
0 0 0005e3eba4ccabb43cffcd101a1c2424ab8b08fb
1 1 0006a13f031d83a5a7b577ebe88b1f7e51aed1be
2 2 000b8ecf81f3b917b0dc243fb66c6b3e0b8f2ec2
3 3 000befa6a9ebb2d63e9fe9aa3812e1512a2f1321
4 4 000deee4291dcb91cee866eaa298f5a6abb26e5b
label SetUnhandledExceptionFilter \
0 Trojan:Win32/Fuerboos.C!cl 0.0
1 Trojan:Win32/Emotet!rfn 13.0
2 Trojan:Win32/Fuery.A!cl 5.0
3 VirTool:Win32/VBInject 0.0
4 Trojan:Win32/Emotet.AC!bit 0.0
UnhookWindowsHookEx accept bind closesocket \
0 0.0 0.0 0.0 0.0
1 0.0 0.0 12.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
CoInternetSetFeatureEnabled connect ... CryptEnumProvidersW \
0 0.0 0.0 ... 0.0
1 0.0 0.0 ... 0.0
2 0.0 0.0 ... 0.0
3 0.0 0.0 ... 0.0
4 0.0 0.0 ... 0.0
CryptExportKey CryptHashData CryptGenKey CryptImportPublicKeyInfo \
0 0.0 0.0 0.0 0.0
1 0.0 5.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
HTTPSCertificateTrust HTTPSFinalProv CDocument_write \
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
COleScript_ParseScriptText JsEval
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
[5 rows x 310 columns]
# Count how many rows carry each label and keep the 20 most frequent ones
# (value_counts() sorts by count in descending order, so the first 20
# positions are the top 20).
top_20 = df['label'].value_counts().iloc[:20]
printInfo(top_20)
<class 'pandas.core.series.Series'>
(20,)
---------------------
Trojan:Win32/Emotet.AC!bit 4067
Trojan:Win32/Fuerboos.C!cl 1614
Trojan:Win32/Emotet!rfn 960
Trojan:Win32/Emotet.PA!MTB 873
Trojan:Win32/Emotet.LK!ml 576
Trojan:Win32/Occamy.C 519
Trojan:Win32/Dynamer!rfn 445
Trojan:Win32/Tiggre!plock 331
Trojan:Win32/Skeeyah.A!rfn 305
VirTool:Win32/VBInject 300
Trojan:Win32/Sonbokli.A!cl 279
Trojan:HTML/Brocoiner.A!lib 279
Trojan:Win32/Azden.A!cl 277
Trojan:Win32/Fuery.C!cl 270
Trojan:Win32/Fuerboos.A!cl 264
Trojan:Win32/MereTam.A 259
Trojan:Win32/Casdet!rfn 245
VirTool:Win32/VBInject.OX!bit 242
VirTool:Win32/VBInject.ACS!bit 218
Trojan:Win32/Emali.A!cl 218
Name: label, dtype: int64
# Show the index of top_20, i.e. the label strings themselves without counts.
print(top_20.index)
Index(['Trojan:Win32/Emotet.AC!bit', 'Trojan:Win32/Fuerboos.C!cl',
'Trojan:Win32/Emotet!rfn', 'Trojan:Win32/Emotet.PA!MTB',
'Trojan:Win32/Emotet.LK!ml', 'Trojan:Win32/Occamy.C',
'Trojan:Win32/Dynamer!rfn', 'Trojan:Win32/Tiggre!plock',
'Trojan:Win32/Skeeyah.A!rfn', 'VirTool:Win32/VBInject',
'Trojan:Win32/Sonbokli.A!cl', 'Trojan:HTML/Brocoiner.A!lib',
'Trojan:Win32/Azden.A!cl', 'Trojan:Win32/Fuery.C!cl',
'Trojan:Win32/Fuerboos.A!cl', 'Trojan:Win32/MereTam.A',
'Trojan:Win32/Casdet!rfn', 'VirTool:Win32/VBInject.OX!bit',
'VirTool:Win32/VBInject.ACS!bit', 'Trojan:Win32/Emali.A!cl'],
dtype='object')