Learn practical skills, build real-world projects, and advance your career
Created 5 years ago
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np
pd.options.mode.chained_assignment = None
# Load the API-call histogram CSV into a DataFrame.
df = pd.read_csv('malware-classifier-ipynb-b918e/api_call_hist_normalize/api_call_hist_normalize.csv')

# Feature matrix: every column from 'SetUnhandledExceptionFilter' through
# 'JsEval' (label-based .loc slices include both endpoints).
X = df.loc[:, 'SetUnhandledExceptionFilter':'JsEval'].values

# Raw label strings. Each one is reduced to the text after its last comma,
# cut at the first '.' or '!'.
y_str = df.loc[:, 'label'].values
# Raw-string pattern: the previous non-raw '[\.\!]' relied on invalid string
# escapes (SyntaxWarning on Python 3.12+). Inside a character class neither
# '.' nor '!' needs escaping, so this matches exactly the same characters.
y_str_cleaned = [re.split(r'[.!]', labels.split(',')[-1])[0] for labels in y_str]

# NOTE: `id` shadows the builtin; the name is kept so that any later code
# referring to it keeps working.
id = df.loc[:, 'sha1'].values
print(type(X))
# Prepend the sha1 column so column 0 of X carries the sample identifier.
X = np.column_stack((id, X))
print(type(X))

# Map the cleaned label strings to integer class indices.
label_encoder = LabelEncoder()
label_encoder.fit(y_str_cleaned)
y = label_encoder.transform(y_str_cleaned)

DATASET_SIZE = len(X)
print(DATASET_SIZE)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
20330
from sklearn.model_selection import train_test_split
# Two-stage split: hold out 20% of all rows for testing, then carve 20% of
# the remaining rows off as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2
)
import tensorflow as tf
def Transfer2TfData(data, label, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` from an id-prefixed feature matrix.

    Args:
        data: 2-D array. Column 0 is read as the sample id and the remaining
            columns as the feature values.
        label: Labels, sliced along the first axis in step with the rows of
            ``data``.
        shuffle: When true, shuffle the elements using a buffer as large as
            the whole dataset.
        batch_size: Number of elements per batch.

    Returns:
        A dataset whose elements are ``(features, label)`` pairs, where
        ``features`` is a dict with the keys ``'names'`` (ids) and
        ``'formats'`` (feature values), grouped into batches.
    """
    sample_ids = data[:, 0]
    features = data[:, 1:]
    ds_data = tf.data.Dataset.from_tensor_slices(
        (dict(names=sample_ids.tolist(), formats=features.tolist()), label)
    )
    if shuffle:
        # Dataset transformations return a new dataset rather than modifying
        # the receiver; the result must be rebound or the shuffle is lost.
        ds_data = ds_data.shuffle(buffer_size=len(data))
    ds_data = ds_data.batch(batch_size)
    return ds_data
# Wrap the training split as a tf.data pipeline, requesting shuffling; the
# batch size is left at the function's default.
ds_train = Transfer2TfData(X_train, y_train, shuffle=True)