Click fraud detection for every click, based on click-log data from a real-world industry dataset.
The private dataset (Columbia dataset) contains almost 45 fields per click-log record.
Machine learning and deep learning techniques are used to classify each click, reaching accuracy of up to 98%.
import jovian
PREPROCESSING
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pandas import read_csv
df1=pd.read_csv("AllFeb21-fraud.csv")
df2=pd.read_csv("AllFeb21-click.csv")
df2=df2.sample(frac=0.06)
df=pd.concat([df1,df2], sort = False, axis=0, join='outer')
df.head()
df = df.sample(frac=1).reset_index(drop=True) # shuffle the dataset
# print(df['fraud'])
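Combining the full fraud log with a 6% sample of the regular click log roughly balances the two classes. A quick look at the label distribution (a small check added here, assuming the label column is 'fraud' as used later) makes the mix explicit:
# sanity check: share of fraud vs. non-fraud rows after sampling and concatenation
print(df['fraud'].value_counts(normalize=True))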
# dropping unnecessary columns: not required (NR), too complex to encode (COMPLEX), or constant (CONST)
df = df.drop(['bundleId', 'refUrl', 'fraudReason',          # NR
              'auds', 'callIdentifier', 'section',
              'ctxCatId', 'clickXForwardedFor',             # COMPLEX
              'crtd', 'paid', 'esi'],                       # CONST
             axis=1)
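The columns tagged CONST above were identified manually; as a hedged alternative, constant columns can also be detected programmatically before dropping them (an illustrative sketch, not part of the original flow):
# list columns with at most one distinct value (candidates for the CONST group)
constant_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
print('constant columns:', constant_cols)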
# Encoding the independent high-cardinality variables (ids, timestamps, IPs, user agent) as integers.
# A fresh LabelEncoder is fitted per column.
for col in ['id', 'timestamp', 'imprId', 'clmbUserId', 'clickIp',
            'impressionServedTimeStamp', 'ip', 'userAgent']:
    labelencoder_X = LabelEncoder()
    df[col] = labelencoder_X.fit_transform(df[col])
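Each column above is encoded with a throwaway LabelEncoder, so the fitted mappings cannot be reused on new click records. If the same encoding is needed at serving time, a variant that keeps the fitted encoders in a dictionary would replace the per-column fit above (a sketch under that assumption; new_clicks is a hypothetical dataframe with the same raw columns):
# variant: retain one fitted encoder per column for reuse on incoming clicks
# encoders = {}
# for col in ['id', 'timestamp', 'imprId', 'clmbUserId', 'clickIp',
#             'impressionServedTimeStamp', 'ip', 'userAgent']:
#     encoders[col] = LabelEncoder().fit(df[col])
#     df[col] = encoders[col].transform(df[col])
# # later: new_clicks[col] = encoders[col].transform(new_clicks[col])  # raises on unseen values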
#Imputation
# mark zero values as missing (NaN)
df = df.replace(0, np.nan)
# fill missing values with the column mean
df.fillna(df.mean(), inplace=True)
# Encoding the target variable - fraud. This is done after imputation; otherwise its zero labels would be marked as missing and filled with 1s (the mean of the remaining labels).
labelencoder_X = LabelEncoder()
df.fraud = labelencoder_X.fit_transform(df.fraud)
# print(df['fraud'])
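Zeros were treated as missing and filled with the column mean; a cheap check that no NaNs survive and that the encoded fraud label is binary (an added sanity check, not in the original run):
# verify imputation and label encoding: no missing values, binary target
print('remaining NaNs:', df.isnull().sum().sum())
print('fraud label values:', sorted(df['fraud'].unique()))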
from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler(feature_range=(0, 100))
df.columns.values
array(['id', 'timestamp', 'imprId', 'clmbUserId', 'adSltDimId', 'itemid',
'algo', 'advClientId', 'pubClientId', 'itmClmbLId', 'tmpltId',
'clickBid', 'position', 'siteId', 'fraud', 'uuidSource', 'clickIp',
'clickGeoId', 'adBlockerDetectionValueId',
'impressionServedTimeStamp', 'ip', 'geoDimId', 'cityDimId',
'stateDimId', 'ispDimId', 'userAgent', 'osDimId', 'devTypeDimId',
'vendorDimId', 'modelDimId', 'connTypeDimId', 'browserDimId'],
dtype=object)
cor = df.corr()
cor_sorted = cor.sort_values(by=['fraud'], ascending=False).iloc[:,14] # take the 'fraud' column (index 14) of the correlation matrix, most correlated features on top
# cor_sorted
# position 0.021268
# connTypeDimId 0.014521
# advClientId 0.001073
# clickGeoId -0.001880
# clmbUserId -0.013623
# imprId -0.018225
# these can be removed
sp_cor = df.corr(method='spearman')
sp_cor_sorted = sp_cor.sort_values(by=['fraud'], ascending=False).iloc[:,14] # Spearman correlation with 'fraud' (index 14), most correlated features on top
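The commented Pearson output above flags position, connTypeDimId, advClientId, clickGeoId, clmbUserId and imprId as nearly uncorrelated with fraud and notes that they could be removed; the notebook keeps all 31 features, but the hinted-at pruning would look like this (a sketch, deliberately left commented so the column indices used below stay valid):
# optional feature pruning based on the correlation ranking above
# low_corr = ['position', 'connTypeDimId', 'advClientId', 'clickGeoId', 'clmbUserId', 'imprId']
# df = df.drop(columns=low_corr)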
array = df.values
x_cols = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] # every column except index 14 ('fraud')
# store the feature matrix (X) and response vector (y); column 14 is the fraud label
X = array[:,x_cols]
Y = array[:,14]
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
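The split above is purely random; since the class mix was balanced by sampling, a stratified split would preserve that mix exactly in both sets (a sketch of the stratified variant, left commented so it does not redo the split):
# stratified variant: keep the fraud/non-fraud ratio identical in train and test
# X_train, X_test, Y_train, Y_test = train_test_split(
#     X, Y, test_size=0.2, random_state=42, stratify=Y)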
#to print confusion matrix
from sklearn.metrics import confusion_matrix
#to print metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
GAUSSIAN NAIVE BAYES CLASSIFIER - without normalisation
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
# making predictions on the testing set
Y_pred = gnb.predict(X_test)
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
# print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(Y_test, Y_pred)*100)
#Confusion matrix and various metrics of GNB classifer
results = confusion_matrix(Y_test, Y_pred)
print ('Confusion Matrix of GNB Classifier:')
print(results)
print ('Accuracy Score :',accuracy_score(Y_test, Y_pred))
print ('Report : ')
print (classification_report(Y_test, Y_pred))
Confusion Matrix of GNB Classifier:
[[1471 156]
[ 659 773]]
Accuracy Score : 0.7335730630925139
Report :
precision recall f1-score support
0.0 0.69 0.90 0.78 1627
1.0 0.83 0.54 0.65 1432
avg / total 0.76 0.73 0.72 3059
SVM - without normalisation (left commented out; SVM results are reported on the normalised input further below)
# from sklearn.svm import SVC # "Support Vector Classifier"
# clf = SVC(kernel='linear')
# # fitting x samples and y classes
# clf.fit(X_train, Y_train)
# Y_pred = clf.predict(X_test)
# # comparing actual response values (y_test) with predicted response values (y_pred)
# print("SVM accuracy(in %):", metrics.accuracy_score(Y_test, Y_pred)*100)
# results = confusion_matrix(Y_test, Y_pred)
# print ('Confusion Matrix SVM:')
# print(results)
# print ('Accuracy Score :',accuracy_score(Y_test, Y_pred))
# print ('Report : ')
# print (classification_report(Y_test, Y_pred))
LOGISTIC REGRESSION - without normalisation
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
# Put the result into a color plot
# print("Logistic Regression accuracy(in %):", metrics.accuracy_score(Y_test, Y_pred)*100)
results = confusion_matrix(Y_test, Y_pred)
print ('Confusion Matrix of Logistic Regression:')
print(results)
print ('Accuracy Score :',accuracy_score(Y_test, Y_pred))
print ('Report : ')
print (classification_report(Y_test, Y_pred))
Confusion Matrix of Logistic Regression:
[[1451 176]
[ 491 941]]
Accuracy Score : 0.7819548872180451
Report :
precision recall f1-score support
0.0 0.75 0.89 0.81 1627
1.0 0.84 0.66 0.74 1432
avg / total 0.79 0.78 0.78 3059
Normalising data
from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler(feature_range=(0, 100))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
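The MinMax and standard scalers are fitted on the training data only and then applied to the test data, which avoids leakage; the same two-step scaling can be expressed more compactly as a scikit-learn Pipeline (an equivalent sketch, left commented because the arrays are already scaled above):
# equivalent formulation: chain both scalers in one transformer
# from sklearn.pipeline import make_pipeline
# scaling = make_pipeline(MinMaxScaler(feature_range=(0, 100)), StandardScaler())
# X_train = scaling.fit_transform(X_train)
# X_test = scaling.transform(X_test)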
GNB CLASSIFIER - normalised input
# training the model on training set
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
gnb.fit(X_train, Y_train)
# making predictions on the testing set
Y_pred = gnb.predict(X_test)
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(Y_test, Y_pred)*100)
Gaussian Naive Bayes model accuracy(in %): 79.53579601176855
#Confusion matrix and various metrics of GNB classifer
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import classification_report
results = confusion_matrix(Y_test, Y_pred)
print ('Confusion Matrix of GNB Classifier:')
print(results)
print ('Accuracy Score :',accuracy_score(Y_test, Y_pred))
print ('Report : ')
print (classification_report(Y_test, Y_pred))
Confusion Matrix of GNB Classifier:
[[1262 365]
[ 261 1171]]
Accuracy Score : 0.7953579601176856
Report :
precision recall f1-score support
0.0 0.83 0.78 0.80 1627
1.0 0.76 0.82 0.79 1432
avg / total 0.80 0.80 0.80 3059
SVM - normalised input
from sklearn.svm import SVC # "Support Vector Classifier"
clf = SVC(kernel='linear')
# fitting x samples and y classes
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
# comparing actual response values (y_test) with predicted response values (y_pred)
# print("SVM accuracy(in %):", metrics.accuracy_score(Y_test, Y_pred)*100)
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import classification_report
results = confusion_matrix(Y_test, Y_pred)
print ('Confusion Matrix SVM:')
print(results)
print ('Accuracy Score :',accuracy_score(Y_test, Y_pred))
print ('Report : ')
print (classification_report(Y_test, Y_pred))
Confusion Matrix SVM:
[[1626 1]
[ 39 1393]]
Accuracy Score : 0.986923831317424
Report :
precision recall f1-score support
0.0 0.98 1.00 0.99 1627
1.0 1.00 0.97 0.99 1432
avg / total 0.99 0.99 0.99 3059
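The 98.7% SVM accuracy comes from a single 80/20 split; a k-fold cross-validation on the scaled training data is a cheap way to check that the score is not split-specific (a hedged sketch, not executed in the original notebook):
# optional robustness check: 5-fold cross-validated accuracy of the linear SVM
# from sklearn.model_selection import cross_val_score
# cv_scores = cross_val_score(SVC(kernel='linear'), X_train, Y_train, cv=5, scoring='accuracy')
# print('CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))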
LOGISTIC REGRESSION - normalised input
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
# Put the result into a color plot
# print("Logistic Regression accuracy(in %):", metrics.accuracy_score(Y_test, Y_pred)*100)
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import classification_report
results = confusion_matrix(Y_test, Y_pred)
print ('Confusion Matrix Logistic regression :')
print(results)
print ('Accuracy Score :',accuracy_score(Y_test, Y_pred))
print ('Report : ')
print (classification_report(Y_test, Y_pred))
Confusion Matrix Logistic regression :
[[1620 7]
[ 37 1395]]
Accuracy Score : 0.9856162144491664
Report :
precision recall f1-score support
0.0 0.98 1.00 0.99 1627
1.0 1.00 0.97 0.98 1432
avg / total 0.99 0.99 0.99 3059
DEEP LEARNING
import warnings
warnings.filterwarnings('ignore')
# from pandas import read_csv
# import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential,model_from_json
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from keras.layers import BatchNormalization
import pylab as plt
import h5py
Using TensorFlow backend.
import sklearn.metrics
batch_size = 128
num_classes = 2
epochs = 30
# convert class vectors to binary class matrices
Y_train = keras.utils.to_categorical(Y_train, num_classes)
Y_test = keras.utils.to_categorical(Y_test, num_classes)
# Y_test
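to_categorical turns the 0/1 label vector into two one-hot columns, matching the two-unit softmax output layer built below; a tiny illustration:
# e.g. labels [0, 1, 1] become one-hot rows [[1, 0], [0, 1], [0, 1]]
print(keras.utils.to_categorical([0, 1, 1], num_classes=2))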
#building NN
first_layer_size = 31
model = Sequential()
model.add(Dense(first_layer_size, activation='relu', input_shape=(31,)))
model.add(Dense(32, activation='sigmoid'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(25, activation='sigmoid'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(16, activation='sigmoid'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_1 (Dense) (None, 31) 992
_________________________________________________________________
dense_2 (Dense) (None, 32) 1024
_________________________________________________________________
batch_normalization_1 (Batch (None, 32) 128
_________________________________________________________________
dropout_1 (Dropout) (None, 32) 0
_________________________________________________________________
dense_3 (Dense) (None, 25) 825
_________________________________________________________________
batch_normalization_2 (Batch (None, 25) 100
_________________________________________________________________
dropout_2 (Dropout) (None, 25) 0
_________________________________________________________________
dense_4 (Dense) (None, 16) 416
_________________________________________________________________
dense_5 (Dense) (None, 2) 34
=================================================================
Total params: 3,519
Trainable params: 3,405
Non-trainable params: 114
_________________________________________________________________
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
classifier = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,verbose=0,validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Test loss: 0.04661931366994295
Test accuracy: 0.9879045439686172
#print(x_test[1:2].shape)
predictions = model.predict(X_test)
# prediction = prediction[0]
print('Prediction\n',predictions)
thr_pred =(predictions>0.5)*1
print('Thresholded output\n',thr_pred)
Prediction
[[0.99623126 0.00376873]
[0.996387 0.00361296]
[0.00180777 0.99819225]
...
[0.00190076 0.99809927]
[0.00180778 0.99819225]
[0.99640816 0.00359182]]
Thresholded output
[[1 0]
[1 0]
[0 1]
...
[0 1]
[0 1]
[1 0]]
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
results = confusion_matrix(Y_test.argmax(axis=1), predictions.argmax(axis=1))
print ('Confusion Matrix of Neural network:')
print(results)
print ('Accuracy Score :',accuracy_score(Y_test.argmax(axis=1), predictions.argmax(axis=1)))
print ('Report : ')
print (classification_report(Y_test.argmax(axis=1), predictions.argmax(axis=1)))
Confusion Matrix of Neural network:
[[1611 16]
[ 21 1411]]
Accuracy Score : 0.9879045439686172
Report :
precision recall f1-score support
0 0.99 0.99 0.99 1627
1 0.99 0.99 0.99 1432
avg / total 0.99 0.99 0.99 3059
model.save_weights('model.h5')
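Only the weights are written to model.h5; model_from_json is imported above but never used, so a matching architecture-plus-weights round trip would look like this (a sketch using the standard Keras API, not run here):
# save the architecture as JSON next to the weights, then rebuild the model for inference
# with open('model.json', 'w') as f:
#     f.write(model.to_json())
# restored = model_from_json(open('model.json').read())
# restored.load_weights('model.h5')
# restored.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])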
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.gcf()
plt.plot(classifier.history['acc'])
plt.plot(classifier.history['val_acc'])
plt.title("Accuracy Curve")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(['train','test'], loc="upper left")
plt.show()
fig.savefig("Model_Accuracy_Curves.png")
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.gcf()
plt.plot(classifier.history['loss'])
plt.plot(classifier.history['val_loss'])
plt.title("Log Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(['train','test'], loc="upper left")
plt.show()
fig.savefig("Model_Loss_Curves.png")
jovian.commit(artifacts='headings.txt')
[jovian] Saving notebook..