In [126]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import re
import seaborn as sns
import matplotlib.pyplot as plt
In [226]:
tr = pd.read_csv("./jigsaw-toxic-comment-classification-challenge/train.csv")
ts = pd.read_csv("./jigsaw-toxic-comment-classification-challenge/test.csv")
In [4]:
tr.shape
Out[4]:
(159571, 8)
In [9]:
ts.shape
Out[9]:
(153164, 2)
In [227]:
tr.head(2)
Out[227]:
In [163]:
tr["labels"] = tr.iloc[:,2:].sum(axis=1)
In [224]:
tr.head(2)
Out[224]:
In [123]:
ts.head(2)
Out[123]:
In [124]:
# Number of comments flagged for the first label column ("toxic")
tr.iloc[:, 2].value_counts()[1]
Out[124]:
15294
In [125]:
# Bar chart of how many comments are flagged positive for each label column.
for i in range(tr.keys().size):
    if i >= 2:
        count = tr.iloc[:, i].value_counts()[1]
        plt.bar(tr.keys()[i], count)
        plt.text(tr.keys()[i], count, count)
plt.xticks(rotation=55)
plt.show()
Notebook Image
In [169]:
# Distribution of the combined label count (0 = clean, up to 6 toxicity types)
tr.labels.value_counts().plot(kind="bar")
plt.show()
Notebook Image
In [223]:
# Comment preprocessing: lowercase, strip everything except letters,
# digits and apostrophes, then collapse repeated whitespace.
def convertTosent(text):
    text = text.lower()
    text = re.sub("[^a-z0-9']", " ", text)   # non-alphanumeric characters become spaces
    text = re.sub(r"\s+", " ", text)         # collapse runs of whitespace into one space
    return text
# print(tr.comment_text[3])
# print("\n")
# print(convertTosent(tr.comment_text[3]))
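As a quick sanity check, the cleaner can be run on a made-up comment (the string below is purely illustrative, not taken from the dataset):

In [ ]:
sample = "Hey!! STOP editing my page... NOW :)"
print(convertTosent(sample))
# prints: "hey stop editing my page now " - case lowered, punctuation stripped, whitespace collapsed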
In [29]:
lam = lambda x:convertTosent(x)
In [139]:
tr.comment_text = tr.comment_text.apply(convertTosent)
In [225]:
tr.head(2)
Out[225]:
In [210]:
# Bag-of-words counts over unigrams up to 4-grams; int16 keeps the sparse matrix compact.
cv = CountVectorizer(ngram_range=(1, 4), dtype=np.int16)
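To make the ngram_range=(1,4) setting concrete, here is a small sketch on a made-up two-sentence corpus: every contiguous run of one to four words becomes its own feature, which is why the vocabulary grows to roughly 13 million n-grams on the full training split.

In [ ]:
toy = ["the cat sat", "the cat sat down"]
toy_cv = CountVectorizer(ngram_range=(1, 4))
toy_cv.fit(toy)
print(toy_cv.get_feature_names())
# ['cat', 'cat sat', 'cat sat down', 'down', 'sat', 'sat down',
#  'the', 'the cat', 'the cat sat', 'the cat sat down']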
In [211]:
X_train,X_test,y_train,y_test = train_test_split(tr.comment_text,tr.labels,test_size=.2,random_state=6)
In [212]:
cv_train = cv.fit_transform(X_train)
cv_train
Out[212]:
<127656x12966329 sparse matrix of type '<class 'numpy.int16'>'
	with 28374182 stored elements in Compressed Sparse Row format>
In [213]:
# cv.get_feature_names()
In [214]:
cv_test = cv.transform(X_test)
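Note that the test split goes through transform rather than fit_transform, so it is encoded with the vocabulary learned on the training split; n-grams that never appeared during training are simply dropped. A minimal sketch of that behaviour (the names demo_cv and unseen are just for illustration):

In [ ]:
demo_cv = CountVectorizer(ngram_range=(1, 4))
demo_cv.fit(["the cat sat"])                  # vocabulary is learned here only
unseen = demo_cv.transform(["the dog sat"])   # "dog" and "dog sat" are not in the vocabulary
print(unseen.toarray())                       # only "the" and "sat" are counted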

Train and Prediction

In [215]:
from sklearn.naive_bayes import MultinomialNB
In [216]:
nb = MultinomialNB()
In [217]:
nb.fit(cv_train,y_train)
Out[217]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
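The default alpha=1.0 is Laplace (add-one) smoothing: an n-gram that never co-occurs with a class still gets a small non-zero probability instead of zeroing out the whole likelihood. A rough numeric sketch of the smoothed estimate MultinomialNB uses, with hypothetical counts:

In [ ]:
alpha = 1.0
n_features = cv_train.shape[1]      # ~13M n-gram features in the vocabulary
count_in_class = 0                  # an n-gram never seen with a given class
total_in_class = 1000000            # hypothetical total n-gram count for that class
smoothed = (count_in_class + alpha) / (total_in_class + alpha * n_features)
print(smoothed)                     # tiny, but not zero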
In [218]:
y_pred = nb.predict(cv_test)
In [219]:
accuracy_score(y_test,y_pred)
Out[219]:
0.9026163246122513
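That 0.90 accuracy should be read against the class imbalance: about 90% of the test comments have a label count of 0, so always predicting 0 would score almost as well. A quick check of that majority-class baseline (the variable name is just for illustration):

In [ ]:
majority_baseline = (y_test == 0).mean()
print(majority_baseline)   # about 0.898 for this split, barely below the model's 0.903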
In [220]:
confusion_matrix(y_test,y_pred)
Out[220]:
array([[28643,    14,     4,     0,     0,     0,     0],
       [ 1286,    27,     5,    11,     3,     0,     0],
       [  607,    17,    11,    20,    10,     0,     0],
       [  685,    19,     1,    79,    52,     0,     0],
       [  219,     9,     3,    52,    47,     1,     0],
       [   45,     2,     0,    15,    22,     0,     0],
       [    6,     0,     0,     0,     0,     0,     0]], dtype=int64)
In [221]:
sns.heatmap(confusion_matrix(y_test,y_pred),cmap="jet",linewidths=.2,annot=True,fmt="d")
Out[221]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7843b1488>
Notebook Image
In [222]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     28661
           1       0.31      0.02      0.04      1332
           2       0.46      0.02      0.03       665
           3       0.45      0.09      0.16       836
           4       0.35      0.14      0.20       331
           5       0.00      0.00      0.00        84
           6       0.00      0.00      0.00         6

    accuracy                           0.90     31915
   macro avg       0.35      0.18      0.20     31915
weighted avg       0.85      0.90      0.86     31915
C:\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)