import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import re
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Jigsaw toxic-comment train and test CSV files from the local
# competition directory into DataFrames.
train_csv = "./jigsaw-toxic-comment-classification-challenge/train.csv"
test_csv = "./jigsaw-toxic-comment-classification-challenge/test.csv"
tr = pd.read_csv(train_csv)
ts = pd.read_csv(test_csv)
# Quick look at the raw frames. The bare expressions only display something
# when run interactively (notebook / REPL); the values in the trailing
# comments are the outputs captured from the original run.
tr.shape  # (159571, 8)
ts.shape  # (153164, 2)
tr.head(2)

# "labels" = per-row sum of every column from position 2 onwards. An already
# existing "labels" column is left out of the sum so that re-running this
# step does not fold the previous total into the new one.
label_cols = [col for col in tr.columns[2:] if col != "labels"]
tr["labels"] = tr[label_cols].sum(axis=1)
tr.head(2)
ts.head(2)

# Number of rows whose column at position 2 holds the value 1.
tr.iloc[:, 2].value_counts()[1]  # 15294
# Bar chart: for every column from position 2 onwards, how many rows hold the
# value 1, with the count written at the top of each bar.
# NOTE(review): this range also includes the derived "labels" column created
# earlier in this file, so that bar shows rows with exactly one label —
# confirm it is meant to be on this chart.
for col in tr.columns[2:]:
    # Count once and reuse for both the bar height and its text label.
    positives = tr[col].value_counts()[1]
    plt.bar(col, positives)
    plt.text(col, positives, positives)
# NOTE(review): the original indentation was lost; the two calls below are
# assumed to run once after the loop so that all bars share a single figure.
plt.xticks(rotation=55)
plt.show()

# Bar chart of how many rows have each value of the "labels" column.
tr.labels.value_counts().plot(kind="bar")
plt.show()
# Comment preprocessing...
# A run of one or more characters that are not lowercase ASCII letters,
# digits or apostrophes. Compiled once so repeated calls reuse the pattern.
_NON_TOKEN_RUN = re.compile(r"[^a-z0-9']+")


def convertTosent(text):
    """Lowercase *text* and keep only letters, digits and apostrophes.

    Every maximal run of other characters (punctuation, whitespace,
    non-ASCII letters, ...) is replaced by a single space. A leading or
    trailing run therefore leaves one leading or trailing space; the result
    is not stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    # Replacing each disallowed character with a space and then collapsing
    # whitespace runs is the same as replacing each disallowed run with one
    # space, so a single substitution is enough.
    return _NON_TOKEN_RUN.sub(" ", text.lower())
# Manual spot-check of the cleaner on a single comment (left disabled):
# print(tr.comment_text[3])
# print("\n")
# print(convertTosent(tr.comment_text[3]))


def lam(x):
    """Forward *x* to convertTosent and return its result."""
    return convertTosent(x)


# Replace every training comment with its cleaned form.
tr["comment_text"] = tr["comment_text"].apply(convertTosent)
tr.head(2)
# Count-based n-gram features: unigrams up to 4-grams, stored as int16.
cv = CountVectorizer(ngram_range=(1, 4), dtype=np.int16)

# Hold out 20% of the comments for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    tr.comment_text, tr.labels, test_size=.2, random_state=6
)

# Fit the vocabulary on the training split only, then reuse it unchanged to
# encode the held-out split.
cv_train = cv.fit_transform(X_train)
cv_train
# Output captured from the original run:
# <127656x12966329 sparse matrix of type '<class 'numpy.int16'>'
# with 28374182 stored elements in Compressed Sparse Row format>
# cv.get_feature_names()
cv_test = cv.transform(X_test)
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, trained on the n-gram counts
# to predict the per-comment "labels" value.
nb = MultinomialNB()
nb.fit(cv_train, y_train)
# Output captured from the original run:
# MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Predict the held-out split and compute plain accuracy.
y_pred = nb.predict(cv_test)
accuracy_score(y_test, y_pred)
# Output captured from the original run:
# 0.9026163246122513
# Confusion matrix of true vs. predicted "labels" values on the held-out
# split; computed once and reused for both the display and the heatmap.
cm = confusion_matrix(y_test, y_pred)
cm
# Output captured from the original run:
# array([[28643, 14, 4, 0, 0, 0, 0],
# [ 1286, 27, 5, 11, 3, 0, 0],
# [ 607, 17, 11, 20, 10, 0, 0],
# [ 685, 19, 1, 79, 52, 0, 0],
# [ 219, 9, 3, 52, 47, 1, 0],
# [ 45, 2, 0, 15, 22, 0, 0],
# [ 6, 0, 0, 0, 0, 0, 0]], dtype=int64)

sns.heatmap(cm, cmap="jet", linewidths=.2, annot=True, fmt="d")
# Output captured from the original run:
# <matplotlib.axes._subplots.AxesSubplot at 0x1a7843b1488>
# Show the figure explicitly, as the earlier plots in this file do.
plt.show()

# Per-class precision / recall / F1. In the captured run below, precision and
# F1 for classes 5 and 6 are 0.00, which is what triggers the
# UndefinedMetricWarning at the end of the captured output.
print(classification_report(y_test, y_pred))
# Output captured from the original run:
# precision recall f1-score support
# 0 0.91 1.00 0.95 28661
# 1 0.31 0.02 0.04 1332
# 2 0.46 0.02 0.03 665
# 3 0.45 0.09 0.16 836
# 4 0.35 0.14 0.20 331
# 5 0.00 0.00 0.00 84
# 6 0.00 0.00 0.00 6
# accuracy 0.90 31915
# macro avg 0.35 0.18 0.20 31915
# weighted avg 0.85 0.90 0.86 31915
# C:\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
# 'precision', 'predicted', average, warn_for)