Jovian
⭐️
Sign In

Toxic comment classification

Library imports

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer
import emoji
import string
import spacy
nlp = spacy.load("en_core_web_sm")
from collections import Counter

Data

In [2]:
toxic_data = pd.read_csv('./data/jigsaw-toxic-comment-classification-challenge/train.csv')
In [3]:
toxic_data.head()
Out[3]:

Cleaning

In [4]:
def preprocess_text(text, remove_stop = True, stem_words = False, remove_mentions_hashtags = True):
    
    # Remove emojis
    emoji_pattern = re.compile("[" "\U0001F1E0-\U0001F6FF" "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r"", text)
    text = "".join([x for x in text if x not in emoji.UNICODE_EMOJI])

    if remove_mentions_hashtags:
        text = re.sub(r"@(\w+)", " ", text)
        text = re.sub(r"#(\w+)", " ", text)
        
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text.lower())
    words = (''.join(nopunct)).split()
    if(remove_stop):
        words = [w for w in words if w not in ENGLISH_STOP_WORDS]
        words = [w for w in words if len(w) > 2]  # ignore a, an, to, at, be, ...
        
    if(stem_words):
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
        
    return list(words)
In [5]:
toxic_data['comments_cleaned'] = toxic_data['comment_text'].apply(lambda x: ' '.join(preprocess_text(x,remove_stop=True)))
In [6]:
toxic_data
Out[6]:

Exploration

In [7]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = ' '.join(toxic_data[:1000]['comment_text'])
doc = nlp(' '.join(preprocess_text(text)))
In [8]:
x = doc[0]
print('word:', x, '\n')
print('part of speech:', x.pos_, '\n')
print('sentiment:', x.sentiment,'\n')
print('sentiment:', x.sentiment,'\n')
print('word vector:', x.vector)
word: explanation part of speech: NOUN sentiment: 0.0 sentiment: 0.0 word vector: [-2.6271276 0.5393927 -3.23806 -0.59520626 2.621077 -0.6216399 0.97293466 0.97172683 -0.56233525 -0.67415416 0.28694367 0.41693017 2.0528343 3.4813812 -0.30988914 -5.2923045 0.69299906 -1.4184353 -1.841832 -1.4449937 1.7166979 -1.4933926 3.3603275 5.827181 -0.6357467 -1.3172668 -2.197509 0.8505597 -3.203742 -0.61942637 -2.7716808 0.27717188 1.8001975 0.30208138 2.8583205 -0.8662306 -1.1807231 0.14017114 2.1908553 -1.998581 -1.409525 1.7886095 -2.7575257 4.1438007 1.9355705 -4.3088074 0.38313246 -3.0499797 -4.0533943 -1.035784 -0.7164862 -3.2510495 1.1207311 3.0780158 0.4149342 -0.83172405 -1.1492554 3.0987144 4.498347 1.0345111 -2.1468039 0.24153978 -3.202008 1.0254357 2.9969568 -1.3097451 0.1563667 1.1692545 2.5476491 -2.2194042 0.61490136 1.1603665 -1.8811877 0.1641304 -0.54432106 -0.23718886 0.49933246 3.472591 3.7640336 -3.0298285 -0.6738827 0.686922 -0.34006023 1.3695829 -0.24660659 -0.18915454 0.3993888 1.2477443 4.126819 0.4591628 -1.2343663 -1.6628046 1.8221438 -2.6070857 -0.53009963 0.24270105]

Visualization

In [21]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def make_wc(word_list):
    wordcloud = WordCloud()
    wordcloud.fit_words(dict(Counter(word_list).most_common(40)))

    fig=plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    
make_wc([token.text for token in doc if token.pos_ in ['NOUN']])
Notebook Image

Sentiment Analysis

In [10]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sentiment_analyzer.polarity_scores(text)['compound']
In [11]:
## let's get some toxic comments
toxic_comments = toxic_data.loc[toxic_data.toxic == 1]
print('toxic comment score: ', get_sentiment_score(list(toxic_comments['comment_text'])[0]))

##now for some non-toxic comment
non_toxic_comments = toxic_data.loc[((toxic_data.toxic == 0) & (toxic_data.severe_toxic==0) & (toxic_data.obscene==0) 
                                    & (toxic_data.threat == 0) & (toxic_data.insult == 0) & (toxic_data.identity_hate == 0))]
print('non toxic comment score: ', get_sentiment_score(list(non_toxic_comments['comment_text'])[0]))
toxic comment score: -0.7783 non toxic comment score: 0.5574

Classification using fasttext

In [12]:
# ignore severe_toxic, obscene, threat, insult, identity_hate
toxic_non_toxic_data = toxic_data.loc[((toxic_data.severe_toxic==0) & (toxic_data.obscene==0) 
                                    & (toxic_data.threat == 0) & (toxic_data.insult == 0) 
                                       & (toxic_data.identity_hate == 0))]
y = toxic_non_toxic_data[['toxic']]
In [13]:
#train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(toxic_non_toxic_data, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(119209, 9) (119209, 1) (29803, 9) (29803, 1)
In [14]:
#preparing data to train
import csv
X_train['toxic'] = ['__label__'+str(s) for s in X_train['toxic']]
data = X_train[['comment_text', 'toxic']]
data.to_csv('comments.txt', index=False, sep=' ', header=False,quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
/Users/aakanksha/miniconda3/envs/Zyper/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This is separate from the ipykernel package so we can avoid doing imports until
In [15]:
#training for 4 epochs
import fasttext
model = fasttext.train_supervised(input="comments.txt", epoch=4)
In [16]:
#extracting prediction out of the fasttext output
def get_prediction(fast_out):
    return int(fast_out[0][0].split('__label__')[1])
In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

##Training accuracy
preds = [get_prediction(model.predict(' '.join(preprocess_text(x))))  for x in list(X_train['comment_text'])]
print('accuracy:',accuracy_score(preds,y_train),'\n')

##Training F1-score
print('F1-score:',f1_score(preds,y_train),'\n')
accuracy: 0.9514885621052102 F1-score: 0.45448542590321667
In [18]:
##Test accuracy
preds = [get_prediction(model.predict(' '.join(preprocess_text(x))))  for x in list(X_test['comment_text'])]
print('accuracy:',accuracy_score(preds,y_test),'\n')

##Training F1-score
print('F1-score:',f1_score(preds,y_test),'\n')
accuracy: 0.9484280106029594 F1-score: 0.4012465913517725
In [19]:
import jovian
In [ ]:
jovian.commit()
[jovian] Saving notebook..
In [ ]: