import numpy as np
import pandas as pd
import re
import spacy as sp
import nltk as nl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
nlp = sp.load("en_core_web_sm")
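Only the tokenizer and the is_stop / is_punct flags are used below, so the heavier pipeline components could optionally be disabled for speed; shown commented out, and the component names assume the standard en_core_web_sm pipeline.
# nlp = sp.load("en_core_web_sm", disable=["tagger", "parser", "ner"])  # optional speed-up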
df = pd.read_table("SMSSpamCollection.txt",names=["labels","message"])
df.head(2)
FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
df = pd.read_csv("SMSSpamCollection.txt",sep="\t",names=["labels","message"])
df.head(2)
df.shape
(5572, 2)
df.message[:5]
0 Go until jurong point, crazy.. Available only ...
1 Ok lar... Joking wif u oni...
2 Free entry in 2 a wkly comp to win FA Cup fina...
3 U dun say so early hor... U c already then say...
4 Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object
def textPreprocessor(msg):
    # Lowercase, replace non-word characters with spaces, then drop
    # stop words and punctuation using the spaCy pipeline.
    msg = msg.lower()
    msg = re.sub(r"[^\w]", " ", msg)
    obj = nlp(msg)
    msg_wo_sw_pun = [wd.text for wd in obj if not wd.is_stop and not wd.is_punct]
    sent = " ".join(msg_wo_sw_pun)
    clean_sent = re.sub(r"\s+", " ", sent)  # collapse repeated whitespace
    return clean_sent
# textPreprocessor(df.message[200])
df.message = df.message.apply(textPreprocessor)
df.message[:10]
0 jurong point crazy available bugis n great wor...
1 ok lar joking wif u oni
2 free entry 2 wkly comp win fa cup final tkts 2...
3 u dun early hor u c
4 nah don t think goes usf lives
5 freemsg hey darling s 3 week s word d like fun...
6 brother like speak treat like aids patent
7 request melle melle oru minnaminunginte nurung...
8 winner valued network customer selected receiv...
9 mobile 11 months u r entitled update latest co...
Name: message, dtype: object
df.labels[:5]
0 ham
1 ham
2 spam
3 ham
4 ham
Name: labels, dtype: object
df["labels"] = df["labels"].replace({"ham": 0, "spam": 1})
df.labels[:5]
0 0
1 0
2 1
3 0
4 0
Name: labels, dtype: int64
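Before splitting, it is worth checking how imbalanced the labels are; the SMS Spam Collection is heavily skewed toward ham, which is why accuracy alone can be misleading later on.
df.labels.value_counts()  # counts of 0 (ham) vs 1 (spam); ham dominates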
# train = df.message[:-200]
# test = df.message[-200:]
X = df.message  # X is the independent data (message text), used for training and testing
y = df.labels   # y is the dependent (target) data, used for training and testing
X.shape
(5572,)
y.shape
(5572,)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=6) # random_state is used for reproducibility
X_train.shape
(4457,)
X_test.shape
(1115,)
y_train.shape
(4457,)
y_test.shape
(1115,)
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=7)
# X_train
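Because spam is the minority class, a stratified split keeps the same ham/spam ratio in both sets; shown commented out so it does not replace the split actually used above.
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=6,stratify=y)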
cv = CountVectorizer()
cv
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)
cv_train_dtm = cv.fit_transform(X_train)
cv_train_dtm
<4457x7488 sparse matrix of type '<class 'numpy.int64'>'
with 34567 stored elements in Compressed Sparse Row format>
cv_train_dtm.toarray() # convert the sparse matrix to a dense array just for viewing
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=int64)
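.toarray() materializes the full 4457 x 7488 dense matrix just for display; the CSR form only stores the non-zero counts, which is why the sparse matrix is what gets passed to the model. A quick check using standard scipy.sparse attributes:
print(cv_train_dtm.shape, cv_train_dtm.nnz)  # matrix size vs. number of stored non-zeros
print(cv_train_dtm.nnz / (cv_train_dtm.shape[0] * cv_train_dtm.shape[1]))  # density is well under 1%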
print(cv_train_dtm.tocoo()) # to see co-ordinates of non-zero values
(0, 5716) 1
(0, 3221) 1
(0, 1003) 1
(0, 5994) 1
(0, 6441) 1
(0, 4491) 1
(0, 2188) 1
(0, 753) 1
(0, 2254) 1
(1, 2848) 1
(1, 1460) 1
(1, 3859) 1
(1, 4704) 1
(2, 2390) 1
(2, 6472) 1
(2, 4774) 1
(3, 4641) 1
(3, 3601) 1
(3, 7152) 1
(4, 6547) 1
(4, 6708) 1
(4, 3744) 1
(4, 853) 1
(4, 6536) 1
(4, 2611) 1
: :
(4454, 4144) 1
(4454, 4544) 1
(4454, 3095) 1
(4455, 549) 1
(4455, 5023) 1
(4455, 4940) 1
(4455, 5788) 1
(4455, 5726) 1
(4455, 3347) 1
(4455, 2452) 1
(4455, 4218) 1
(4455, 4000) 1
(4455, 979) 1
(4455, 6834) 1
(4456, 4434) 1
(4456, 1926) 1
(4456, 4329) 1
(4456, 3953) 1
(4456, 4786) 1
(4456, 3149) 1
(4456, 1975) 1
(4456, 3405) 1
(4456, 6730) 1
(4456, 2923) 1
(4456, 3859) 1
cv_test_dtm = cv.transform(X_test)
# cv2 = CountVectorizer(ngram_range=(1,2))
# cv2.fit_transform(X)
# cv2.get_feature_names()
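To see what the columns of the document-term matrix mean, here is a tiny sketch on a made-up two-message corpus; get_feature_names() matches this scikit-learn version (newer releases use get_feature_names_out()).
toy = ["free entry win prize", "ok see u later"]  # invented example messages
cv_uni = CountVectorizer()
cv_uni.fit_transform(toy)
print(cv_uni.get_feature_names())   # ['entry', 'free', 'later', 'ok', 'prize', 'see', 'win']
                                    # note: single-character tokens like 'u' are dropped by the default token_pattern
cv_bi = CountVectorizer(ngram_range=(1, 2))
cv_bi.fit_transform(toy)
print(cv_bi.get_feature_names())    # also includes bigrams such as 'free entry' and 'win prize'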
Reference on the intuition behind the Naive Bayes classifier: https://monkeylearn.com/blog/practical-explanation-naive-bayes-classifier/
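The linked article explains the counting behind Naive Bayes; the toy numbers below are invented just to illustrate the calculation MultinomialNB(alpha=1.0) performs, i.e. P(class | words) proportional to P(class) times the product of smoothed P(word | class).
spam_counts = {"free": 20, "win": 15, "ok": 1}   # invented word counts inside spam messages
ham_counts  = {"free": 2,  "win": 1,  "ok": 50}  # invented word counts inside ham messages
vocab_size = len(set(spam_counts) | set(ham_counts))
def class_log_score(words, counts, prior):
    total = sum(counts.values())
    score = np.log(prior)  # log prior of the class
    for w in words:
        score += np.log((counts.get(w, 0) + 1) / (total + vocab_size))  # Laplace (add-one) smoothing
    return score
msg = ["free", "win"]
print(class_log_score(msg, spam_counts, 0.13))   # higher (less negative) score -> predicted class
print(class_log_score(msg, ham_counts, 0.87))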
nb = MultinomialNB()
nb.fit(cv_train_dtm,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_pred = nb.predict(cv_test_dtm)
y_pred
array([0, 0, 0, ..., 0, 0, 0])
confusion_matrix(y_test,y_pred)
array([[963, 5],
[ 10, 137]])
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt="d")
<matplotlib.axes._subplots.AxesSubplot at 0x1a7f052908>
accuracy_score(y_test,y_pred)*100
98.65470852017937
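classification_report was imported earlier but never used; because ham is far more common than spam, per-class precision and recall are worth reporting alongside accuracy.
print(classification_report(y_test, y_pred, target_names=["ham", "spam"]))  # 0 -> ham, 1 -> spam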
tfidf = TfidfVectorizer()
tf_train = tfidf.fit_transform(X_train[:5])
tf_train
<5x41 sparse matrix of type '<class 'numpy.float64'>'
with 41 stored elements in Compressed Sparse Row format>
tf_train.toarray()
array([[0.33333333, 0. , 0. , 0.33333333, 0. ,
0. , 0.33333333, 0.33333333, 0. , 0. ,
0. , 0. , 0. , 0.33333333, 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.33333333, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.33333333, 0.33333333, 0. , 0.33333333,
0. , 0. , 0. , 0. , 0. ,
0. ],
[0. , 0. , 0. , 0. , 0.5 ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.5 , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0.5 ,
0. , 0. , 0. , 0. , 0.5 ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.57735027, 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.57735027, 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.57735027, 0. , 0. , 0. , 0. ,
0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.57735027, 0. , 0. , 0. ,
0. , 0. , 0. , 0.57735027, 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.57735027],
[0. , 0.21320072, 0.21320072, 0. , 0. ,
0.21320072, 0. , 0. , 0. , 0.21320072,
0.21320072, 0. , 0.21320072, 0. , 0.21320072,
0.21320072, 0. , 0.21320072, 0.21320072, 0. ,
0.21320072, 0.21320072, 0. , 0. , 0. ,
0. , 0.21320072, 0.21320072, 0.21320072, 0.21320072,
0.21320072, 0. , 0. , 0.21320072, 0. ,
0. , 0.21320072, 0.21320072, 0.21320072, 0.21320072,
0. ]])
tf_train.toarray()[0]
0.0
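The TF-IDF demo above only fits five messages; a sketch of using it in place of CountVectorizer on the full split, with a fresh model (the resulting score is not shown here).
tfidf_full = TfidfVectorizer()
tf_train_dtm = tfidf_full.fit_transform(X_train)   # learn vocabulary + idf on training data only
tf_test_dtm = tfidf_full.transform(X_test)         # reuse the same vocabulary for the test set
nb_tf = MultinomialNB()
nb_tf.fit(tf_train_dtm, y_train)
accuracy_score(y_test, nb_tf.predict(tf_test_dtm)) * 100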
from joblib import dump,load
dump(nb,"nb.joblib")
['nb.joblib']
mod = load("nb.joblib")
y_pred = mod.predict(cv_test_dtm)
y_pred
array([0, 0, 0, ..., 0, 0, 0])
accuracy_score(y_test,y_pred)*100
98.65470852017937
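The saved classifier only understands vectors produced by the same fitted CountVectorizer, so in practice the vectorizer should be persisted alongside it; the message text and the cv.joblib file name below are made up for illustration.
dump(cv, "cv.joblib")                                  # persist the fitted vectorizer too
vec = load("cv.joblib")
new_msg = "free entry to win a prize, text now"        # invented example message
new_dtm = vec.transform([textPreprocessor(new_msg)])   # same preprocessing + same vocabulary
mod.predict(new_dtm)                                   # 1 = spam, 0 = ham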