Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from gensim import parsing
from sklearn.metrics import accuracy_score
import chardet
from sklearn.metrics import roc_auc_score,confusion_matrix,classification_report
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Detect the CSV's text encoding from its raw bytes, since the file is not
# guaranteed to be UTF-8.
with open('../input/spam.csv', 'rb') as f:
    result = chardet.detect(f.read())

# Load the CSV into a DataFrame using the detected encoding.
df = pd.read_csv("../input/spam.csv", encoding=result['encoding'])
# Discard the three unnamed columns; only 'v1' and 'v2' are kept.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
# Encode the labels numerically: ham -> 0, spam -> 1.
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})
# Any results you write to the current directory are saved as output.
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-32dc68aff6f2> in <module>
8 from sklearn.linear_model import LogisticRegression
9 from sklearn.pipeline import Pipeline
---> 10 from gensim import parsing
11 from sklearn.metrics import accuracy_score
12 import chardet
ModuleNotFoundError: No module named 'gensim'
# Number of rows per label value in the 'v1' column.
df['v1'].value_counts()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-2-6b679abe81e2> in <module>
1 #Count observations in each label
----> 2 df.v1.value_counts()
NameError: name 'df' is not defined
def parse(s):
    """Return the stemmed form of a text string.

    Args:
        s: The text to process.

    Returns:
        The string produced by ``gensim.parsing.stem_text(s)``, which
        lowercases the text and Porter-stems each word.
    """
    # stem_text returns a new string instead of modifying its argument,
    # so the result must be returned; returning `s` would skip stemming.
    return parsing.stem_text(s)
# Apply parse() and lowercasing to every value of the second column in one
# column-wise pass instead of per-cell iloc reads and writes.
text_column = df.columns[1]
df[text_column] = df[text_column].map(lambda message: parse(message).lower())
# Features are the 'v2' texts; targets are the 'v1' labels.
X, y = df['v2'].tolist(), df['v1'].tolist()

# Train and test set split: hold out 25% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Token counts -> TF-IDF weighting -> RBF-kernel SVM.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='rbf')),
])

# Train the model.
text_clf.fit(X_train, y_train)

# Predict classes from the test data.
predicted = text_clf.predict(X_test)
# ROC AUC needs continuous, non-thresholded scores; feeding it hard class
# predictions reduces the ROC curve to a single operating point.
scores = text_clf.decision_function(X_test)

print(accuracy_score(y_test, predicted))
print(roc_auc_score(y_test, scores))
print(confusion_matrix(y_test, predicted))