Learn practical skills, build real-world projects, and advance your career
Created 5 years ago
Bag of words model
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# Show up to 100 characters per column when displaying DataFrames.
# The fully qualified option name is used because abbreviated keys are
# resolved by pattern matching and can become ambiguous in later pandas versions.
pd.set_option('display.max_colwidth', 100)
Let's build a basic bag of words model on three sample documents
# Toy corpus: three short sample documents for the bag-of-words model.
documents = [
    "Gangs of Wasseypur is a great movie.",
    "The success of a movie depends on the performance of the actors.",
    "There are no new movies releasing this week.",
]
print(documents)
['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']
def preprocess(document):
    """Lower-case a document and strip English stop words from it.

    The text is lower-cased, split into tokens with ``word_tokenize``, and
    every token found in NLTK's English stop-word list is dropped. The
    remaining tokens are joined back together with single spaces.

    Punctuation tokens are not in the stop-word list, so they are kept as
    separate space-delimited tokens (e.g. a trailing ``" ."``).

    Args:
        document: The raw text of one document (a ``str``).

    Returns:
        The cleaned document as a single space-separated string.
    """
    # Build the stop-word lookup once per call: calling stopwords.words()
    # inside the filter would rebuild the list for every token, and a set
    # gives constant-time membership tests.
    stop_words = set(stopwords.words("english"))

    # change sentence to lower case, then tokenize into words
    tokens = word_tokenize(document.lower())

    # remove stop words
    kept = [token for token in tokens if token not in stop_words]

    # join words to make sentence
    return " ".join(kept)
# Run every sample document through preprocess and display the cleaned corpus.
documents = list(map(preprocess, documents))
print(documents)
['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']