Learn practical skills, build real-world projects, and advance your career

Converting text to numeric features using scikit-learn's feature extraction

Ref: https://github.com/justmarkham/pycon-2016-tutorial/blob/master/tutorial.ipynb

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import scikitplot as skp

# Silence every warning for the rest of the session so that cell output
# stays uncluttered (installs a blanket "ignore" filter).
import warnings
warnings.filterwarnings(action="ignore")
Sample dataset

Each element of the list is treated as a separate document.

# Toy corpus: five short strings, each one standing in for a document.
sample_data = [
    "This is test1",
    "This is test2",
    "This is another line with test3",
    "Yet another line with test4",
    "yet again another line with test5",
]
# Bare expression so that the notebook echoes the list as the cell output.
sample_data
['This is test1',
 'This is test2',
 'This is another line with test3',
 'Yet another line with test4',
 'yet again another line with test5']

Using CountVectorizer()