Jovian
⭐️
Sign In
In [88]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd
import seaborn as sns

Lets have some sample document

In [52]:
sample_docs = ["This is a line in one document",
              "This is another line in another document",
              "Yet another line in third document",
              "This is also a line which is same as present in first document"]

Vectorize the document

In [53]:
cv = CountVectorizer()
In [54]:
samploe_doc_dtm = cv.fit_transform(sample_docs)
In [55]:
samploe_doc_dtm
Out[55]:
<4x15 sparse matrix of type '<class 'numpy.int64'>'
	with 29 stored elements in Compressed Sparse Row format>
In [56]:
print(samploe_doc_dtm.todense())
[[0 0 0 1 0 1 1 1 1 0 0 0 1 0 0] [0 2 0 1 0 1 1 1 0 0 0 0 1 0 0] [0 1 0 1 0 1 0 1 0 0 0 1 0 0 1] [1 0 1 1 1 1 2 1 0 1 1 0 1 1 0]]
In [57]:
print(samploe_doc_dtm.toarray())
[[0 0 0 1 0 1 1 1 1 0 0 0 1 0 0] [0 2 0 1 0 1 1 1 0 0 0 0 1 0 0] [0 1 0 1 0 1 0 1 0 0 0 1 0 0 1] [1 0 1 1 1 1 2 1 0 1 1 0 1 1 0]]
In [58]:
print(cv.get_feature_names())
['also', 'another', 'as', 'document', 'first', 'in', 'is', 'line', 'one', 'present', 'same', 'third', 'this', 'which', 'yet']

Visualize the vector form of sparse matrix into dataframe

In [59]:
df = pd.DataFrame(samploe_doc_dtm.toarray(),index=sample_docs,columns=cv.get_feature_names())
df
Out[59]:
In [60]:
cv.vocabulary_ # To get the index of term in vocab
Out[60]:
{'this': 12,
 'is': 6,
 'line': 7,
 'in': 5,
 'one': 8,
 'document': 3,
 'another': 1,
 'yet': 14,
 'third': 11,
 'also': 0,
 'which': 13,
 'same': 10,
 'as': 2,
 'present': 9,
 'first': 4}
In [64]:
samploe_doc_dtm[0].toarray()
Out[64]:
array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0]], dtype=int64)

Calculate cosine similarity

In [72]:
cosine_similarity(samploe_doc_dtm,samploe_doc_dtm)
Out[72]:
array([[1.        , 0.68041382, 0.5       , 0.65465367],
       [0.68041382, 1.        , 0.68041382, 0.53452248],
       [0.5       , 0.68041382, 1.        , 0.32732684],
       [0.65465367, 0.53452248, 0.32732684, 1.        ]])

Visualize cosine similarity in the form of dataframe

In [70]:
df1 = pd.DataFrame(cosine_similarity(samploe_doc_dtm,samploe_doc_dtm),index=sample_docs,columns=sample_docs)
df1
Out[70]:

Visualize cosine similarity in the form of heatmap

In [121]:
ax = sns.heatmap(cosine_similarity(samploe_doc_dtm,samploe_doc_dtm),xticklabels=sample_docs,yticklabels=sample_docs,annot=True,square=True)
ax.set_ylim(4,0)
Out[121]:
(4, 0)
Notebook Image

example: to convert get dgree

In [113]:
np.arccos(1) # convert into radian
np.degrees(np.arccos(1)) # convert in to degree
Out[113]:
0.0
In [114]:
np.arccos(0) # convert into radian
np.degrees(np.arccos(0)) # convert in to degree
Out[114]:
90.0

get cosine Angle \(\theta\)

In [109]:
np.degrees(np.arccos(cosine_similarity(samploe_doc_dtm,samploe_doc_dtm)))
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in arccos """Entry point for launching an IPython kernel.
Out[109]:
array([[        nan, 47.12401133, 60.        , 49.10660535],
       [47.12401133,         nan, 47.12401133, 57.68846676],
       [60.        , 47.12401133,         nan, 70.89339465],
       [49.10660535, 57.68846676, 70.89339465,  0.        ]])