Jovian
⭐️
Sign In
In [1]:
import pandas as pd
import numpy as np
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')
In [2]:
def wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)
In [3]:
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
In [5]:
d = {'sentence1': [' I am going to India ', 'I will be eating coffee' ], 'sentence2': ['I am going to bharat', 'I will be drinking coffee']}
data = pd.DataFrame(data=d)
In [6]:
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
/home/shashankhalo7/.local/lib/python3.5/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
In [8]:
data['wmd'] = data.apply(lambda x: wmd(x['sentence1'], x['sentence2']), axis=1)
In [9]:
sentence1_vectors = np.zeros((data.shape[0], 300))
error_count = 0

#Word2Vec Features
for i, q in tqdm(enumerate(data.sentence1.values)):
    sentence1_vectors[i, :] = sent2vec(q)

sentence2_vectors  = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.sentence2.values)):
    sentence2_vectors[i, :] = sent2vec(q)
2it [00:00, 14.34it/s] 2it [00:00, 1488.93it/s]
In [10]:
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(sentence1_vectors), np.nan_to_num(sentence2_vectors))]
In [11]:
data
Out[11]: