Jovian
⭐️
Sign In
In [1]:
import spacy

# NOTE: `en_core_web_sm` ships without word vectors, so `Doc.similarity` falls
# back to the tagger/parser/NER tensors (spaCy warning W007). Treat the score
# as a rough signal only; a larger model with vectors gives better judgements.
nlp = spacy.load("en_core_web_sm")

# `nlp(...)` returns a Doc for each phrase; compare the two Docs directly.
doc_developer = nlp("software developer")
doc_engineer = nlp("junior software engineer")
print("similarity of two job roles:  ", doc_developer.similarity(doc_engineer))

similiraty two job roles: 0.8323356550138759
/usr/lib/python3.6/runpy.py:193: ModelsWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available. "__main__", mod_spec)
In [2]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')
In [3]:
# Location of the Online Retail workbook; change this one constant to point
# the notebook at a different copy of the data.
DATA_PATH = '/home/python/Downloads/Online Retail.xlsx'
df = pd.read_excel(DATA_PATH)
In [4]:

# Display the frame as loaded from the workbook.
df 
Out[4]:
In [5]:
# (rows, columns) of the frame before any cleaning.
df.shape
Out[5]:
(541909, 8)
In [6]:
# Count missing values per column to decide what needs cleaning.
df.isnull().sum()
Out[6]:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
In [7]:
# Summary statistics for a quick sanity check of the value ranges.
df.describe()
Out[7]:
In [8]:
# Remove every row that has at least one missing field (Description and
# CustomerID are the columns with gaps); the frame is modified in place.
df.dropna(axis=0, how='any', inplace=True)
In [9]:
# Re-check the per-column null counts; all should be zero after dropna.
df.isnull().sum()
Out[9]:
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64
In [10]:
# Give every stock code the same type (str).
# NOTE(review): presumably needed because the codes are later used as
# Word2Vec tokens and dictionary keys — confirm.
df['StockCode'] = df['StockCode'].astype(str)
In [11]:
# Distinct customer IDs, in order of first appearance, as a plain Python list.
customer_ids = df['CustomerID'].drop_duplicates()
customers = customer_ids.tolist()
len(customers)
Out[11]:
4372
In [12]:
# Hold out 10% of customers for validation and train on the remaining 90%.
SPLIT_SEED = 14
TRAIN_FRACTION = 0.9

# A dedicated, seeded generator makes the split reproducible across kernel
# restarts. `sample` returns a new shuffled list and leaves `customers`
# untouched, so re-running this cell always yields the same split.
shuffled_customers = random.Random(SPLIT_SEED).sample(customers, k=len(customers))

n_train = round(TRAIN_FRACTION * len(customers))
customers_train = shuffled_customers[:n_train]

In [13]:
# Rows belong to the training set when their customer was selected for
# training; every other row goes to validation.
in_train = df['CustomerID'].isin(customers_train)

train_df = df[in_train]
validation_df = df[~in_train]
In [14]:
# (rows, columns) of the training rows.
train_df.shape
Out[14]:
(365922, 8)
In [15]:
# (rows, columns) of the held-out validation rows.
validation_df.shape
Out[15]:
(40907, 8)
In [16]:
# One list of stock codes per training customer, in `customers_train` order.
# These lists are the "sentences" the Word2Vec model is trained on.
purchases_train = [
    train_df.loc[train_df['CustomerID'] == customer_id, 'StockCode'].tolist()
    for customer_id in tqdm(customers_train)
]
100%|██████████| 3935/3935 [00:04<00:00, 920.86it/s]
In [17]:
# One list of stock codes per validation customer, in order of first appearance.
val_customers = validation_df['CustomerID'].unique()
purchases_val = [
    validation_df.loc[validation_df['CustomerID'] == customer_id, 'StockCode'].tolist()
    for customer_id in tqdm(val_customers)
]
100%|██████████| 437/437 [00:00<00:00, 1390.71it/s]
In [18]:
# Hyperparameters gathered in one place so they are easy to read and tune.
w2v_params = dict(
    window=10,
    sg=1,
    hs=0,
    negative=10,  # for negative sampling
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)
model = Word2Vec(**w2v_params)

# The vocabulary is built from the purchase lists before training starts.
model.build_vocab(purchases_train, progress_per=200)

# Train on the same purchase lists; the returned tuple is shown as the cell output.
model.train(purchases_train, total_examples=model.corpus_count,
            epochs=10, report_delay=1)
Out[18]:
(3622793, 3659220)
In [19]:
# Called once training is finished.
# NOTE(review): per the gensim 3.x docs, replace=True keeps only the
# L2-normalised vectors, so the model cannot be trained further afterwards —
# confirm against the installed gensim version.
model.init_sims(replace=True)
In [20]:
# Print the model summary (vocabulary size, vector size, learning rate).
print(model)
Word2Vec(vocab=3171, size=100, alpha=0.03)
In [21]:
# Stack the vector of every product in the vocabulary into one matrix
# (one row per product). Index through `model.wv`: looking vectors up on the
# Word2Vec object itself is deprecated in gensim 3.x and returns the same values.
X = model.wv[model.wv.vocab]

X.shape
Out[21]:
(3171, 100)
In [22]:
import umap

# Project the product vectors down to 2-D so the embedding can be inspected visually.
cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
                              n_components=2, random_state=42).fit_transform(X)

fig, ax = plt.subplots(figsize=(10, 9))
# No per-point colour values (`c=`) are supplied, so a colormap would have
# nothing to map and is omitted.
ax.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3)
# The trailing semicolon keeps the artist repr out of the cell output.
ax.set(title="UMAP projection of product embeddings",
       xlabel="UMAP dimension 1", ylabel="UMAP dimension 2");
Out[22]:
<matplotlib.collections.PathCollection at 0x7f0a51389860>
Notebook Image
In [23]:
# Keep only the two columns needed to map a stock code to its description.
product_columns = ["StockCode", "Description"]
products = train_df.loc[:, product_columns]
In [24]:
# Display the StockCode / Description pairs taken from the training rows.
products
Out[24]:
In [25]:
# Keep one row per stock code (the last one seen). Reassigning the result,
# rather than dropping in place, avoids mutating a frame that was derived by
# selecting columns from train_df (the SettingWithCopy situation).
products = products.drop_duplicates(subset='StockCode', keep="last")

# create product-ID and product-description dictionary
# (each value is a one-element list holding the description)
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()
In [26]:
# Spot-check the mapping: the value is a one-element list with the description.
products_dict['84029E']
Out[26]:
['RED WOOLLY HOTTIE WHITE HEART.']
In [27]:
def similar_products(v, n = 10):
    """Return the `n` products whose embeddings are closest to vector `v`.

    Uses the module-level `model` (trained Word2Vec) and `products_dict`
    (stock code -> one-element list holding the description).

    Args:
        v: Query vector in the same space as the product embeddings.
        n: Number of similar products to return.

    Returns:
        A list of `(description, similarity)` tuples in the order returned
        by the model (best match first).

    Note:
        One extra neighbour is requested and the top hit is discarded, on the
        assumption that it is the query product itself. That holds when `v`
        is the vector of a product in the vocabulary; for any other vector
        the best match is dropped as well.
    """
    # Query through `model.wv`: calling `similar_by_vector` on the Word2Vec
    # object itself is deprecated in gensim 3.x.
    neighbours = model.wv.similar_by_vector(v, topn=n + 1)[1:]

    # Swap each stock code for its human-readable description.
    return [(products_dict[code][0], score) for code, score in neighbours]
In [28]:
# Example query: look up one product and list its nearest neighbours.
stock_code = '22745'
product = products_dict[stock_code]

print("This is product ",product)

print("======================================")
# Fetch the vector through `model.wv`; indexing the Word2Vec object directly
# is deprecated in gensim 3.x.
similar_products(model.wv[stock_code])


This is product ["POPPY'S PLAYHOUSE BEDROOM "] ======================================
Out[28]:
[("POPPY'S PLAYHOUSE LIVINGROOM ", 0.9615857005119324),
 ("POPPY'S PLAYHOUSE KITCHEN", 0.9577984809875488),
 ("POPPY'S PLAYHOUSE BATHROOM", 0.9260616898536682),
 ('PINK PARTY SUNGLASSES', 0.5578944087028503),
 ('LITTLE PINK MONSTER SOFT TOY', 0.5493927597999573),
 ('DINOSAUR HEIGHT CHART STICKER SET', 0.5345422029495239),
 ('MOUSEY LONG LEGS SOFT TOY', 0.5149549245834351),
 ('GLOW IN DARK DOLPHINS', 0.5077410340309143),
 ('MAKE YOUR OWN FLOWERPOWER CARD KIT', 0.504978597164154),
 ('WOODLAND  HEIGHT CHART STICKERS ', 0.4955698549747467)]
In [30]:
import jovian
In [ ]:
# Save a snapshot of this notebook to Jovian.
jovian.commit()
[jovian] Saving notebook..
In [ ]: