#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
#input
x = torch.tensor([[1,2, 12,34, 56,78, 90,80],
                  [12,45, 99,67, 6,23, 77,82],
                  [3,24, 6,99, 12,56, 21,22]])
model1 = nn.Embedding(100, 7, padding_idx=0)
model2 = nn.LSTM(input_size=7, hidden_size=3, num_layers=1, batch_first=True)
out1 = model1(x)
out2 = model2(out1)
print(out1.shape)
print(out1)
torch.Size([3, 8, 7])
tensor([[[-0.6816, -0.3904, -0.0229, 1.2287, -0.4489, -0.1448, 0.1876],
[-0.1136, -0.2161, -0.6440, 1.7220, -1.0028, -0.4189, 1.4022],
[ 0.1740, -0.2212, 1.5228, -1.1506, -1.1710, 0.0406, -0.3912],
[ 1.9387, 0.3619, -0.4921, -0.6929, -0.6253, 1.1100, 0.8697],
[-1.1030, 0.5688, -0.2015, -1.0526, 2.9643, 1.2638, 1.9368],
[-0.3143, -0.8116, -0.1972, 0.9615, 0.8048, -0.2469, 1.0350],
[ 0.2626, -0.4890, -1.0185, 0.4583, 0.6501, -0.1358, 0.1586],
[-0.4704, 1.3602, 0.6796, 0.4018, -0.2171, 2.0806, -1.0199]],
[[ 0.1740, -0.2212, 1.5228, -1.1506, -1.1710, 0.0406, -0.3912],
[-0.9500, 0.7904, 0.0888, -1.0316, -0.7365, -0.8333, 0.6342],
[-1.0811, 0.2237, -0.4557, 0.4708, 0.8445, -1.0519, 0.0446],
[ 1.3037, -1.0439, -0.8036, 0.5445, 1.7022, 0.7845, 0.0318],
[ 0.8269, -0.6542, -0.3596, 1.8055, -0.8318, 0.6261, 0.2298],
[-0.2241, 0.2127, 0.1145, 0.1325, 0.3162, 0.4276, 0.5688],
[ 1.5142, 1.5675, 0.4787, 0.1893, 1.3999, 0.3825, 0.2888],
[ 0.2900, -1.8883, 0.1017, 0.7807, 2.0393, -0.2231, 0.7619]],
[[ 0.5877, 0.1631, -1.4762, 1.0529, -0.0842, 1.5817, 1.0293],
[ 1.1253, 1.9566, -0.8565, 0.0533, -1.3300, 0.4598, -0.6800],
[ 0.8269, -0.6542, -0.3596, 1.8055, -0.8318, 0.6261, 0.2298],
[-1.0811, 0.2237, -0.4557, 0.4708, 0.8445, -1.0519, 0.0446],
[ 0.1740, -0.2212, 1.5228, -1.1506, -1.1710, 0.0406, -0.3912],
[-1.1030, 0.5688, -0.2015, -1.0526, 2.9643, 1.2638, 1.9368],
[ 1.2083, 0.1338, 1.0553, -1.6460, -1.6378, -0.5144, -1.0399],
[-0.6197, -1.5587, 1.3634, 1.0663, 1.7736, -0.6517, -2.2486]]],
grad_fn=<EmbeddingBackward>)
out, (ht, ct) = model2(out1)
print(ht)
tensor([[[ 0.2788, 0.1724, 0.0143],
[-0.2855, 0.1841, 0.2382],
[-0.5367, 0.0766, 0.0969]]], grad_fn=<StackBackward>)
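Since this is a single-layer, unidirectional LSTM, the final hidden state ht[-1] is simply the output at the last time step, which is easy to verify:
#quick sanity check: last time step of out equals the final hidden state
print(torch.allclose(out[:, -1, :], ht[-1]))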
model3 = nn.Sequential(nn.Embedding(100, 7, padding_idx=0),
                       nn.LSTM(input_size=7, hidden_size=3, num_layers=1, batch_first=True))
out, (ht, ct) = model3(x)
print(out)
tensor([[[-0.2137, 0.0261, -0.1449],
[-0.3795, -0.0441, -0.0928],
[-0.3562, -0.0813, -0.0430],
[-0.3664, -0.0105, -0.2085],
[-0.5796, -0.0787, -0.3419],
[-0.4169, -0.1716, -0.3466],
[-0.4096, -0.0531, -0.5017],
[-0.6825, -0.0244, -0.3172]],
[[-0.1501, -0.0516, -0.0249],
[ 0.0922, -0.0493, -0.0280],
[ 0.1433, -0.1797, -0.0193],
[ 0.3654, -0.1216, -0.0491],
[ 0.4690, -0.1234, -0.0121],
[ 0.3195, -0.0079, -0.1690],
[ 0.2842, -0.0074, -0.2765],
[ 0.0661, -0.0064, -0.2403]],
[[-0.2451, -0.1169, -0.2873],
[-0.3352, -0.0299, -0.0743],
[-0.0965, -0.0810, -0.1175],
[ 0.0841, -0.2123, -0.0219],
[-0.0312, -0.2138, -0.0325],
[-0.2179, -0.1398, -0.2310],
[-0.1087, -0.0455, -0.0574],
[-0.1863, -0.1855, -0.1135]]], grad_fn=<TransposeBackward0>)
We are going to predict item ratings based on customer reviews, using this dataset from Kaggle: https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
#loading the data
reviews = pd.read_csv("reviews.csv")
print(reviews.shape)
reviews.head()
(23486, 11)
reviews['Title'] = reviews['Title'].fillna('')
reviews['Review Text'] = reviews['Review Text'].fillna('')
reviews['review'] = reviews['Title'] + ' ' + reviews['Review Text']
#keeping only relevant columns and calculating sentence lengths
reviews = reviews[['review', 'Rating']]
reviews.columns = ['review', 'rating']
reviews['review_length'] = reviews['review'].apply(lambda x: len(x.split()))
reviews.head()
#changing ratings to 0-numbering
zero_numbering = {1:0, 2:1, 3:2, 4:3, 5:4}
reviews['rating'] = reviews['rating'].apply(lambda x: zero_numbering[x])
#mean sentence length
np.mean(reviews['review_length'])
60.832921740611425
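The average review is roughly 61 tokens long. An optional percentile check like the one below gives a feel for how much text will be truncated by the fixed sequence length of 70 used later in encode_sentence:
#optional: distribution of review lengths
print(np.percentile(reviews['review_length'], [50, 75, 90, 95]))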
#tokenization
tok = spacy.load('en_core_web_sm')  # the 'en' shortcut was removed in spaCy 3
def tokenize(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # strip non-ASCII characters
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')  # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]
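A quick illustration of the tokenizer (the exact token list depends on spaCy's handling of the extra whitespace left behind by the substitutions):
#example: lowercased tokens with punctuation and digits removed
print(tokenize("Love this dress!! Fits perfectly, 10/10"))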
#count number of occurrences of each word
counts = Counter()
for index, row in reviews.iterrows():
    counts.update(tokenize(row['review']))
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))
num_words before: 14138
num_words after: 8263
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length
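As a small illustration, encode_sentence returns a fixed-length vector of word indices together with the number of tokens actually encoded (the index values depend on the vocabulary built above):
#example: fixed-length index vector plus the true token count
enc, length = encode_sentence("Love this dress", vocab2index)
print(length)
print(enc[:10])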
reviews['encoded'] = reviews['review'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))  # (index array, length) pairs
reviews.head()
#check how balanced the dataset is
Counter(reviews['rating'])
Counter({3: 5077, 4: 13131, 2: 2871, 1: 1565, 0: 842})
X = list(reviews['encoded'])
y = list(reviews['rating'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)
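Each dataset item is a (padded index tensor, rating, original length) triple, which can be checked quickly:
#each item: padded index tensor of length 70, rating label, original length
x0, y0, l0 = train_ds[0]
print(x0.shape, y0, l0)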
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))
def validation_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)
class LSTM_fixed_len(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
model_fixed = LSTM_fixed_len(vocab_size, 50, 50)
train_model(model_fixed, epochs=30, lr=0.01)
train loss 1.239, val loss 1.218, val accuracy 0.556, and val rmse 1.355
train loss 1.188, val loss 1.203, val accuracy 0.554, and val rmse 1.348
train loss 1.112, val loss 1.165, val accuracy 0.574, and val rmse 1.189
train loss 1.072, val loss 1.161, val accuracy 0.517, and val rmse 1.145
train loss 1.042, val loss 1.117, val accuracy 0.563, and val rmse 1.279
train loss 0.981, val loss 1.113, val accuracy 0.572, and val rmse 1.216
train_model(model_fixed, epochs=30, lr=0.01)
train loss 0.936, val loss 1.065, val accuracy 0.576, and val rmse 1.177
train loss 0.846, val loss 1.009, val accuracy 0.603, and val rmse 0.937
train loss 0.848, val loss 1.009, val accuracy 0.606, and val rmse 0.912
train loss 0.784, val loss 0.994, val accuracy 0.604, and val rmse 0.894
train loss 0.741, val loss 0.984, val accuracy 0.617, and val rmse 0.870
train loss 0.702, val loss 0.999, val accuracy 0.623, and val rmse 0.863
train_model(model_fixed, epochs=30, lr=0.01)
train loss 0.695, val loss 1.010, val accuracy 0.619, and val rmse 0.869
train loss 0.630, val loss 0.992, val accuracy 0.625, and val rmse 0.836
train loss 0.583, val loss 1.020, val accuracy 0.632, and val rmse 0.823
train loss 0.545, val loss 1.052, val accuracy 0.635, and val rmse 0.823
train loss 0.502, val loss 1.088, val accuracy 0.634, and val rmse 0.827
train loss 0.469, val loss 1.145, val accuracy 0.639, and val rmse 0.817
class LSTM_variable_input(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)

    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out
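pad_packed_sequence is imported at the top but never needed here, because only the final hidden state ht[-1] feeds the linear layer. If the per-time-step outputs were wanted (for pooling or attention, say), the packed output could be unpadded as in this small sketch, which reuses the toy tensor x from the beginning:
#sketch: pack variable-length sequences, run the LSTM, then unpad the output
emb = nn.Embedding(100, 7, padding_idx=0)
lstm = nn.LSTM(input_size=7, hidden_size=3, batch_first=True)
lengths = torch.tensor([8, 5, 3])  #assumed true lengths of the three toy sequences
packed = pack_padded_sequence(emb(x), lengths, batch_first=True, enforce_sorted=False)
packed_out, (h, c) = lstm(packed)
padded_out, out_lengths = pad_packed_sequence(packed_out, batch_first=True)
print(padded_out.shape)  #(3, 8, 3); positions past each sequence's length are zeros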
model = LSTM_variable_input(vocab_size, 50, 50)
train_model(model, epochs=30, lr=0.1)
train loss 1.328, val loss 1.250, val accuracy 0.515, and val rmse 1.312
train loss 1.031, val loss 1.063, val accuracy 0.577, and val rmse 1.017
train loss 0.904, val loss 0.995, val accuracy 0.603, and val rmse 0.941
train loss 0.849, val loss 1.000, val accuracy 0.599, and val rmse 0.940
train loss 0.845, val loss 1.009, val accuracy 0.598, and val rmse 0.921
train loss 0.834, val loss 1.005, val accuracy 0.593, and val rmse 0.902
train_model(model, epochs=30, lr=0.05)
train loss 0.828, val loss 1.000, val accuracy 0.599, and val rmse 0.920
train loss 0.790, val loss 0.989, val accuracy 0.605, and val rmse 0.894
train loss 0.775, val loss 0.992, val accuracy 0.614, and val rmse 0.884
train loss 0.755, val loss 0.994, val accuracy 0.597, and val rmse 0.883
train loss 0.738, val loss 0.987, val accuracy 0.608, and val rmse 0.872
train loss 0.741, val loss 1.005, val accuracy 0.611, and val rmse 0.888
train_model(model, epochs=30, lr=0.05)
train loss 0.758, val loss 1.028, val accuracy 0.616, and val rmse 0.884
train loss 0.725, val loss 0.994, val accuracy 0.621, and val rmse 0.877
train loss 0.715, val loss 0.999, val accuracy 0.607, and val rmse 0.881
train loss 0.707, val loss 1.008, val accuracy 0.608, and val rmse 0.879
train loss 0.698, val loss 1.018, val accuracy 0.615, and val rmse 0.890
train loss 0.686, val loss 1.017, val accuracy 0.603, and val rmse 0.893
Download the pretrained GloVe weights from https://nlp.stanford.edu/projects/glove/ (the 50-dimensional glove.6B vectors are used below).
def load_glove_vectors(glove_file="./data/glove.6B/glove.6B.50d.txt"):
    """Load the GloVe word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors
def get_emb_matrix(pretrained, word_counts, emb_size=50):
    """Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32')  # vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # vector for unknown words
    vocab_to_idx[""] = 0
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1
    return W, np.array(vocab), vocab_to_idx
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)
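A quick sanity check: the embedding matrix should have one row per retained word plus two extra rows for the padding and UNK tokens.
#expected shape: (len(counts) + 2, 50)
print(pretrained_weights.shape)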
class LSTM_glove_vecs(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False  ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
model = LSTM_glove_vecs(vocab_size, 50, 50, pretrained_weights)
train_model(model, epochs=30, lr=0.1)
train loss 1.281, val loss 1.255, val accuracy 0.556, and val rmse 1.355
train loss 1.210, val loss 1.207, val accuracy 0.556, and val rmse 1.354
train loss 1.206, val loss 1.204, val accuracy 0.556, and val rmse 1.354
train loss 1.201, val loss 1.202, val accuracy 0.556, and val rmse 1.354
train loss 1.173, val loss 1.168, val accuracy 0.557, and val rmse 1.352
train loss 1.131, val loss 1.122, val accuracy 0.562, and val rmse 1.249
train_model(model, epochs=30, lr=0.05)
train loss 1.112, val loss 1.113, val accuracy 0.556, and val rmse 1.349
train loss 1.061, val loss 1.051, val accuracy 0.570, and val rmse 1.109
train loss 1.014, val loss 1.014, val accuracy 0.582, and val rmse 1.058
train loss 0.979, val loss 0.990, val accuracy 0.599, and val rmse 0.995
train loss 0.948, val loss 0.961, val accuracy 0.610, and val rmse 0.950
train loss 0.923, val loss 0.952, val accuracy 0.612, and val rmse 0.935
train_model(model, epochs=30, lr=0.05)
train loss 1.189, val loss 1.014, val accuracy 0.586, and val rmse 1.033
train loss 0.946, val loss 0.964, val accuracy 0.606, and val rmse 0.950
train loss 0.912, val loss 0.951, val accuracy 0.612, and val rmse 0.941
train loss 0.895, val loss 0.949, val accuracy 0.615, and val rmse 0.913
train loss 0.886, val loss 0.947, val accuracy 0.617, and val rmse 0.901
train loss 0.872, val loss 0.938, val accuracy 0.621, and val rmse 0.890
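The GloVe embeddings stay frozen throughout the runs above. If desired, they could afterwards be unfrozen and fine-tuned with a smaller learning rate, reusing the same train_model helper (a sketch, not something done above):
#optional: unfreeze the embedding layer and fine-tune it
model.embeddings.weight.requires_grad = True
train_model(model, epochs=10, lr=0.005)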
def train_model_regr(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics_regr(model, val_dl)
        if i % 5 == 1:
            print("train mse %.3f val rmse %.3f" % (sum_loss/total, val_loss))
def validation_metrics_regr(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.float()
        y_hat = model(x, l)
        loss = np.sqrt(F.mse_loss(y_hat, y.unsqueeze(-1)).item())
        total += y.shape[0]
        sum_loss += loss*y.shape[0]
    return sum_loss/total
class LSTM_regr(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
model = LSTM_regr(vocab_size, 50, 50)
train_model_regr(model, epochs=30, lr=0.05)
train mse 1.663 val rmse 1.313
train mse 1.215 val rmse 1.125
train mse 1.151 val rmse 1.109
train mse 1.114 val rmse 1.115
train mse 1.082 val rmse 1.121
train mse 1.043 val rmse 1.116
train_model_regr(model, epochs=30, lr=0.05)
train mse 1.214 val rmse 1.193
train mse 0.884 val rmse 1.032
train mse 0.631 val rmse 0.903
train mse 0.483 val rmse 0.837
train mse 0.416 val rmse 0.806
train mse 0.363 val rmse 0.799
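To compare the regression model with the classifiers on accuracy, its continuous predictions can be mapped back to the 0-4 rating scale by rounding and clamping; the sketch below evaluates one validation batch this way:
#sketch: round regression outputs back to discrete ratings and measure accuracy
model.eval()
with torch.no_grad():
    xb, yb, lb = next(iter(val_dl))
    preds = model(xb.long(), lb).squeeze(-1)
    pred_classes = preds.round().clamp(0, 4).long()
    print((pred_classes == yb).float().mean().item())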
jovian.commit("lstm multiclass text classification, regression")
[jovian] Saving notebook..