
LSTM in PyTorch

In [1]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

Basic LSTM in PyTorch with a toy batch of token indices

In [2]:
#input: a toy batch of 3 sequences, each with 8 token indices
x = torch.tensor([[1, 2, 12, 34, 56, 78, 90, 80],
                  [12, 45, 99, 67, 6, 23, 77, 82],
                  [3, 24, 6, 99, 12, 56, 21, 22]])
Using two separate models: an Embedding layer followed by an LSTM
In [3]:
model1 = nn.Embedding(100, 7, padding_idx=0)
model2 = nn.LSTM(input_size=7, hidden_size=3, num_layers=1, batch_first=True)
In [4]:
out1 = model1(x)     # embedded input: (batch, seq_len, embedding_dim)
out2 = model2(out1)  # nn.LSTM returns a tuple: (output, (h_n, c_n))
In [5]:
print(out1.shape)
print(out1)
torch.Size([3, 8, 7])
tensor([[[-0.6816, -0.3904, -0.0229, 1.2287, -0.4489, -0.1448, 0.1876],
         [-0.1136, -0.2161, -0.6440, 1.7220, -1.0028, -0.4189, 1.4022],
         [ 0.1740, -0.2212, 1.5228, -1.1506, -1.1710, 0.0406, -0.3912],
         [ 1.9387, 0.3619, -0.4921, -0.6929, -0.6253, 1.1100, 0.8697],
         [-1.1030, 0.5688, -0.2015, -1.0526, 2.9643, 1.2638, 1.9368],
         [-0.3143, -0.8116, -0.1972, 0.9615, 0.8048, -0.2469, 1.0350],
         [ 0.2626, -0.4890, -1.0185, 0.4583, 0.6501, -0.1358, 0.1586],
         [-0.4704, 1.3602, 0.6796, 0.4018, -0.2171, 2.0806, -1.0199]],

        [[ 0.1740, -0.2212, 1.5228, -1.1506, -1.1710, 0.0406, -0.3912],
         [-0.9500, 0.7904, 0.0888, -1.0316, -0.7365, -0.8333, 0.6342],
         [-1.0811, 0.2237, -0.4557, 0.4708, 0.8445, -1.0519, 0.0446],
         [ 1.3037, -1.0439, -0.8036, 0.5445, 1.7022, 0.7845, 0.0318],
         [ 0.8269, -0.6542, -0.3596, 1.8055, -0.8318, 0.6261, 0.2298],
         [-0.2241, 0.2127, 0.1145, 0.1325, 0.3162, 0.4276, 0.5688],
         [ 1.5142, 1.5675, 0.4787, 0.1893, 1.3999, 0.3825, 0.2888],
         [ 0.2900, -1.8883, 0.1017, 0.7807, 2.0393, -0.2231, 0.7619]],

        [[ 0.5877, 0.1631, -1.4762, 1.0529, -0.0842, 1.5817, 1.0293],
         [ 1.1253, 1.9566, -0.8565, 0.0533, -1.3300, 0.4598, -0.6800],
         [ 0.8269, -0.6542, -0.3596, 1.8055, -0.8318, 0.6261, 0.2298],
         [-1.0811, 0.2237, -0.4557, 0.4708, 0.8445, -1.0519, 0.0446],
         [ 0.1740, -0.2212, 1.5228, -1.1506, -1.1710, 0.0406, -0.3912],
         [-1.1030, 0.5688, -0.2015, -1.0526, 2.9643, 1.2638, 1.9368],
         [ 1.2083, 0.1338, 1.0553, -1.6460, -1.6378, -0.5144, -1.0399],
         [-0.6197, -1.5587, 1.3634, 1.0663, 1.7736, -0.6517, -2.2486]]], grad_fn=<EmbeddingBackward>)
In [6]:
out, (ht, ct) = model2(out1)
print(ht)
tensor([[[ 0.2788, 0.1724, 0.0143],
         [-0.2855, 0.1841, 0.2382],
         [-0.5367, 0.0766, 0.0969]]], grad_fn=<StackBackward>)
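The shapes follow PyTorch's conventions: with batch_first=True the LSTM output is (batch, seq_len, hidden_size), while the hidden and cell states are (num_layers, batch, hidden_size). A quick sanity check on the tensors computed above:

In [ ]:
print(out.shape)  # torch.Size([3, 8, 3]) -> (batch, seq_len, hidden_size)
print(ht.shape)   # torch.Size([1, 3, 3]) -> (num_layers, batch, hidden_size)
print(ct.shape)   # torch.Size([1, 3, 3]) -> (num_layers, batch, hidden_size)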
Using nn.Sequential
In [7]:
model3 = nn.Sequential(nn.Embedding(100, 7, padding_idx=0),
                        nn.LSTM(input_size=7, hidden_size=3, num_layers=1, batch_first=True))
In [8]:
out, (ht, ct) = model3(x)
print(out)
tensor([[[-0.2137, 0.0261, -0.1449],
         [-0.3795, -0.0441, -0.0928],
         [-0.3562, -0.0813, -0.0430],
         [-0.3664, -0.0105, -0.2085],
         [-0.5796, -0.0787, -0.3419],
         [-0.4169, -0.1716, -0.3466],
         [-0.4096, -0.0531, -0.5017],
         [-0.6825, -0.0244, -0.3172]],

        [[-0.1501, -0.0516, -0.0249],
         [ 0.0922, -0.0493, -0.0280],
         [ 0.1433, -0.1797, -0.0193],
         [ 0.3654, -0.1216, -0.0491],
         [ 0.4690, -0.1234, -0.0121],
         [ 0.3195, -0.0079, -0.1690],
         [ 0.2842, -0.0074, -0.2765],
         [ 0.0661, -0.0064, -0.2403]],

        [[-0.2451, -0.1169, -0.2873],
         [-0.3352, -0.0299, -0.0743],
         [-0.0965, -0.0810, -0.1175],
         [ 0.0841, -0.2123, -0.0219],
         [-0.0312, -0.2138, -0.0325],
         [-0.2179, -0.1398, -0.2310],
         [-0.1087, -0.0455, -0.0574],
         [-0.1863, -0.1855, -0.1135]]], grad_fn=<TransposeBackward0>)
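Note that nn.Sequential only works here because the LSTM is the last module: nn.LSTM returns a tuple (output, (h_n, c_n)), so stacking a Linear layer directly after it inside nn.Sequential would fail. A small wrapper module is the usual workaround; the sketch below (not used later in this notebook) shows the idea with the same toy sizes:

In [ ]:
class EmbLSTMClassifier(nn.Module):
    """Hypothetical wrapper: Embedding -> LSTM -> Linear, unpacking the LSTM tuple in between."""
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(100, 7, padding_idx=0)
        self.lstm = nn.LSTM(input_size=7, hidden_size=3, num_layers=1, batch_first=True)
        self.linear = nn.Linear(3, 2)

    def forward(self, x):
        out, (ht, ct) = self.lstm(self.emb(x))
        return self.linear(ht[-1])  # last layer's final hidden state: (batch, hidden_size)

print(EmbLSTMClassifier()(x).shape)  # torch.Size([3, 2])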

Multiclass Text Classification

We are going to predict item ratings from customer reviews, based on this dataset from Kaggle: https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews

In [9]:
#loading the data
reviews = pd.read_csv("reviews.csv")
print(reviews.shape)
reviews.head()
(23486, 11)
Out[9]:
In [10]:
reviews['Title'] = reviews['Title'].fillna('')
reviews['Review Text'] = reviews['Review Text'].fillna('')
reviews['review'] = reviews['Title'] + ' ' + reviews['Review Text']
In [11]:
#keeping only relevant columns and calculating sentence lengths
reviews = reviews[['review', 'Rating']]
reviews.columns = ['review', 'rating']
reviews['review_length'] = reviews['review'].apply(lambda x: len(x.split()))
reviews.head()
Out[11]:
In [12]:
#changing ratings from 1-5 to a zero-based 0-4 scale
zero_numbering = {1:0, 2:1, 3:2, 4:3, 5:4}
reviews['rating'] = reviews['rating'].apply(lambda x: zero_numbering[x])
In [13]:
#mean review length (in words)
np.mean(reviews['review_length'])
Out[13]:
60.832921740611425
In [14]:
#tokenization
tok = spacy.load('en')  # note: newer spaCy versions use spacy.load('en_core_web_sm') instead of the 'en' shortcut
def tokenize(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # replace non-ASCII characters with spaces
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]
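A quick check of the tokenizer (the sample review text is made up for illustration):

In [ ]:
#punctuation and digits are replaced with spaces before spaCy tokenizes
print(tokenize("Love this dress! Fits true to size."))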
In [15]:
#count the number of occurrences of each word
counts = Counter()
for index, row in reviews.iterrows():
    counts.update(tokenize(row['review']))
In [16]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))
num_words before: 14138
num_words after: 8263
In [17]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
In [18]:
def encode_sentence(text, vocab2index, N=70):  # pad/truncate every review to N tokens (mean review length is ~61 words)
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length
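A quick look at what the encoder returns (again with a made-up review; the exact indices depend on the vocabulary built above):

In [ ]:
enc, length = encode_sentence("Love this dress, fits perfectly", vocab2index)
print(length)    # number of tokens kept before padding
print(enc[:10])  # first few word indices; the rest of the length-70 array stays 0 (padding)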
In [19]:
#each encoded entry is a (padded index array, length) pair; dtype=object keeps numpy from trying to broadcast it
reviews['encoded'] = reviews['review'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))
reviews.head()
Out[19]:
In [20]:
#check how balanced the dataset is
Counter(reviews['rating'])
Out[20]:
Counter({3: 5077, 4: 13131, 2: 2871, 1: 1565, 0: 842})
In [21]:
X = list(reviews['encoded'])
y = list(reviews['rating'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
PyTorch Dataset
In [22]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]
In [23]:
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)
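Each item comes back as a (padded index tensor, rating, true length) triple, which the training loop below unpacks as x, y, l:

In [ ]:
x0, y0, l0 = train_ds[0]
print(x0.shape, y0, l0)  # torch.Size([70]), the 0-4 rating, and the unpadded review length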
In [24]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total
In [25]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)
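Each batch from the DataLoader stacks those triples, so the models receive x of shape (batch_size, 70) together with the ratings and lengths:

In [ ]:
xb, yb, lb = next(iter(train_dl))
print(xb.shape, yb.shape, lb.shape)  # (batch_size, 70), (batch_size,), (batch_size,)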

LSTM with fixed length input

In [26]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
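In the forward pass, ht[-1] is the final hidden state of the last LSTM layer; for a single-layer, unidirectional LSTM it equals the last time step of lstm_out. A quick check with an untrained instance:

In [ ]:
m = LSTM_fixed_len(vocab_size, 50, 50)
m.eval()
with torch.no_grad():
    emb = m.embeddings(torch.randint(1, vocab_size, (2, 70)))
    lstm_out, (ht, ct) = m.lstm(emb)
print(torch.allclose(ht[-1], lstm_out[:, -1, :]))  # True for a single-layer, unidirectional LSTM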
In [27]:
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)
In [28]:
train_model(model_fixed, epochs=30, lr=0.01)
train loss 1.239, val loss 1.218, val accuracy 0.556, and val rmse 1.355
train loss 1.188, val loss 1.203, val accuracy 0.554, and val rmse 1.348
train loss 1.112, val loss 1.165, val accuracy 0.574, and val rmse 1.189
train loss 1.072, val loss 1.161, val accuracy 0.517, and val rmse 1.145
train loss 1.042, val loss 1.117, val accuracy 0.563, and val rmse 1.279
train loss 0.981, val loss 1.113, val accuracy 0.572, and val rmse 1.216
In [29]:
train_model(model_fixed, epochs=30, lr=0.01)
train loss 0.936, val loss 1.065, val accuracy 0.576, and val rmse 1.177
train loss 0.846, val loss 1.009, val accuracy 0.603, and val rmse 0.937
train loss 0.848, val loss 1.009, val accuracy 0.606, and val rmse 0.912
train loss 0.784, val loss 0.994, val accuracy 0.604, and val rmse 0.894
train loss 0.741, val loss 0.984, val accuracy 0.617, and val rmse 0.870
train loss 0.702, val loss 0.999, val accuracy 0.623, and val rmse 0.863
In [30]:
train_model(model_fixed, epochs=30, lr=0.01)
train loss 0.695, val loss 1.010, val accuracy 0.619, and val rmse 0.869
train loss 0.630, val loss 0.992, val accuracy 0.625, and val rmse 0.836
train loss 0.583, val loss 1.020, val accuracy 0.632, and val rmse 0.823
train loss 0.545, val loss 1.052, val accuracy 0.635, and val rmse 0.823
train loss 0.502, val loss 1.088, val accuracy 0.634, and val rmse 0.827
train loss 0.469, val loss 1.145, val accuracy 0.639, and val rmse 0.817

LSTM with variable length input

In [31]:
class LSTM_variable_input(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out
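Packing makes the LSTM stop at each sequence's true length, so trailing padding no longer leaks into the final hidden state. A small check with toy tensors (the toy indices are arbitrary):

In [ ]:
toy = torch.tensor([[5, 7, 9, 0],
                    [4, 2, 0, 0]])  # two toy "reviews" padded to length 4
lengths = torch.tensor([3, 2])      # their true lengths
m = LSTM_variable_input(vocab_size, 50, 50)
m.eval()  # disable dropout so the comparison is exact
with torch.no_grad():
    print(torch.allclose(m(toy, lengths), m(toy[:, :3], lengths)))  # True: trailing padding does not change the output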
In [32]:
model = LSTM_variable_input(vocab_size, 50, 50)
In [33]:
train_model(model, epochs=30, lr=0.1)
train loss 1.328, val loss 1.250, val accuracy 0.515, and val rmse 1.312
train loss 1.031, val loss 1.063, val accuracy 0.577, and val rmse 1.017
train loss 0.904, val loss 0.995, val accuracy 0.603, and val rmse 0.941
train loss 0.849, val loss 1.000, val accuracy 0.599, and val rmse 0.940
train loss 0.845, val loss 1.009, val accuracy 0.598, and val rmse 0.921
train loss 0.834, val loss 1.005, val accuracy 0.593, and val rmse 0.902
In [34]:
train_model(model, epochs=30, lr=0.05)
train loss 0.828, val loss 1.000, val accuracy 0.599, and val rmse 0.920
train loss 0.790, val loss 0.989, val accuracy 0.605, and val rmse 0.894
train loss 0.775, val loss 0.992, val accuracy 0.614, and val rmse 0.884
train loss 0.755, val loss 0.994, val accuracy 0.597, and val rmse 0.883
train loss 0.738, val loss 0.987, val accuracy 0.608, and val rmse 0.872
train loss 0.741, val loss 1.005, val accuracy 0.611, and val rmse 0.888
In [35]:
train_model(model, epochs=30, lr=0.05)
train loss 0.758, val loss 1.028, val accuracy 0.616, and val rmse 0.884
train loss 0.725, val loss 0.994, val accuracy 0.621, and val rmse 0.877
train loss 0.715, val loss 0.999, val accuracy 0.607, and val rmse 0.881
train loss 0.707, val loss 1.008, val accuracy 0.608, and val rmse 0.879
train loss 0.698, val loss 1.018, val accuracy 0.615, and val rmse 0.890
train loss 0.686, val loss 1.017, val accuracy 0.603, and val rmse 0.893

LSTM with pretrained GloVe word embeddings

Download the weights from: https://nlp.stanford.edu/projects/glove/

In [36]:
def load_glove_vectors(glove_file="./data/glove.6B/glove.6B.50d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors
In [37]:
def get_emb_matrix(pretrained, word_counts, emb_size=50):
    """ Creates the embedding matrix from pretrained word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # vector for unknown words
    i = 2
    for word in word_counts:
        if word in pretrained:
            W[i] = pretrained[word]  # use the passed-in pretrained vectors rather than a global
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1
    return W, np.array(vocab), vocab_to_idx
In [38]:
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)
In [39]:
class LSTM_glove_vecs(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
In [40]:
model = LSTM_glove_vecs(vocab_size, 50, 50, pretrained_weights)
In [41]:
train_model(model, epochs=30, lr=0.1)
train loss 1.281, val loss 1.255, val accuracy 0.556, and val rmse 1.355
train loss 1.210, val loss 1.207, val accuracy 0.556, and val rmse 1.354
train loss 1.206, val loss 1.204, val accuracy 0.556, and val rmse 1.354
train loss 1.201, val loss 1.202, val accuracy 0.556, and val rmse 1.354
train loss 1.173, val loss 1.168, val accuracy 0.557, and val rmse 1.352
train loss 1.131, val loss 1.122, val accuracy 0.562, and val rmse 1.249
In [42]:
train_model(model, epochs=30, lr=0.05)
train loss 1.112, val loss 1.113, val accuracy 0.556, and val rmse 1.349
train loss 1.061, val loss 1.051, val accuracy 0.570, and val rmse 1.109
train loss 1.014, val loss 1.014, val accuracy 0.582, and val rmse 1.058
train loss 0.979, val loss 0.990, val accuracy 0.599, and val rmse 0.995
train loss 0.948, val loss 0.961, val accuracy 0.610, and val rmse 0.950
train loss 0.923, val loss 0.952, val accuracy 0.612, and val rmse 0.935
In [43]:
train_model(model, epochs=30, lr=0.05)
train loss 1.189, val loss 1.014, val accuracy 0.586, and val rmse 1.033
train loss 0.946, val loss 0.964, val accuracy 0.606, and val rmse 0.950
train loss 0.912, val loss 0.951, val accuracy 0.612, and val rmse 0.941
train loss 0.895, val loss 0.949, val accuracy 0.615, and val rmse 0.913
train loss 0.886, val loss 0.947, val accuracy 0.617, and val rmse 0.901
train loss 0.872, val loss 0.938, val accuracy 0.621, and val rmse 0.890
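To sanity-check the classifier on a single review, the same preprocessing can be reused. The helper below is a sketch (predict_rating is not part of the original pipeline and the review text is made up):

In [ ]:
def predict_rating(model, text):
    #encode one review and return a 1-5 rating
    enc, length = encode_sentence(text, vocab2index)
    x = torch.from_numpy(enc.astype(np.int32)).long().unsqueeze(0)  # shape (1, 70)
    model.eval()
    with torch.no_grad():
        y_hat = model(x, [length])
    return torch.max(y_hat, 1)[1].item() + 1  # shift back from 0-4 to 1-5

predict_rating(model, "Runs small but the fabric is lovely")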

Predicting ratings using regression instead of classification

In [44]:
def train_model_regr(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics_regr(model, val_dl)
        if i % 5 == 1:
            print("train mse %.3f val rmse %.3f" % (sum_loss/total, val_loss))

def validation_metrics_regr(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.float()
        y_hat = model(x, l)
        rmse = np.sqrt(F.mse_loss(y_hat, y.unsqueeze(-1)).item())
        total += y.shape[0]
        sum_loss += rmse*y.shape[0]
    return sum_loss/total
In [45]:
class LSTM_regr(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
In [46]:
model =  LSTM_regr(vocab_size, 50, 50)
In [47]:
train_model_regr(model, epochs=30, lr=0.05)
train mse 1.663 val rmse 1.313
train mse 1.215 val rmse 1.125
train mse 1.151 val rmse 1.109
train mse 1.114 val rmse 1.115
train mse 1.082 val rmse 1.121
train mse 1.043 val rmse 1.116
In [48]:
train_model_regr(model, epochs=30, lr=0.05)
train mse 1.214 val rmse 1.193
train mse 0.884 val rmse 1.032
train mse 0.631 val rmse 0.903
train mse 0.483 val rmse 0.837
train mse 0.416 val rmse 0.806
train mse 0.363 val rmse 0.799
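Since the regression model outputs a continuous score on the 0-4 scale, predictions would typically be rounded and clamped back to ratings at inference time. A sketch using one batch from the validation loader:

In [ ]:
xb, yb, lb = next(iter(val_dl))
model.eval()
with torch.no_grad():
    preds = model(xb.long(), lb).squeeze(-1)
ratings = (torch.clamp(torch.round(preds), 0, 4) + 1).long()  # map back to the original 1-5 scale
print(ratings[:10])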
In [ ]:
jovian.commit("lstm multiclass text classification, regression")
[jovian] Saving notebook..