In [ ]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
#!pip install pytorch_transformers
In [ ]:
import torch
import torch.nn as nn
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import BertConfig, BertTokenizer, BertModel
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

from torch.utils.data import Dataset  # DataLoader is already imported above
In [ ]:
import numpy as np
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import torch.nn.functional as F
In [ ]:
import pandas as pd
In [ ]:
path = './'
In [ ]:
train = pd.read_csv(path+'train.csv')
In [ ]:
train.head()
Out[0]:
In [ ]:
# for index, row in train.iterrows():
#     if(row['location']):
#       row['text'] = row['text'] + str(row['location'])
In [ ]:
X = list(train['text'])
y = list(train['target'])
In [ ]:
from sklearn.model_selection import train_test_split
In [ ]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
In [ ]:
X_train[0]
Out[0]:
'Courageous and honest analysis of need to use Atomic Bomb in 1945. #Hiroshima70 Japanese military refused surrender. https://t.co/VhmtyTptGR'

Dataset

In [ ]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def text2ids(text, max_seq_length=300):
    # WordPiece-tokenize, truncate to max_seq_length, map tokens to vocabulary
    # ids, and zero-pad on the right to a fixed length.
    tok_text = tokenizer.tokenize(text)
    if len(tok_text) > max_seq_length:
        tok_text = tok_text[:max_seq_length]
    ids_text = tokenizer.convert_tokens_to_ids(tok_text)
    padding = [0] * (max_seq_length - len(ids_text))
    ids_text += padding
    return np.array(ids_text)
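As a quick sanity check (not part of the original run; the sample string below is made up), text2ids always returns a fixed-length id array:
In [ ]:
# Illustrative only: encode a short made-up string and confirm the shape.
sample_ids = text2ids("forest fire near the highway, stay safe everyone", max_seq_length=16)
print(sample_ids.shape)   # (16,)
print(sample_ids[:10])    # WordPiece ids, zero-padded on the right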
In [ ]:
class Tweet_Dataset(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        #print(self.x[idx])
        x = text2ids(self.x[idx])
        return x, self.y[idx]
    
train_ds = Tweet_Dataset(X_train, y_train)
valid_ds = Tweet_Dataset(X_valid, y_valid)
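A quick look at a single item (illustrative, not in the original run): x is the fixed-length array of token ids, y the 0/1 target.
In [ ]:
# Sanity-check one training example.
x0, y0 = train_ds[0]
print(x0.shape, y0)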

Model

In [ ]:
class BertForSequenceClassification(nn.Module):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    """
    def __init__(self, num_labels=1):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # `config` is defined in a later cell (before this class is instantiated);
        # only hidden_dropout_prob and hidden_size are read from it here.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        pooled_output = outputs[1]  # pooled [CLS] representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True
In [ ]:
# Only hidden_size and hidden_dropout_prob from this config are used (by the
# classifier head above); the encoder architecture itself comes from the
# pretrained bert-base-uncased checkpoint (12 layers, 12 attention heads).
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
In [ ]:
num_labels = 1
model = BertForSequenceClassification(num_labels)
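The freeze/unfreeze helpers defined on the model are never called below; a minimal sketch of their effect (illustrative only):
In [ ]:
# With the encoder frozen, only the classifier head would receive gradients.
model.freeze_bert_encoder()
head_only = sum(p.numel() for p in model.parameters() if p.requires_grad)
model.unfreeze_bert_encoder()
full_model = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('trainable params, encoder frozen:', head_only)
print('trainable params, full fine-tuning:', full_model)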

Training

In [ ]:
batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)
In [ ]:
def train_model(model, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for x, y in train_dl:
            #y = y.unsqueeze(1).float()
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()
            optimizer.zero_grad()
            logits = model(x)
            loss = F.binary_cross_entropy_with_logits(logits, y)            
            loss.backward()
            optimizer.step()
                
            running_loss += loss.item() * x.size(0)
        epoch_loss = running_loss / len(train_ds)
        val_loss, accuracy = eval_model(model)
        print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(
            epoch_loss, val_loss, accuracy))
In [ ]:
def eval_model(model):
    model.eval()
    running_loss = 0.0
    correct = 0
    with torch.no_grad():  # no gradients needed for validation
        for x, y in valid_dl:
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()
            logits = model(x)
            loss = F.binary_cross_entropy_with_logits(logits, y)
            y_pred = logits > 0  # logits > 0 is equivalent to sigmoid(logits) > 0.5
            correct += (y_pred.float() == y).float().sum()
            running_loss += loss.item() * x.size(0)
    accuracy = correct / len(valid_ds)
    epoch_loss = running_loss / len(valid_ds)
    return epoch_loss, accuracy.item()
In [ ]:
model = model.cuda()
In [ ]:
lrlast = .0001
lrmain = .00001
optimizer = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])
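AdamW and WarmupLinearSchedule are imported at the top but never used; a minimal sketch of how they could replace plain Adam with the same parameter groups (assumed setup, not part of the original run; scheduler.step() would also have to be called after each optimizer.step() inside train_model):
In [ ]:
# Sketch only: kept under separate names so it does not overwrite the optimizer
# actually used for training below.
t_total = len(train_dl) * 2  # total optimization steps for 2 epochs
alt_optimizer = AdamW(
    [
        {"params": model.bert.parameters(), "lr": lrmain},
        {"params": model.classifier.parameters(), "lr": lrlast},
    ])
alt_scheduler = WarmupLinearSchedule(alt_optimizer, warmup_steps=int(0.1 * t_total),
                                     t_total=t_total)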
In [ ]:
train_model(model, optimizer, num_epochs=2)
train loss: 0.502, valid loss 0.454 accuracy 0.799
train loss: 0.448, valid loss 0.491 accuracy 0.787

Prediction

In [ ]:
test = pd.read_csv('test.csv')
In [ ]:
# for index, row in test.iterrows():
#     if(row['location']):
#       row['text'] = row['text'] + str(row['location'])
In [ ]:
test_x = list(test['text'])
In [ ]:
# Tweet_Dataset is reused for the test set; the ids simply stand in for labels.
test_ds = Tweet_Dataset(test_x, test['id'])
In [ ]:
test_dl = DataLoader(test_ds, batch_size=batch_size)
In [ ]:
test_dl
Out[0]:
<torch.utils.data.dataloader.DataLoader at 0x7ff6a9b6e710>
In [ ]:
preds = []
model.eval()  # disable dropout for inference
with torch.no_grad():
    for x, y in test_dl:
        preds.append((model(x.cuda()) > 0).float())
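Thresholding the logits at 0 is the same decision rule as sigmoid(logits) > 0.5; to inspect the probabilities themselves (illustrative only):
In [ ]:
# Peek at predicted probabilities for one test batch.
with torch.no_grad():
    x_batch, _ = next(iter(test_dl))
    probs = torch.sigmoid(model(x_batch.cuda()))
print(probs[:5])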
In [ ]:
flat_list = [item for sublist in preds for item in sublist]
In [ ]:
final_preds = [int(x) for x in flat_list]
In [ ]:
len(final_preds)
In [ ]:
test['target'] = final_preds
In [ ]:
test.head()
In [ ]:
test[['id','target']].to_csv('pred.csv', index=False)
In [ ]:
from collections import Counter
Counter(final_preds)
In [ ]:
!pip install jovian
In [ ]:
import jovian
jovian.commit()
[jovian] Saving notebook..
In [ ]: