%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
#!pip install pytorch_transformers
import torch
import torch.nn as nn
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import BertConfig, BertTokenizer, BertModel
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import torch.nn.functional as F
import pandas as pd
path = './'
train = pd.read_csv(path+'train.csv')
train.head()
# for index, row in train.iterrows():
#     if row['location']:
#         row['text'] = row['text'] + str(row['location'])
X = list(train['text'])
y = list(train['target'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
X_train[0]
'Courageous and honest analysis of need to use Atomic Bomb in 1945. #Hiroshima70 Japanese military refused surrender. https://t.co/VhmtyTptGR'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def text2ids(text, max_seq_length=300):
    # WordPiece-tokenize, truncate to max_seq_length, map to vocabulary ids, then zero-pad
    tok_text = tokenizer.tokenize(text)
    if len(tok_text) > max_seq_length:
        tok_text = tok_text[:max_seq_length]
    ids_text = tokenizer.convert_tokens_to_ids(tok_text)
    padding = [0] * (max_seq_length - len(ids_text))
    ids_text += padding
    return np.array(ids_text)
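# Quick sanity check (optional): text2ids should always return a fixed-length vector of
# WordPiece ids, truncating long tweets and zero-padding short ones. Note that this simple
# scheme does not add the usual [CLS]/[SEP] special tokens.
sample_ids = text2ids(X_train[0])
print(len(sample_ids))    # 300 with the default max_seq_length
print(sample_ids[:10])    # first few token ids; trailing positions are 0-padding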
class Tweet_Dataset(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = text2ids(self.x[idx])
        return x, self.y[idx]
train_ds = Tweet_Dataset(X_train, y_train)
valid_ds = Tweet_Dataset(X_valid, y_valid)
class BertForSequenceClassification(nn.Module):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    """
    def __init__(self, num_labels=1):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # pooled representation of the first token position
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True
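# Note: the training and eval loops below call model(x) without an attention_mask, so BERT
# also attends to the zero-padding positions. A minimal sketch of how a mask could be derived
# from the padded ids (assuming, as in text2ids above, that id 0 only appears as padding):
def make_attention_mask(input_ids):
    # 1 for real tokens, 0 for padding positions
    return (input_ids != 0).long()
# usage in a training step would then be: logits = model(x, attention_mask=make_attention_mask(x))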
# These values match bert-base-uncased (vocab 30522, 12 layers, 12 heads); only hidden_size
# and hidden_dropout_prob are actually read by the classifier head above.
config = BertConfig(vocab_size_or_config_json_file=30522, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 1
model = BertForSequenceClassification(num_labels)
batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)
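# Quick shape check (optional): with batch_size=10 and max_seq_length=300, each batch of ids
# should be collated into a (10, 300) LongTensor with a matching vector of labels.
xb, yb = next(iter(train_dl))
print(xb.shape, xb.dtype, yb.shape)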
def train_model(model, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for x, y in train_dl:
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()  # shape (batch, 1) to match the single logit
            optimizer.zero_grad()
            logits = model(x)
            loss = F.binary_cross_entropy_with_logits(logits, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * x.size(0)
        epoch_loss = running_loss / len(train_ds)
        val_loss, accuracy = eval_model(model)
        print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(
            epoch_loss, val_loss, accuracy))
def eval_model(model):
    model.eval()
    running_loss = 0.0
    correct = 0
    with torch.no_grad():
        for x, y in valid_dl:
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()
            logits = model(x)
            loss = F.binary_cross_entropy_with_logits(logits, y)
            y_pred = logits > 0  # logit > 0 is equivalent to sigmoid(logit) > 0.5
            correct += (y_pred.float() == y).float().sum()
            running_loss += loss.item() * x.size(0)
    accuracy = correct / len(valid_ds)
    epoch_loss = running_loss / len(valid_ds)
    return epoch_loss, accuracy.item()
model = model.cuda()
lrlast = .0001
lrmain = .00001
optimizer = optim.Adam(
    [
        {"params": model.bert.parameters(), "lr": lrmain},
        {"params": model.classifier.parameters(), "lr": lrlast},
    ])
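# WarmupLinearSchedule is imported above but never used. If a warmup/linear-decay schedule is
# wanted, a minimal sketch could look like this (the warmup fraction is an illustrative
# assumption, not a value used in this notebook), with scheduler.step() called after each
# optimizer.step() in the training loop:
total_steps = len(train_dl) * 2  # num_epochs=2 below
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(0.1 * total_steps), t_total=total_steps)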
train_model(model, optimizer, num_epochs=2)
train loss: 0.502, valid loss 0.454 accuracy 0.799
train loss: 0.448, valid loss 0.491 accuracy 0.787
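# Optionally persist the fine-tuned weights before running prediction; 'bert_tweet.pt' is an
# illustrative filename, not one used elsewhere in this notebook.
torch.save(model.state_dict(), 'bert_tweet.pt')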
## Prediction
test = pd.read_csv('test.csv')
# for index, row in test.iterrows():
# if(row['location']):
# row['text'] = row['text'] + str(row['location'])
test_x = list(test['text'])
test_ds = Tweet_Dataset(test_x, test['id'])
test_dl = DataLoader(test_ds, batch_size=batch_size)
test_dl
<torch.utils.data.dataloader.DataLoader at 0x7ff6a9b6e710>
model.eval()
preds = []
with torch.no_grad():
    for x, y in test_dl:
        preds.append((model(x.cuda()) > 0).float())
flat_list = [item for sublist in preds for item in sublist]
final_preds = [int(x) for x in flat_list]
len(final_preds)
test['target'] = final_preds
test.head()
test[['id','target']].to_csv('pred.csv', index=False)
from collections import Counter
Counter(final_preds)
!pip install jovian
import jovian
jovian.commit()
[jovian] Saving notebook..