!nvidia-smi
Tue Sep 17 22:35:12 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla T4 On | 00000000:00:04.0 Off | 0 |
| N/A 69C P8 11W / 70W | 0MiB / 15079MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
import os
import cv2
import collections
import time
import tqdm
from PIL import Image
from functools import partial
train_on_gpu = True
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torchvision
import torchvision.transforms as transforms
import torch
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
import albumentations as albu
from albumentations import torch as AT
from catalyst.data import Augmentor
from catalyst.dl import utils
from catalyst.data.reader import ImageReader, ScalarReader, ReaderCompose, LambdaReader
from catalyst.dl.runner import SupervisedRunner
from catalyst.contrib.models.segmentation import Unet
from catalyst.dl.callbacks import DiceCallback, EarlyStoppingCallback, InferCallback, CheckpointCallback, OptimizerCallback, CriterionCallback
import segmentation_models_pytorch as smp
import jovian
def get_img(x, folder: str='train_images'):
"""
Return image based on image name and folder.
"""
data_folder = f"{path}/{folder}"
image_path = os.path.join(data_folder, x)
img = cv2.imread(image_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return img
def rle_decode(mask_rle: str = '', shape: tuple = (1400, 2100)):
    '''
    Decode an RLE-encoded mask.
    :param mask_rle: run-length encoding as a string of space-separated (start, length) pairs
    :param shape: (height, width) of the array to return
    Returns a numpy array: 1 - mask, 0 - background
    '''
s = mask_rle.split()
starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
starts -= 1
ends = starts + lengths
img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
for lo, hi in zip(starts, ends):
img[lo:hi] = 1
return img.reshape(shape, order='F')
def make_mask(df: pd.DataFrame, image_name: str='img.jpg', shape: tuple = (1400, 2100)):
"""
Create mask based on df, image name and shape.
"""
encoded_masks = df.loc[df['im_id'] == image_name, 'EncodedPixels']
masks = np.zeros((shape[0], shape[1], 4), dtype=np.float32)
for idx, label in enumerate(encoded_masks.values):
        if not pd.isnull(label):  # identity comparison with np.nan is fragile
mask = rle_decode(label)
masks[:, :, idx] = mask
return masks
def to_tensor(x, **kwargs):
"""
Convert image or mask.
"""
return x.transpose(2, 0, 1).astype('float32')
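A toy check (synthetic array, not competition data) makes the HWC-to-CHW conversion concrete:
dummy = np.zeros((320, 640, 3), dtype=np.uint8)
print(to_tensor(dummy).shape)  # (3, 320, 640), dtype float32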
def mask2rle(img):
    '''
    Convert a mask to RLE.
    img: numpy array, 1 - mask, 0 - background
    Returns the run-length encoding as a formatted string
    '''
pixels= img.T.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return ' '.join(str(x) for x in runs)
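A small round-trip check on a synthetic mask (toy data, using only the two helpers above) confirms that mask2rle and rle_decode are inverses:
toy_mask = np.zeros((1400, 2100), dtype=np.uint8)
toy_mask[100:200, 50:60] = 1  # a small rectangle
assert (rle_decode(mask2rle(toy_mask)) == toy_mask).all()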
def visualize(image, mask, original_image=None, original_mask=None):
"""
    Plot image and masks.
    If two pairs of images and masks are passed, show both.
"""
fontsize = 14
class_dict = {0: 'Fish', 1: 'Flower', 2: 'Gravel', 3: 'Sugar'}
if original_image is None and original_mask is None:
f, ax = plt.subplots(1, 5, figsize=(24, 24))
ax[0].imshow(image)
for i in range(4):
ax[i + 1].imshow(mask[:, :, i])
ax[i + 1].set_title(f'Mask {class_dict[i]}', fontsize=fontsize)
else:
f, ax = plt.subplots(2, 5, figsize=(24, 12))
ax[0, 0].imshow(original_image)
ax[0, 0].set_title('Original image', fontsize=fontsize)
for i in range(4):
ax[0, i + 1].imshow(original_mask[:, :, i])
ax[0, i + 1].set_title(f'Original mask {class_dict[i]}', fontsize=fontsize)
ax[1, 0].imshow(image)
ax[1, 0].set_title('Transformed image', fontsize=fontsize)
for i in range(4):
ax[1, i + 1].imshow(mask[:, :, i])
ax[1, i + 1].set_title(f'Transformed mask {class_dict[i]}', fontsize=fontsize)
def visualize_with_raw(image, mask, original_image=None, original_mask=None, raw_image=None, raw_mask=None):
"""
    Plot image and masks.
    If two pairs of images and masks are passed, show both.
"""
fontsize = 14
class_dict = {0: 'Fish', 1: 'Flower', 2: 'Gravel', 3: 'Sugar'}
f, ax = plt.subplots(3, 5, figsize=(24, 12))
ax[0, 0].imshow(original_image)
ax[0, 0].set_title('Original image', fontsize=fontsize)
for i in range(4):
ax[0, i + 1].imshow(original_mask[:, :, i])
ax[0, i + 1].set_title(f'Original mask {class_dict[i]}', fontsize=fontsize)
ax[1, 0].imshow(raw_image)
    ax[1, 0].set_title('Raw image', fontsize=fontsize)
for i in range(4):
ax[1, i + 1].imshow(raw_mask[:, :, i])
ax[1, i + 1].set_title(f'Raw predicted mask {class_dict[i]}', fontsize=fontsize)
ax[2, 0].imshow(image)
ax[2, 0].set_title('Transformed image', fontsize=fontsize)
for i in range(4):
ax[2, i + 1].imshow(mask[:, :, i])
ax[2, i + 1].set_title(f'Predicted mask with processing {class_dict[i]}', fontsize=fontsize)
def plot_with_augmentation(image, mask, augment):
"""
Wrapper for `visualize` function.
"""
augmented = augment(image=image, mask=mask)
image_flipped = augmented['image']
mask_flipped = augmented['mask']
visualize(image_flipped, mask_flipped, original_image=image, original_mask=mask)
sigmoid = lambda x: 1 / (1 + np.exp(-x))
def post_process(probability, threshold, min_size):
"""
Post processing of each predicted mask, components with lesser number of pixels
than `min_size` are ignored
"""
# don't remember where I saw it
mask = cv2.threshold(probability, threshold, 1, cv2.THRESH_BINARY)[1]
num_component, component = cv2.connectedComponents(mask.astype(np.uint8))
predictions = np.zeros((350, 525), np.float32)
num = 0
for c in range(1, num_component):
p = (component == c)
if p.sum() > min_size:
predictions[p] = 1
num += 1
return predictions, num
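For illustration only (synthetic probabilities, hypothetical threshold and min_size values), a component survives post_process only if it passes both the threshold and the size filter:
fake_prob = np.zeros((350, 525), dtype=np.float32)
fake_prob[50:150, 50:150] = 0.9    # 10000-pixel blob: kept at min_size=100
fake_prob[200:205, 200:205] = 0.9  # 25-pixel blob: discarded
pred, num = post_process(fake_prob, 0.5, 100)
print(num)  # 1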
def get_training_augmentation():
train_transform = [
albu.Resize(320, 640),
albu.Flip(p=0.4),
albu.ShiftScaleRotate(scale_limit=0.4, rotate_limit=0, shift_limit=0.1, p=0.5, border_mode=0),
albu.GridDistortion(p=0.4),
albu.OpticalDistortion(p=0.4, distort_limit=2, shift_limit=0.4),
albu.RandomBrightnessContrast(contrast_limit=.3)
]
return albu.Compose(train_transform)
def get_validation_augmentation():
"""Add paddings to make image shape divisible by 32"""
test_transform = [
albu.Resize(320, 640)
]
return albu.Compose(test_transform)
def get_preprocessing(preprocessing_fn):
"""Construct preprocessing transform
Args:
        preprocessing_fn (callable): data normalization function
(can be specific for each pretrained neural network)
Return:
transform: albumentations.Compose
"""
_transform = [
albu.Lambda(image=preprocessing_fn),
albu.Lambda(image=to_tensor, mask=to_tensor),
]
return albu.Compose(_transform)
def dice(img1, img2):
    """Dice coefficient between two binary masks."""
    img1 = np.asarray(img1).astype(bool)
    img2 = np.asarray(img2).astype(bool)
    intersection = np.logical_and(img1, img2)
    return 2. * intersection.sum() / (img1.sum() + img2.sum())
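On toy masks the metric behaves as expected; two 8-pixel masks sharing 4 pixels give 2 * 4 / (8 + 8) = 0.5:
a = np.zeros((4, 4)); a[:2, :] = 1
b = np.zeros((4, 4)); b[1:3, :] = 1
print(dice(a, b))  # 0.5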
Let's have a look at the data first.
path = '../data'
os.listdir(path)
['train.csv', 'test_images', 'train_images', 'sample_submission.csv']
We have folders with train and test images, a file with train image ids and run-length encoded masks, and a sample submission.
train = pd.read_csv(f'{path}/train.csv')
sub = pd.read_csv(f'{path}/sample_submission.csv')
train['label'] = train['Image_Label'].apply(lambda x: x.split('_')[1])
train['im_id'] = train['Image_Label'].apply(lambda x: x.split('_')[0])
sub['label'] = sub['Image_Label'].apply(lambda x: x.split('_')[1])
sub['im_id'] = sub['Image_Label'].apply(lambda x: x.split('_')[0])
First, let's create a list of unique image ids with the number of masks per image. This lets us make a stratified train/validation split based on that count.
id_mask_count = train.loc[train['EncodedPixels'].isnull() == False, 'Image_Label'].apply(lambda x: x.split('_')[0]).value_counts().\
reset_index().rename(columns={'index': 'img_id', 'Image_Label': 'count'})
train_ids, valid_ids = train_test_split(id_mask_count['img_id'].values, random_state=42, stratify=id_mask_count['count'], test_size=0.1)
test_ids = sub['Image_Label'].apply(lambda x: x.split('_')[0]).drop_duplicates().values
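As a quick sanity check (a sketch; exact counts depend on the data version), we can look at the split sizes and the distribution the split was stratified on:
print(f'train: {len(train_ids)}, valid: {len(valid_ids)}, test: {len(test_ids)}')
print(id_mask_count['count'].value_counts(normalize=True))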
class CloudDataset(Dataset):
    def __init__(self, df: pd.DataFrame = None, datatype: str = 'train', img_ids: np.ndarray = None,
transforms = albu.Compose([albu.HorizontalFlip(),AT.ToTensor()]),
preprocessing=None):
self.df = df
if datatype != 'test':
self.data_folder = f"{path}/train_images"
else:
self.data_folder = f"{path}/test_images"
self.img_ids = img_ids
self.transforms = transforms
self.preprocessing = preprocessing
def __getitem__(self, idx):
image_name = self.img_ids[idx]
mask = make_mask(self.df, image_name)
image_path = os.path.join(self.data_folder, image_name)
img = cv2.imread(image_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
augmented = self.transforms(image=img, mask=mask)
img = augmented['image']
mask = augmented['mask']
if self.preprocessing:
preprocessed = self.preprocessing(image=img, mask=mask)
img = preprocessed['image']
mask = preprocessed['mask']
return img, mask
def __len__(self):
return len(self.img_ids)
Now we define the model and training parameters.
ENCODER = 'densenet169'
ENCODER_WEIGHTS = 'imagenet'
DEVICE = 'cuda'
ACTIVATION = None
model = smp.Unet(
encoder_name=ENCODER,
encoder_weights=ENCODER_WEIGHTS,
classes=4,
activation=ACTIVATION,
)
preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS)
num_workers = 0
bs = 8
train_dataset = CloudDataset(df=train, datatype='train', img_ids=train_ids, transforms = get_training_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
valid_dataset = CloudDataset(df=train, datatype='valid', img_ids=valid_ids, transforms = get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False, num_workers=num_workers)
loaders = {
"train": train_loader,
"valid": valid_loader
}
/home/prajwal/anaconda3/envs/pytorch/lib/python3.7/site-packages/albumentations/augmentations/transforms.py:1734: UserWarning:
Using lambda is incompatible with multiprocessing. Consider using regular functions or partial().
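Before training, it is worth peeking at one batch (a sketch; the expected shapes follow from Resize(320, 640), three input channels and four mask channels):
batch_img, batch_mask = next(iter(train_loader))
print(batch_img.shape, batch_mask.shape)  # expected: (8, 3, 320, 640) and (8, 4, 320, 640)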
num_epochs = 3
logdir = "./logs/d169"
# model, criterion, optimizer
optimizer = torch.optim.Adam([
{'params': model.decoder.parameters(), 'lr': 3e-2},
{'params': model.encoder.parameters(), 'lr': 3e-3},
])
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
runner.train(
model=model,
criterion=criterion,
optimizer=optimizer,
scheduler=scheduler,
loaders=loaders,
callbacks=[DiceCallback(),
EarlyStoppingCallback(patience=5, min_delta=0.001)],
logdir=logdir,
num_epochs=num_epochs,
fp16=True,
verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
0/3 * Epoch (train): 100% 624/624 [13:35<00:00, 1.31s/it, _timers/_fps=10.021, dice=0.357, loss=1.037]
0/3 * Epoch (valid): 100% 70/70 [00:59<00:00, 1.18it/s, _timers/_fps=27.277, dice=0.506, loss=1.126]
[2019-09-16 23:07:03,018]
0/3 * Epoch 0 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=9.4488 | _timers/batch_time=0.8522 | _timers/data_time=0.7871 | _timers/model_time=0.0651 | dice=0.3269 | loss=1.0511
0/3 * Epoch 0 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=10.6167 | _timers/batch_time=0.7702 | _timers/data_time=0.7252 | _timers/model_time=0.0450 | dice=0.3345 | loss=1.3726
1/3 * Epoch (train): 100% 624/624 [13:41<00:00, 1.32s/it, _timers/_fps=10.306, dice=0.254, loss=1.111]
1/3 * Epoch (valid): 100% 70/70 [00:57<00:00, 1.22it/s, _timers/_fps=28.322, dice=0.698, loss=0.555]
[2019-09-16 23:21:48,021]
1/3 * Epoch 1 (train): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=9.3251 | _timers/batch_time=0.8630 | _timers/data_time=0.7981 | _timers/model_time=0.0648 | dice=0.3644 | loss=0.9991
1/3 * Epoch 1 (valid): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=11.0109 | _timers/batch_time=0.7439 | _timers/data_time=0.6994 | _timers/model_time=0.0445 | dice=0.4203 | loss=0.9520
2/3 * Epoch (train): 100% 624/624 [13:38<00:00, 1.31s/it, _timers/_fps=11.721, dice=0.491, loss=0.792]
2/3 * Epoch (valid): 100% 70/70 [00:56<00:00, 1.24it/s, _timers/_fps=24.108, dice=0.360, loss=1.339]
[2019-09-16 23:36:26,681]
2/3 * Epoch 2 (train): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=9.3873 | _timers/batch_time=0.8574 | _timers/data_time=0.7934 | _timers/model_time=0.0639 | dice=0.3870 | loss=0.9682
2/3 * Epoch 2 (valid): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=11.1256 | _timers/batch_time=0.7321 | _timers/data_time=0.6876 | _timers/model_time=0.0445 | dice=0.3216 | loss=1.2055
Top best models:
logs/d169/checkpoints//train.1.pth 0.9520
utils.plot_metrics(
logdir=logdir,
# specify which metrics we want to plot
metrics=["loss", "dice", 'lr', '_base/lr']
)
num_epochs = 12
logdir = "./logs/d169-from2"
# model, criterion, optimizer
optimizer = torch.optim.Adam([
{'params': model.decoder.parameters(), 'lr': 3e-3},
{'params': model.encoder.parameters(), 'lr': 3e-4},
])
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
runner.train(
model=model,
criterion=criterion,
optimizer=optimizer,
scheduler=scheduler,
loaders=loaders,
callbacks=[DiceCallback(),
EarlyStoppingCallback(patience=5, min_delta=0.001),
CheckpointCallback(resume='logs/d169/checkpoints/best_full.pth')],
logdir=logdir,
num_epochs=num_epochs,
fp16=True,
verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
=> loading checkpoint logs/d169/checkpoints/best_full.pth
loaded checkpoint logs/d169/checkpoints/best_full.pth (epoch 3)
0/12 * Epoch (train): 100% 624/624 [12:35<00:00, 1.21s/it, _timers/_fps=11.829, dice=0.516, loss=0.689]
0/12 * Epoch (valid): 100% 70/70 [00:55<00:00, 1.27it/s, _timers/_fps=25.757, dice=0.478, loss=0.923]
[2019-09-17 08:50:31,620]
0/12 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=10.5396 | _timers/batch_time=0.7629 | _timers/data_time=0.6958 | _timers/model_time=0.0670 | dice=0.4238 | loss=0.9157
0/12 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=11.3397 | _timers/batch_time=0.7137 | _timers/data_time=0.6658 | _timers/model_time=0.0478 | dice=0.4478 | loss=1.0345
1/12 * Epoch (train): 100% 624/624 [13:08<00:00, 1.26s/it, _timers/_fps=11.737, dice=0.449, loss=1.017]
1/12 * Epoch (valid): 100% 70/70 [00:54<00:00, 1.29it/s, _timers/_fps=26.658, dice=0.479, loss=0.965]
[2019-09-17 09:04:38,332]
1/12 * Epoch 4 (train): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=9.9579 | _timers/batch_time=0.8081 | _timers/data_time=0.7377 | _timers/model_time=0.0703 | dice=0.4286 | loss=0.9111
1/12 * Epoch 4 (valid): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=11.4759 | _timers/batch_time=0.7061 | _timers/data_time=0.6576 | _timers/model_time=0.0484 | dice=0.4786 | loss=0.8799
2/12 * Epoch (train): 100% 624/624 [13:02<00:00, 1.25s/it, _timers/_fps=11.253, dice=0.507, loss=0.756]
2/12 * Epoch (valid): 100% 70/70 [00:53<00:00, 1.32it/s, _timers/_fps=27.634, dice=0.434, loss=1.006]
[2019-09-17 09:18:36,200]
2/12 * Epoch 5 (train): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=10.0516 | _timers/batch_time=0.7993 | _timers/data_time=0.7294 | _timers/model_time=0.0697 | dice=0.4373 | loss=0.8968
2/12 * Epoch 5 (valid): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=11.8137 | _timers/batch_time=0.6865 | _timers/data_time=0.6393 | _timers/model_time=0.0471 | dice=0.4362 | loss=0.9438
3/12 * Epoch (train): 100% 624/624 [12:53<00:00, 1.24s/it, _timers/_fps=10.858, dice=0.448, loss=1.022]
3/12 * Epoch (valid): 100% 70/70 [00:53<00:00, 1.31it/s, _timers/_fps=27.353, dice=0.454, loss=0.944]
[2019-09-17 09:32:24,800]
3/12 * Epoch 6 (train): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=10.2235 | _timers/batch_time=0.7859 | _timers/data_time=0.7171 | _timers/model_time=0.0686 | dice=0.4349 | loss=0.8991
3/12 * Epoch 6 (valid): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=11.7459 | _timers/batch_time=0.6906 | _timers/data_time=0.6431 | _timers/model_time=0.0474 | dice=0.4321 | loss=1.0077
4/12 * Epoch (train): 100% 624/624 [12:39<00:00, 1.22s/it, _timers/_fps=12.224, dice=0.465, loss=0.922]
4/12 * Epoch (valid): 100% 70/70 [00:51<00:00, 1.37it/s, _timers/_fps=28.903, dice=0.414, loss=1.092]
[2019-09-17 09:45:57,541]
4/12 * Epoch 7 (train): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=10.4994 | _timers/batch_time=0.7657 | _timers/data_time=0.6988 | _timers/model_time=0.0667 | dice=0.4452 | loss=0.8861
4/12 * Epoch 7 (valid): _base/lr=0.0300 | _base/momentum=0.9000 | _timers/_fps=12.2864 | _timers/batch_time=0.6602 | _timers/data_time=0.6149 | _timers/model_time=0.0452 | dice=0.4712 | loss=0.9465
5/12 * Epoch (train): 100% 624/624 [12:24<00:00, 1.19s/it, _timers/_fps=12.619, dice=0.587, loss=0.669]
5/12 * Epoch (valid): 100% 70/70 [00:51<00:00, 1.36it/s, _timers/_fps=31.395, dice=0.544, loss=0.796]
[2019-09-17 09:59:17,231]
5/12 * Epoch 8 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.8050 | _timers/batch_time=0.7437 | _timers/data_time=0.6784 | _timers/model_time=0.0651 | dice=0.4656 | loss=0.8520
5/12 * Epoch 8 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.3032 | _timers/batch_time=0.6616 | _timers/data_time=0.6164 | _timers/model_time=0.0452 | dice=0.5154 | loss=0.7922
6/12 * Epoch (train): 100% 624/624 [12:23<00:00, 1.19s/it, _timers/_fps=13.559, dice=0.485, loss=0.813]
6/12 * Epoch (valid): 100% 70/70 [00:50<00:00, 1.38it/s, _timers/_fps=30.842, dice=0.558, loss=0.767]
[2019-09-17 10:12:34,839]
6/12 * Epoch 9 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.8386 | _timers/batch_time=0.7422 | _timers/data_time=0.6772 | _timers/model_time=0.0649 | dice=0.4727 | loss=0.8418
6/12 * Epoch 9 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.4876 | _timers/batch_time=0.6504 | _timers/data_time=0.6058 | _timers/model_time=0.0445 | dice=0.5156 | loss=0.7906
7/12 * Epoch (train): 100% 624/624 [12:20<00:00, 1.19s/it, _timers/_fps=12.411, dice=0.498, loss=0.786]
7/12 * Epoch (valid): 100% 70/70 [00:50<00:00, 1.38it/s, _timers/_fps=29.141, dice=0.549, loss=0.782]
[2019-09-17 10:25:47,872]
7/12 * Epoch 10 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.8974 | _timers/batch_time=0.7372 | _timers/data_time=0.6723 | _timers/model_time=0.0648 | dice=0.4681 | loss=0.8479
7/12 * Epoch 10 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.4729 | _timers/batch_time=0.6501 | _timers/data_time=0.6053 | _timers/model_time=0.0447 | dice=0.5131 | loss=0.7922
8/12 * Epoch (train): 100% 624/624 [12:24<00:00, 1.19s/it, _timers/_fps=12.891, dice=0.442, loss=0.871]
8/12 * Epoch (valid): 100% 70/70 [00:50<00:00, 1.39it/s, _timers/_fps=28.925, dice=0.522, loss=0.833]
[2019-09-17 10:39:06,024]
8/12 * Epoch 11 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.8187 | _timers/batch_time=0.7433 | _timers/data_time=0.6787 | _timers/model_time=0.0645 | dice=0.4749 | loss=0.8375
8/12 * Epoch 11 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.5482 | _timers/batch_time=0.6461 | _timers/data_time=0.6017 | _timers/model_time=0.0443 | dice=0.5206 | loss=0.7856
9/12 * Epoch (train): 100% 624/624 [12:21<00:00, 1.19s/it, _timers/_fps=12.563, dice=0.393, loss=1.136]
9/12 * Epoch (valid): 100% 70/70 [00:50<00:00, 1.39it/s, _timers/_fps=31.314, dice=0.538, loss=0.796]
[2019-09-17 10:52:19,433]
9/12 * Epoch 12 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.8893 | _timers/batch_time=0.7388 | _timers/data_time=0.6740 | _timers/model_time=0.0647 | dice=0.4764 | loss=0.8373
9/12 * Epoch 12 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.5916 | _timers/batch_time=0.6447 | _timers/data_time=0.6002 | _timers/model_time=0.0444 | dice=0.5149 | loss=0.7859
10/12 * Epoch (train): 33% 206/624 [04:04<08:28, 1.22s/it, _timers/_fps=11.348, dice=0.478, loss=0.817]Gradient overflow. Skipping step, loss scaler 0 reducing loss scale to 262144.0
10/12 * Epoch (train): 100% 624/624 [12:20<00:00, 1.19s/it, _timers/_fps=12.401, dice=0.578, loss=0.635]
10/12 * Epoch (valid): 100% 70/70 [00:50<00:00, 1.38it/s, _timers/_fps=29.285, dice=0.582, loss=0.713]
[2019-09-17 11:05:34,282]
10/12 * Epoch 13 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.9089 | _timers/batch_time=0.7374 | _timers/data_time=0.6728 | _timers/model_time=0.0645 | dice=0.4777 | loss=0.8332
10/12 * Epoch 13 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.4380 | _timers/batch_time=0.6516 | _timers/data_time=0.6067 | _timers/model_time=0.0448 | dice=0.5241 | loss=0.7769
11/12 * Epoch (train): 100% 624/624 [12:21<00:00, 1.19s/it, _timers/_fps=12.894, dice=0.472, loss=0.874]
11/12 * Epoch (valid): 100% 70/70 [00:50<00:00, 1.39it/s, _timers/_fps=29.531, dice=0.541, loss=0.768]
[2019-09-17 11:18:47,854]
11/12 * Epoch 14 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=10.8887 | _timers/batch_time=0.7381 | _timers/data_time=0.6734 | _timers/model_time=0.0646 | dice=0.4791 | loss=0.8311
11/12 * Epoch 14 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=12.5115 | _timers/batch_time=0.6478 | _timers/data_time=0.6037 | _timers/model_time=0.0441 | dice=0.5226 | loss=0.7775
Top best models:
logs/d169-from2/checkpoints//train.13.pth 0.7769
utils.plot_metrics(
logdir=logdir,
# specify which metrics we want to plot
metrics=["loss", "dice", 'lr', '_base/lr']
)
jovian.notify('training complete 10eps')
[jovian] message_sent:True
num_workers = 0
bs = 24
train_dataset = CloudDataset(df=train, datatype='train', img_ids=train_ids, transforms = get_training_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
valid_dataset = CloudDataset(df=train, datatype='valid', img_ids=valid_ids, transforms = get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False, num_workers=num_workers)
loaders = {
"train": train_loader,
"valid": valid_loader
}
/home/prajwal/anaconda3/envs/pytorch/lib/python3.7/site-packages/albumentations/augmentations/transforms.py:1734: UserWarning:
Using lambda is incompatible with multiprocessing. Consider using regular functions or partial().
num_epochs = 6
logdir = "./logs/d169-from14"
# model, criterion, optimizer
optimizer = torch.optim.Adam([
{'params': model.decoder.parameters(), 'lr': 1e-2},
{'params': model.encoder.parameters(), 'lr': 1e-3},
])
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
runner.train(
model=model,
criterion=criterion,
optimizer=optimizer,
scheduler=scheduler,
loaders=loaders,
callbacks=[DiceCallback(),
EarlyStoppingCallback(patience=5, min_delta=0.001),
CheckpointCallback(resume='logs/d169-from2/checkpoints/best_full.pth')],
logdir=logdir,
num_epochs=num_epochs,
fp16=True,
verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
=> loading checkpoint logs/d169-from2/checkpoints/best_full.pth
loaded checkpoint logs/d169-from2/checkpoints/best_full.pth (epoch 13)
0/6 * Epoch (train): 100% 208/208 [12:02<00:00, 3.47s/it, _timers/_fps=10.138, dice=0.561, loss=0.691]
0/6 * Epoch (valid): 100% 24/24 [00:57<00:00, 2.38s/it, _timers/_fps=81.522, dice=0.693, loss=0.450]
[2019-09-17 14:06:08,543]
0/6 * Epoch 13 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=10.0026 | _timers/batch_time=2.4007 | _timers/data_time=2.2114 | _timers/model_time=0.1892 | dice=0.5085 | loss=0.7872
0/6 * Epoch 13 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=13.9827 | _timers/batch_time=2.0953 | _timers/data_time=1.9771 | _timers/model_time=0.1182 | dice=0.5496 | loss=0.7421
1/6 * Epoch (train): 100% 208/208 [12:03<00:00, 3.48s/it, _timers/_fps=10.563, dice=0.521, loss=0.762]
1/6 * Epoch (valid): 100% 24/24 [00:57<00:00, 2.39s/it, _timers/_fps=80.777, dice=0.691, loss=0.459]
[2019-09-17 14:19:12,481]
1/6 * Epoch 14 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=9.9913 | _timers/batch_time=2.4034 | _timers/data_time=2.2145 | _timers/model_time=0.1889 | dice=0.5094 | loss=0.7873
1/6 * Epoch 14 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=13.8712 | _timers/batch_time=2.1114 | _timers/data_time=1.9930 | _timers/model_time=0.1183 | dice=0.5507 | loss=0.7494
2/6 * Epoch (train): 100% 208/208 [12:25<00:00, 3.58s/it, _timers/_fps=9.938, dice=0.509, loss=0.783]
2/6 * Epoch (valid): 100% 24/24 [00:58<00:00, 2.45s/it, _timers/_fps=76.603, dice=0.692, loss=0.460]
[2019-09-17 14:32:38,738]
2/6 * Epoch 15 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=9.6022 | _timers/batch_time=2.5036 | _timers/data_time=2.3140 | _timers/model_time=0.1894 | dice=0.5094 | loss=0.7905
2/6 * Epoch 15 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=13.4679 | _timers/batch_time=2.1606 | _timers/data_time=2.0425 | _timers/model_time=0.1180 | dice=0.5503 | loss=0.7429
3/6 * Epoch (train): 100% 208/208 [12:05<00:00, 3.49s/it, _timers/_fps=10.381, dice=0.489, loss=0.811]
3/6 * Epoch (valid): 100% 24/24 [00:57<00:00, 2.39s/it, _timers/_fps=80.565, dice=0.684, loss=0.467]
[2019-09-17 14:45:43,619]
3/6 * Epoch 16 (train): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=9.9597 | _timers/batch_time=2.4114 | _timers/data_time=2.2223 | _timers/model_time=0.1889 | dice=0.5115 | loss=0.7855
3/6 * Epoch 16 (valid): _base/lr=0.0045 | _base/momentum=0.9000 | _timers/_fps=13.8838 | _timers/batch_time=2.1071 | _timers/data_time=1.9889 | _timers/model_time=0.1182 | dice=0.5511 | loss=0.7422
4/6 * Epoch (train): 100% 208/208 [12:03<00:00, 3.48s/it, _timers/_fps=10.104, dice=0.424, loss=0.970]
4/6 * Epoch (valid): 100% 24/24 [00:57<00:00, 2.38s/it, _timers/_fps=81.633, dice=0.696, loss=0.450]
Early stop at 4 epoch
[2019-09-17 14:58:48,132]
4/6 * Epoch 17 (train): _base/lr=0.0007 | _base/momentum=0.9000 | _timers/_fps=9.9817 | _timers/batch_time=2.4066 | _timers/data_time=2.2175 | _timers/model_time=0.1890 | dice=0.5131 | loss=0.7834
4/6 * Epoch 17 (valid): _base/lr=0.0007 | _base/momentum=0.9000 | _timers/_fps=14.0070 | _timers/batch_time=2.0914 | _timers/data_time=1.9726 | _timers/model_time=0.1187 | dice=0.5532 | loss=0.7414
Top best models:
logs/d169-from14/checkpoints//train.17.pth 0.7414
utils.plot_metrics(
logdir=logdir,
# specify which metrics we want to plot
metrics=["loss", "dice", 'lr', '_base/lr']
)
jovian.notify('training complete 5eps')
[jovian] message_sent:True
num_workers = 0
bs = 24
train_dataset = CloudDataset(df=train, datatype='train', img_ids=train_ids, transforms = get_training_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
valid_dataset = CloudDataset(df=train, datatype='valid', img_ids=valid_ids, transforms = get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False, num_workers=num_workers)
loaders = {
"train": train_loader,
"valid": valid_loader
}
/home/prajwal/anaconda3/envs/pytorch/lib/python3.7/site-packages/albumentations/augmentations/transforms.py:1734: UserWarning:
Using lambda is incompatible with multiprocessing. Consider using regular functions or partial().
num_epochs = 5
logdir = "./logs/d169-from17"
# model, criterion, optimizer
optimizer = torch.optim.Adam([
{'params': model.decoder.parameters(), 'lr': 1e-1},
{'params': model.encoder.parameters(), 'lr': 1e-2},
])
scheduler = ReduceLROnPlateau(optimizer, factor=0.20, patience=2)
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
runner.train(
model=model,
criterion=criterion,
optimizer=optimizer,
scheduler=scheduler,
loaders=loaders,
callbacks=[DiceCallback(),
EarlyStoppingCallback(patience=5, min_delta=0.001),
CheckpointCallback(resume='logs/d169-from14/checkpoints/best_full.pth')],
logdir=logdir,
num_epochs=num_epochs,
fp16=True,
verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
=> loading checkpoint logs/d169-from14/checkpoints/best_full.pth
loaded checkpoint logs/d169-from14/checkpoints/best_full.pth (epoch 17)
0/5 * Epoch (train): 100% 208/208 [11:51<00:00, 3.42s/it, _timers/_fps=10.843, dice=0.490, loss=0.847]
0/5 * Epoch (valid): 100% 24/24 [00:56<00:00, 2.34s/it, _timers/_fps=83.857, dice=0.518, loss=0.730]
[2019-09-17 22:48:17,771]
0/5 * Epoch 17 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=10.2486 | _timers/batch_time=2.3431 | _timers/data_time=2.1535 | _timers/model_time=0.1896 | dice=0.5128 | loss=0.7832
0/5 * Epoch 17 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=14.2855 | _timers/batch_time=2.0553 | _timers/data_time=1.9365 | _timers/model_time=0.1188 | dice=0.5508 | loss=0.7353
1/5 * Epoch (train): 23% 48/208 [02:43<09:01, 3.38s/it, _timers/_fps=10.545, dice=0.596, loss=0.678]
utils.plot_metrics(
logdir=logdir,
# specify which metrics we want to plot
metrics=["loss", "dice", 'lr', '_base/lr']
)
jovian.notify('training complete 5eps')
[jovian] message_sent:True
Let's make predictions on the validation dataset.
First, we need to optimize the thresholds.
encoded_pixels = []
loaders = {"infer": valid_loader}
runner.infer(
model=model,
loaders=loaders,
callbacks=[
CheckpointCallback(
resume=f"{logdir}/checkpoints/best.pth"),
InferCallback()
],
)
valid_masks = []
probabilities = np.zeros((2220, 350, 525))  # one slot per (image, class) pair: len(valid_ids) * 4
for i, (batch, output) in enumerate(tqdm.tqdm(zip(
valid_dataset, runner.callbacks[0].predictions["logits"]))):
image, mask = batch
for m in mask:
if m.shape != (350, 525):
m = cv2.resize(m, dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
valid_masks.append(m)
for j, probability in enumerate(output):
if probability.shape != (350, 525):
probability = cv2.resize(probability, dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
probabilities[i * 4 + j, :, :] = probability
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-15-f2477811b7df> in <module>
7 CheckpointCallback(
8 resume=f"{logdir}/checkpoints/best.pth"),
----> 9 InferCallback()
10 ],
11 )
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/runner/supervised.py in infer(self, model, loaders, callbacks, verbose, state_kwargs, fp16, check)
152 distributed_params=fp16
153 )
--> 154 self.run_experiment(experiment, check=check)
155
156 def predict_loader(
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/core/runner.py in run_experiment(self, experiment, check)
194 except (Exception, KeyboardInterrupt) as ex:
195 self.state.exception = ex
--> 196 self._run_event("exception")
197
198 return self
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/core/runner.py in _run_event(self, event)
94
95 if self.state is not None and hasattr(self.state, f"on_{event}_post"):
---> 96 getattr(self.state, f"on_{event}_post")()
97
98 @abstractmethod
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/core/state.py in on_exception_post(self)
175 def on_exception_post(self):
176 for logger in self.loggers.values():
--> 177 logger.on_exception(self)
178
179
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/callbacks/logging.py in on_exception(self, state)
192
193 if state.need_reraise_exception:
--> 194 raise exception
195
196
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/core/runner.py in run_experiment(self, experiment, check)
191 try:
192 for stage in self.experiment.stages:
--> 193 self._run_stage(stage)
194 except (Exception, KeyboardInterrupt) as ex:
195 self.state.exception = ex
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/core/runner.py in _run_stage(self, stage)
168 self.callbacks = self.experiment.get_callbacks(stage)
169
--> 170 self._run_event("stage_start")
171 for epoch in range(self.state.num_epochs):
172 self.state.stage_epoch = epoch
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/core/runner.py in _run_event(self, event)
91 if self.callbacks is not None:
92 for callback in self.callbacks.values():
---> 93 getattr(callback, f"on_{event}")(self.state)
94
95 if self.state is not None and hasattr(self.state, f"on_{event}_post"):
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/callbacks/checkpoint.py in on_stage_start(self, state)
209
210 if self.resume is not None:
--> 211 self.load_checkpoint(filename=self.resume, state=state)
212
213 def on_epoch_end(self, state: RunnerState):
~/anaconda3/envs/pytorch/lib/python3.7/site-packages/catalyst/dl/callbacks/checkpoint.py in load_checkpoint(filename, state)
126 )
127 else:
--> 128 raise Exception(f"No checkpoint found at {filename}")
129
130 def get_metric(self, last_valid_metrics) -> Dict:
Exception: No checkpoint found at ./logs/d169-from17/checkpoints/best.pth
First of all, my thanks to @samusram for finding a mistake in my validation https://www.kaggle.com/c/understanding_cloud_organization/discussion/107711#622412
Now we find the optimal threshold and minimum component size separately for each class.
class_params = {}
for class_id in range(4):
print(class_id)
attempts = []
for t in range(0, 100, 5):
t /= 100
for ms in [0, 100, 1200, 5000, 10000]:
masks = []
for i in range(class_id, len(probabilities), 4):
probability = probabilities[i]
predict, num_predict = post_process(sigmoid(probability), t, ms)
masks.append(predict)
d = []
for i, j in zip(masks, valid_masks[class_id::4]):
if (i.sum() == 0) & (j.sum() == 0):
d.append(1)
else:
d.append(dice(i, j))
attempts.append((t, ms, np.mean(d)))
attempts_df = pd.DataFrame(attempts, columns=['threshold', 'size', 'dice'])
attempts_df = attempts_df.sort_values('dice', ascending=False)
print(attempts_df.head())
best_threshold = attempts_df['threshold'].values[0]
best_size = attempts_df['size'].values[0]
class_params[class_id] = (best_threshold, best_size)
print(class_params)
sns.lineplot(x='threshold', y='dice', hue='size', data=attempts_df);
plt.title('Threshold and min size vs dice for one of the classes');
Now let's have a look at our masks.
for i, (data, output) in enumerate(zip(
        valid_dataset, runner.callbacks[0].predictions["logits"])):
    image, mask = data
    image_vis = image.transpose(1, 2, 0)
    mask = mask.astype('uint8').transpose(1, 2, 0)
    pr_mask = np.zeros((350, 525, 4))
    for j in range(4):
        # logits are channel-first (4, H, W), so index the class axis first
        probability = cv2.resize(output[j, :, :], dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
        pr_mask[:, :, j], _ = post_process(sigmoid(probability), class_params[j][0], class_params[j][1])
    #pr_mask = (sigmoid(output) > best_threshold).astype('uint8').transpose(1, 2, 0)
visualize_with_raw(image=image_vis, mask=pr_mask, original_image=image_vis, original_mask=mask, raw_image=image_vis, raw_mask=output.transpose(1, 2, 0))
if i >= 2:
break
import gc
torch.cuda.empty_cache()
gc.collect()
test_dataset = CloudDataset(df=sub, datatype='test', img_ids=test_ids, transforms = get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0)
loaders = {"test": test_loader}
encoded_pixels = []
image_id = 0
for i, test_batch in enumerate(tqdm.tqdm(loaders['test'])):
runner_out = runner.predict_batch({"features": test_batch[0].cuda()})['logits']
    for batch in runner_out:  # don't reuse `i` from the outer loop
for probability in batch:
probability = probability.cpu().detach().numpy()
if probability.shape != (350, 525):
probability = cv2.resize(probability, dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
predict, num_predict = post_process(sigmoid(probability), class_params[image_id % 4][0], class_params[image_id % 4][1])
if num_predict == 0:
encoded_pixels.append('')
else:
r = mask2rle(predict)
encoded_pixels.append(r)
image_id += 1
sub['EncodedPixels'] = encoded_pixels
sub.to_csv('submissiond169_e19+5.csv', columns=['Image_Label', 'EncodedPixels'], index=False)
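Before committing, a quick sanity check on the submission file (a sketch, assuming the csv written above):
check = pd.read_csv('submissiond169_e19+5.csv')
assert len(check) == len(sub)  # one row per (image, label) pair
print(check.head())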
jovian.commit(secret=True, artifacts=['submissiond169_e19+5.csv'], nb_filename='Catalyst-pytorch-densenet.ipynb')