Learn data science and machine learning by building real-world projects on Jovian
In [1]:
!nvidia-smi
Thu Nov 14 10:41:37 2019 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | | N/A 35C P0 37W / 300W | 0MiB / 16130MiB | 0% Default | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: GPU Memory | | GPU PID Type Process name Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+

Importing libraries

In [1]:
import os
import cv2
import collections
import time 
import tqdm
from PIL import Image
from functools import partial
train_on_gpu = True

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import torchvision
import torchvision.transforms as transforms
import torch
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR

import albumentations as albu
from albumentations import torch as AT

from catalyst.data import Augmentor
from catalyst.dl import utils
from catalyst.data.reader import ImageReader, ScalarReader, ReaderCompose, LambdaReader
from catalyst.dl.runner import SupervisedRunner
from catalyst.contrib.models.segmentation import Unet
from catalyst.dl.callbacks import DiceCallback, EarlyStoppingCallback, InferCallback, CheckpointCallback, OptimizerCallback, CriterionCallback

import segmentation_models_pytorch as smp

import jovian

Helper functions and classes

In [2]:
def get_img(x, folder: str='train_images'):
    """
    Load the image named `x` from `folder` and return it as an RGB array.

    NOTE(review): relies on the module-level `path` variable as the data root.
    """
    full_path = os.path.join(f"{path}/{folder}", x)
    # OpenCV reads BGR; convert so matplotlib and pretrained nets see RGB.
    bgr = cv2.imread(full_path)
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


def rle_decode(mask_rle: str = '', shape: tuple = (1400, 2100)):
    '''
    Decode a run-length-encoded mask string into a binary numpy array.

    :param mask_rle: run-length string of space-separated (start, length)
        pairs; starts are 1-indexed in column-major (Fortran) pixel order
    :param shape: (height, width) of the array to return
    :return: numpy uint8 array, 1 - mask, 0 - background
    '''
    tokens = np.asarray(mask_rle.split(), dtype=int)
    starts = tokens[0::2] - 1          # convert 1-based starts to 0-based
    lengths = tokens[1::2]
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, length in zip(starts, lengths):
        flat[start:start + length] = 1
    # Column-major reshape matches the competition's pixel ordering.
    return flat.reshape(shape, order='F')


def make_mask(df: pd.DataFrame, image_name: str='img.jpg', shape: tuple = (1400, 2100)):
    """
    Build a (H, W, 4) float32 mask stack for one image from the RLE dataframe.

    :param df: dataframe with 'im_id' and 'EncodedPixels' columns
        (one row per cloud class; EncodedPixels is NaN when the class is absent)
    :param image_name: image file name to look up in df['im_id']
    :param shape: (height, width) of the decoded masks
    :return: numpy float32 array of shape (shape[0], shape[1], 4)
    """
    encoded_masks = df.loc[df['im_id'] == image_name, 'EncodedPixels']
    masks = np.zeros((shape[0], shape[1], 4), dtype=np.float32)

    for idx, label in enumerate(encoded_masks.values):
        if label is not np.nan:
            # BUG FIX: previously called rle_decode(label) without `shape`,
            # so a non-default `shape` argument was silently ignored and the
            # assignment would fail (or mis-place pixels) for other sizes.
            masks[:, :, idx] = rle_decode(label, shape=shape)

    return masks


def to_tensor(x, **kwargs):
    """
    Reorder an image or mask from HWC to CHW and cast to float32, the layout
    PyTorch expects. Extra keyword arguments from albumentations are ignored.
    """
    chw = np.transpose(x, (2, 0, 1))
    return chw.astype('float32')


def mask2rle(img):
    '''
    Encode a binary mask as a run-length string.

    img: numpy array, 1 - mask, 0 - background
    Returns run length as a string of space-separated (start, length) pairs,
    with starts 1-indexed in column-major pixel order.
    '''
    # Pad with zeros on both ends so runs touching the borders register.
    padded = np.concatenate([[0], img.T.flatten(), [0]])
    # 1-based indices where the value flips: alternating starts and ends.
    boundaries = np.where(padded[1:] != padded[:-1])[0] + 1
    boundaries[1::2] -= boundaries[::2]  # turn (start, end) into (start, length)
    return ' '.join(map(str, boundaries))


def visualize(image, mask, original_image=None, original_mask=None):
    """
    Plot an image alongside its four per-class masks.
    If an original image/mask pair is also passed, show two rows
    (original on top, transformed below).
    """
    fontsize = 14
    class_dict = {0: 'Fish', 1: 'Flower', 2: 'Gravel', 3: 'Sugar'}

    def show_row(axes, img, msk, img_title, mask_prefix):
        # Plot one image plus its four class masks along a row of axes.
        axes[0].imshow(img)
        if img_title is not None:
            axes[0].set_title(img_title, fontsize=fontsize)
        for i in range(4):
            axes[i + 1].imshow(msk[:, :, i])
            axes[i + 1].set_title(f'{mask_prefix} {class_dict[i]}', fontsize=fontsize)

    if original_image is None and original_mask is None:
        f, ax = plt.subplots(1, 5, figsize=(24, 24))
        show_row(ax, image, mask, None, 'Mask')
    else:
        f, ax = plt.subplots(2, 5, figsize=(24, 12))
        show_row(ax[0], original_image, original_mask, 'Original image', 'Original mask')
        show_row(ax[1], image, mask, 'Transformed image', 'Transformed mask')
            
            
def visualize_with_raw(image, mask, original_image=None, original_mask=None, raw_image=None, raw_mask=None):
    """
    Plot three rows: original image/masks, raw predicted masks, and the
    post-processed predicted masks.
    """
    fontsize = 14
    class_dict = {0: 'Fish', 1: 'Flower', 2: 'Gravel', 3: 'Sugar'}

    f, ax = plt.subplots(3, 5, figsize=(24, 12))

    # NOTE(review): the middle row's image title was 'Original image' in the
    # original code (likely meant 'Raw image'); kept for behavior parity.
    rows = [
        (original_image, original_mask, 'Original image', 'Original mask'),
        (raw_image, raw_mask, 'Original image', 'Raw predicted mask'),
        (image, mask, 'Transformed image', 'Predicted mask with processing'),
    ]
    for r, (row_image, row_mask, image_title, mask_prefix) in enumerate(rows):
        ax[r, 0].imshow(row_image)
        ax[r, 0].set_title(image_title, fontsize=fontsize)
        for i in range(4):
            ax[r, i + 1].imshow(row_mask[:, :, i])
            ax[r, i + 1].set_title(f'{mask_prefix} {class_dict[i]}', fontsize=fontsize)
            
            
def plot_with_augmentation(image, mask, augment):
    """
    Apply an albumentations augmentation and show before/after via `visualize`.
    """
    result = augment(image=image, mask=mask)
    visualize(
        result['image'],
        result['mask'],
        original_image=image,
        original_mask=mask,
    )
    
    
sigmoid = lambda x: 1 / (1 + np.exp(-x))


def post_process(probability, threshold, min_size):
    """
    Post-process one predicted mask: binarize at `threshold` and keep only
    connected components with more than `min_size` pixels.

    :param probability: 2-D array of per-pixel probabilities
    :param threshold: binarization cutoff
    :param min_size: components with <= min_size pixels are discarded
    :return: (predictions, num) — float32 mask of the same shape as
        `probability`, and the number of components kept
    """
    mask = cv2.threshold(probability, threshold, 1, cv2.THRESH_BINARY)[1]
    num_component, component = cv2.connectedComponents(mask.astype(np.uint8))
    # FIX: output size derived from the input instead of the hard-coded
    # (350, 525), so the function works at any prediction resolution
    # (identical behavior for the 350x525 masks used in this notebook).
    predictions = np.zeros(probability.shape[:2], np.float32)
    num = 0
    for c in range(1, num_component):
        p = (component == c)
        if p.sum() > min_size:
            predictions[p] = 1
            num += 1
    return predictions, num


def get_training_augmentation():
    """
    Training-time albumentations pipeline: resize to 320x640, then
    photometric augmentations (brightness/contrast, CLAHE, sun flare, fog)
    and geometric ones (shift/scale/rotate, optical distortion).
    """
    return albu.Compose([
        albu.Resize(320, 640),
        albu.RandomBrightnessContrast(brightness_limit=(.1,.4), contrast_limit=(-.2,.2), p=0.5),
        albu.CLAHE(clip_limit=8.0, p=.1),
        albu.RandomSunFlare(flare_roi=(0,0,1,1), num_flare_circles_lower=1, 
                        num_flare_circles_upper=3, src_radius=10, src_color=(255,255,255)),
        albu.RandomFog(fog_coef_lower=.1, fog_coef_upper=.2, alpha_coef=.1),
        albu.ShiftScaleRotate(scale_limit=0.5, rotate_limit=30, shift_limit=0.1, border_mode=4),
        albu.OpticalDistortion(p=.4, distort_limit=(-1,1)),
    ])


def get_validation_augmentation():
    """Validation-time pipeline: resize only, to the training resolution.

    (320x640 is already divisible by 32, so no extra padding is needed.)
    """
    resize_only = [albu.Resize(320, 640)]
    return albu.Compose(resize_only)


def get_preprocessing(preprocessing_fn):
    """Construct the preprocessing transform.

    Args:
        preprocessing_fn (callable): data normalization function
            (can be specific to each pretrained neural network)
    Return:
        transform: albumentations.Compose
    """
    normalize_then_tensorize = [
        albu.Lambda(image=preprocessing_fn),
        # Bring both image and mask to CHW float32 for PyTorch.
        albu.Lambda(image=to_tensor, mask=to_tensor),
    ]
    return albu.Compose(normalize_then_tensorize)


def dice(img1, img2):
    """
    Dice coefficient between two binary masks.

    :param img1: array-like; non-zero entries are treated as foreground
    :param img2: array-like of the same shape
    :return: float in [0, 1]; defined as 1.0 when both masks are empty
    """
    # FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # use the builtin `bool` instead.
    img1 = np.asarray(img1).astype(bool)
    img2 = np.asarray(img2).astype(bool)

    denom = img1.sum() + img2.sum()
    if denom == 0:
        # FIX: previously 0/0 produced nan (with a runtime warning) when both
        # masks were empty; treat two empty masks as a perfect match.
        return 1.0

    intersection = np.logical_and(img1, img2)
    return 2. * intersection.sum() / denom

Data overview

Let's have a look at the data first.

In [3]:
# Root directory of the competition data; adjust if running elsewhere.
path = '../data'
os.listdir(path)
Out[3]:
['train.csv', 'test_images', 'train_images', 'sample_submission.csv']

We have folders with train and test images, a file mapping train image IDs to their encoded masks, and a sample submission file.

In [4]:
# Load the training labels (RLE-encoded masks) and the sample submission.
train = pd.read_csv(f'{path}/train.csv')
sub = pd.read_csv(f'{path}/sample_submission.csv')
In [5]:
# 'Image_Label' is formatted '<image>.jpg_<class>'; split it into the class
# label and the image id so rows can be filtered per image / per class.
train['label'] = train['Image_Label'].apply(lambda x: x.split('_')[1])
train['im_id'] = train['Image_Label'].apply(lambda x: x.split('_')[0])


sub['label'] = sub['Image_Label'].apply(lambda x: x.split('_')[1])
sub['im_id'] = sub['Image_Label'].apply(lambda x: x.split('_')[0])

Preparing data for modelling

First, let's create a list of unique image IDs and the count of masks per image. This will allow us to make a stratified split based on this count.

In [6]:
# Count non-empty masks per image, then split train/valid stratified on that
# count so both sets get a similar distribution of masks-per-image.
id_mask_count = train.loc[train['EncodedPixels'].isnull() == False, 'Image_Label'].apply(lambda x: x.split('_')[0]).value_counts().\
reset_index().rename(columns={'index': 'img_id', 'Image_Label': 'count'})
train_ids, valid_ids = train_test_split(id_mask_count['img_id'].values, random_state=42, stratify=id_mask_count['count'], test_size=0.1)
# Unique test image ids, taken from the sample submission.
test_ids = sub['Image_Label'].apply(lambda x: x.split('_')[0]).drop_duplicates().values

Setting up data for training in Catalyst

In [7]:
class CloudDataset(Dataset):
    """
    Dataset of satellite images with 4-channel cloud-class masks.

    Reads images from train_images/ or test_images/ under the module-level
    `path`, builds masks from the dataframe via `make_mask`, then applies
    albumentations transforms and optional model-specific preprocessing.
    """

    def __init__(self, df: pd.DataFrame = None, datatype: str = 'train', img_ids: np.array = None,
                 transforms = albu.Compose([albu.HorizontalFlip(),AT.ToTensor()]),
                preprocessing=None):
        self.df = df
        # Any datatype except the explicit 'test' split reads training images.
        subdir = 'test_images' if datatype == 'test' else 'train_images'
        self.data_folder = f"{path}/{subdir}"
        self.img_ids = img_ids
        self.transforms = transforms
        self.preprocessing = preprocessing

    def __getitem__(self, idx):
        image_name = self.img_ids[idx]
        mask = make_mask(self.df, image_name)
        img = cv2.imread(os.path.join(self.data_folder, image_name))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
        transformed = self.transforms(image=img, mask=mask)
        img, mask = transformed['image'], transformed['mask']
        if self.preprocessing:
            prepped = self.preprocessing(image=img, mask=mask)
            img, mask = prepped['image'], prepped['mask']
        return img, mask

    def __len__(self):
        return len(self.img_ids)

Now we define model and training parameters

In [8]:
# Segmentation model: U-Net with a pretrained EfficientNet-B2 encoder
# (segmentation_models_pytorch), 4 output channels — one per cloud class.
ENCODER = 'efficientnet-b2'
ENCODER_WEIGHTS = 'imagenet'
DEVICE = 'cuda'

# NOTE(review): ACTIVATION is assigned but unused; the sigmoid callable is
# passed directly below. torch.nn.functional.sigmoid is deprecated in newer
# PyTorch (torch.sigmoid is preferred) — confirm against installed version.
ACTIVATION = None
model = smp.Unet(
    encoder_name=ENCODER, 
    encoder_weights=ENCODER_WEIGHTS, 
    classes=4, 
    activation=torch.nn.functional.sigmoid,
)
# Normalization function matching the pretrained encoder's expected input.
preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS)
In [9]:
# DataLoader settings: 8 worker processes, batch size 24.
num_workers = 8
bs = 24
# Train gets the full augmentation pipeline; validation gets resize only.
# Both apply the encoder-specific preprocessing (normalize + CHW float32).
train_dataset = CloudDataset(df=train, datatype='train', img_ids=train_ids, transforms = get_training_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))
valid_dataset = CloudDataset(df=train, datatype='valid', img_ids=valid_ids, transforms = get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn))

train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False, num_workers=num_workers)

# Catalyst expects loaders keyed by stage name.
loaders = {
    "train": train_loader,
    "valid": valid_loader
}
/home/prajwal/anaconda3/envs/pytorch/lib/python3.7/site-packages/albumentations/augmentations/transforms.py:1734: UserWarning: Using lambda is incompatible with multiprocessing. Consider using regular functions or partial().

Model training (third run); the fine-tuning run below resumes from the best checkpoint, which was saved around epoch 2.

In [10]:
num_epochs = 20
logdir = "./logs/eb2-320x480"

# model, criterion, optimizer
# Decoder (trained from scratch) gets a 10x higher LR than the
# ImageNet-pretrained encoder.
optimizer = torch.optim.Adam([
    {'params': model.decoder.parameters(), 'lr': 1e-2}, 
    {'params': model.encoder.parameters(), 'lr': 1e-3},  
])
# Shrink both LRs to 15% after 2 epochs without validation improvement.
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
In [ ]:
# First training run: mixed precision (fp16 via apex), Dice metric logged,
# early stopping after 5 epochs with < 0.001 improvement.
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    callbacks=[DiceCallback(),
               EarlyStoppingCallback(patience=5, min_delta=0.001)],
    logdir=logdir,
    num_epochs=num_epochs,
    fp16=True,
    verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods. Defaults for this optimization level are: enabled : True opt_level : O1 cast_model_type : None patch_torch_functions : True keep_batchnorm_fp32 : None master_weights : None loss_scale : dynamic Processing user overrides (additional kwargs that are not None)... After processing overrides, optimization options are: enabled : True opt_level : O1 cast_model_type : None patch_torch_functions : True keep_batchnorm_fp32 : None master_weights : None loss_scale : dynamic Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'") 0/20 * Epoch (train): 100% 208/208 [02:09<00:00, 1.60it/s, _timers/_fps=300.515, dice=0.431, loss=0.935] 0/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.97it/s, _timers/_fps=582.964, dice=0.391, loss=1.296] [2019-11-13 16:15:02,663] 0/20 * Epoch 0 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=168.4699 | _timers/batch_time=0.2602 | _timers/data_time=0.1800 | _timers/model_time=0.0799 | dice=0.4005 | loss=0.9794 0/20 * Epoch 0 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=231.1988 | _timers/batch_time=0.4745 | _timers/data_time=0.4249 | _timers/model_time=0.0495 | dice=0.4464 | loss=1.1487 1/20 * Epoch (train): 100% 208/208 [02:06<00:00, 1.65it/s, _timers/_fps=299.968, dice=0.520, loss=0.809] 1/20 * Epoch (valid): 100% 24/24 [00:11<00:00, 2.00it/s, _timers/_fps=554.640, dice=0.453, loss=0.877] [2019-11-13 16:17:22,562] 1/20 * Epoch 1 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=157.5959 | _timers/batch_time=0.2012 | _timers/data_time=0.1054 | _timers/model_time=0.0956 | dice=0.4670 | loss=0.8859 1/20 * Epoch 1 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=236.8007 | _timers/batch_time=0.4665 | 
_timers/data_time=0.4180 | _timers/model_time=0.0484 | dice=0.4905 | loss=0.8650 2/20 * Epoch (train): 100% 208/208 [02:05<00:00, 1.65it/s, _timers/_fps=286.833, dice=0.438, loss=0.983] 2/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.98it/s, _timers/_fps=526.986, dice=0.491, loss=0.772] [2019-11-13 16:19:41,588] 2/20 * Epoch 2 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=156.4076 | _timers/batch_time=0.2034 | _timers/data_time=0.1081 | _timers/model_time=0.0951 | dice=0.4836 | loss=0.8621 2/20 * Epoch 2 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=218.5928 | _timers/batch_time=0.4685 | _timers/data_time=0.4134 | _timers/model_time=0.0550 | dice=0.4722 | loss=0.8862 3/20 * Epoch (train): 100% 208/208 [02:06<00:00, 1.64it/s, _timers/_fps=300.965, dice=0.490, loss=0.835] 3/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.97it/s, _timers/_fps=550.403, dice=0.478, loss=0.786] [2019-11-13 16:22:01,461] 3/20 * Epoch 3 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=158.5079 | _timers/batch_time=0.2015 | _timers/data_time=0.1092 | _timers/model_time=0.0922 | dice=0.4954 | loss=0.8435 3/20 * Epoch 3 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=242.4640 | _timers/batch_time=0.4754 | _timers/data_time=0.4258 | _timers/model_time=0.0496 | dice=0.4655 | loss=0.9121 4/20 * Epoch (train): 100% 208/208 [02:07<00:00, 1.63it/s, _timers/_fps=297.380, dice=0.536, loss=0.763] 4/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.95it/s, _timers/_fps=546.415, dice=0.489, loss=0.836] [2019-11-13 16:24:23,246] 4/20 * Epoch 4 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=157.6246 | _timers/batch_time=0.1984 | _timers/data_time=0.1038 | _timers/model_time=0.0945 | dice=0.5072 | loss=0.8249 4/20 * Epoch 4 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=235.1258 | _timers/batch_time=0.4788 | _timers/data_time=0.4266 | _timers/model_time=0.0521 | dice=0.5263 | loss=0.7914 5/20 * Epoch (train): 
100% 208/208 [02:07<00:00, 1.64it/s, _timers/_fps=300.675, dice=0.472, loss=0.867] 5/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.97it/s, _timers/_fps=545.527, dice=0.520, loss=0.759] [2019-11-13 16:26:43,985] 5/20 * Epoch 5 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=157.1426 | _timers/batch_time=0.1980 | _timers/data_time=0.1017 | _timers/model_time=0.0962 | dice=0.5100 | loss=0.8223 5/20 * Epoch 5 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=230.0511 | _timers/batch_time=0.4749 | _timers/data_time=0.4233 | _timers/model_time=0.0514 | dice=0.5291 | loss=0.7846 6/20 * Epoch (train): 100% 208/208 [02:09<00:00, 1.61it/s, _timers/_fps=281.731, dice=0.559, loss=0.754] 6/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.93it/s, _timers/_fps=541.696, dice=0.506, loss=0.777] [2019-11-13 16:29:06,276] 6/20 * Epoch 6 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=149.3209 | _timers/batch_time=0.2220 | _timers/data_time=0.1246 | _timers/model_time=0.0968 | dice=0.5189 | loss=0.8071 6/20 * Epoch 6 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=246.1768 | _timers/batch_time=0.4846 | _timers/data_time=0.4371 | _timers/model_time=0.0475 | dice=0.5362 | loss=0.7880 7/20 * Epoch (train): 100% 208/208 [02:09<00:00, 1.61it/s, _timers/_fps=284.734, dice=0.545, loss=0.746] 7/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.94it/s, _timers/_fps=539.900, dice=0.501, loss=0.760] [2019-11-13 16:31:28,864] 7/20 * Epoch 7 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=152.0071 | _timers/batch_time=0.2112 | _timers/data_time=0.1158 | _timers/model_time=0.0953 | dice=0.5228 | loss=0.7984 7/20 * Epoch 7 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=221.2228 | _timers/batch_time=0.4788 | _timers/data_time=0.4189 | _timers/model_time=0.0598 | dice=0.5030 | loss=0.8316 8/20 * Epoch (train): 100% 208/208 [02:12<00:00, 1.57it/s, _timers/_fps=284.849, dice=0.533, loss=0.770] 8/20 * Epoch 
(valid): 100% 24/24 [00:12<00:00, 1.91it/s, _timers/_fps=539.660, dice=0.484, loss=0.855] [2019-11-13 16:33:54,848] 8/20 * Epoch 8 (train): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=158.5999 | _timers/batch_time=0.2451 | _timers/data_time=0.1553 | _timers/model_time=0.0897 | dice=0.5282 | loss=0.7893 8/20 * Epoch 8 (valid): _base/lr=0.0100 | _base/momentum=0.9000 | _timers/_fps=228.2955 | _timers/batch_time=0.4893 | _timers/data_time=0.4351 | _timers/model_time=0.0540 | dice=0.5138 | loss=0.8328 9/20 * Epoch (train): 100% 208/208 [02:10<00:00, 1.60it/s, _timers/_fps=297.739, dice=0.567, loss=0.714] 9/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.93it/s, _timers/_fps=541.230, dice=0.540, loss=0.728] [2019-11-13 16:36:19,162] 9/20 * Epoch 9 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=150.3547 | _timers/batch_time=0.2129 | _timers/data_time=0.1115 | _timers/model_time=0.1009 | dice=0.5554 | loss=0.7397 9/20 * Epoch 9 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=223.7382 | _timers/batch_time=0.4848 | _timers/data_time=0.4273 | _timers/model_time=0.0575 | dice=0.5472 | loss=0.7501 10/20 * Epoch (train): 100% 208/208 [02:10<00:00, 1.60it/s, _timers/_fps=292.962, dice=0.550, loss=0.726] 10/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.91it/s, _timers/_fps=550.223, dice=0.527, loss=0.752] [2019-11-13 16:38:43,666] 10/20 * Epoch 10 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=153.0683 | _timers/batch_time=0.2103 | _timers/data_time=0.1119 | _timers/model_time=0.0983 | dice=0.5678 | loss=0.7242 10/20 * Epoch 10 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=228.7451 | _timers/batch_time=0.4906 | _timers/data_time=0.4378 | _timers/model_time=0.0527 | dice=0.5490 | loss=0.7500 11/20 * Epoch (train): 100% 208/208 [02:10<00:00, 1.60it/s, _timers/_fps=292.496, dice=0.590, loss=0.698] 11/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.90it/s, _timers/_fps=533.050, dice=0.539, loss=0.752] 
[2019-11-13 16:41:07,965] 11/20 * Epoch 11 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=151.2250 | _timers/batch_time=0.2169 | _timers/data_time=0.1169 | _timers/model_time=0.0997 | dice=0.5710 | loss=0.7180 11/20 * Epoch 11 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=240.2356 | _timers/batch_time=0.4929 | _timers/data_time=0.4425 | _timers/model_time=0.0503 | dice=0.5583 | loss=0.7474
12/20 * Epoch (train): 100% 208/208 [02:11<00:00, 1.59it/s, _timers/_fps=283.001, dice=0.577, loss=0.713] 12/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.94it/s, _timers/_fps=555.059, dice=0.523, loss=0.782] [2019-11-13 16:43:32,897] 12/20 * Epoch 12 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=150.8126 | _timers/batch_time=0.2244 | _timers/data_time=0.1231 | _timers/model_time=0.1008 | dice=0.5789 | loss=0.7072 12/20 * Epoch 12 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=230.6721 | _timers/batch_time=0.4831 | _timers/data_time=0.4304 | _timers/model_time=0.0526 | dice=0.5588 | loss=0.7456 13/20 * Epoch (train): 100% 208/208 [02:10<00:00, 1.59it/s, _timers/_fps=280.248, dice=0.595, loss=0.702] 13/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.93it/s, _timers/_fps=548.105, dice=0.530, loss=0.763] [2019-11-13 16:45:57,020] 13/20 * Epoch 13 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=151.3243 | _timers/batch_time=0.2178 | _timers/data_time=0.1199 | _timers/model_time=0.0979 | dice=0.5841 | loss=0.6995 13/20 * Epoch 13 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=222.2551 | _timers/batch_time=0.4855 | _timers/data_time=0.4347 | _timers/model_time=0.0507 | dice=0.5574 | loss=0.7520 14/20 * Epoch (train): 100% 208/208 [02:11<00:00, 1.59it/s, _timers/_fps=282.941, dice=0.585, loss=0.667] 14/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.90it/s, _timers/_fps=531.536, dice=0.516, loss=0.783] [2019-11-13 16:48:21,747] 14/20 * Epoch 14 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=154.3691 | _timers/batch_time=0.2089 | _timers/data_time=0.1105 | _timers/model_time=0.0983 | dice=0.5876 | loss=0.6919 14/20 * Epoch 14 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=225.4951 | _timers/batch_time=0.4927 | _timers/data_time=0.4423 | _timers/model_time=0.0503 | dice=0.5535 | loss=0.7538 15/20 * Epoch (train): 100% 208/208 [02:11<00:00, 1.58it/s, _timers/_fps=290.965, 
dice=0.608, loss=0.652] 15/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.89it/s, _timers/_fps=496.101, dice=0.511, loss=0.806] [2019-11-13 16:50:46,952] 15/20 * Epoch 15 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=149.0953 | _timers/batch_time=0.2144 | _timers/data_time=0.1167 | _timers/model_time=0.0976 | dice=0.5900 | loss=0.6895 15/20 * Epoch 15 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=242.0496 | _timers/batch_time=0.4969 | _timers/data_time=0.4519 | _timers/model_time=0.0449 | dice=0.5552 | loss=0.7562 16/20 * Epoch (train): 100% 208/208 [02:10<00:00, 1.59it/s, _timers/_fps=283.523, dice=0.608, loss=0.668] 16/20 * Epoch (valid): 100% 24/24 [00:12<00:00, 1.90it/s, _timers/_fps=487.856, dice=0.528, loss=0.776] [2019-11-13 16:53:10,942] 16/20 * Epoch 16 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=153.0887 | _timers/batch_time=0.2107 | _timers/data_time=0.1119 | _timers/model_time=0.0985 | dice=0.6001 | loss=0.6715 16/20 * Epoch 16 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=191.7669 | _timers/batch_time=0.4920 | _timers/data_time=0.4238 | _timers/model_time=0.0682 | dice=0.5582 | loss=0.7535 17/20 * Epoch (train): 95% 198/208 [02:08<00:04, 2.03it/s, _timers/_fps=223.534, dice=0.660, loss=0.565]
In [ ]:
# Plot the training curves recorded in the catalyst log directory.
utils.plot_metrics(
    logdir=logdir, 
    # specify which metrics we want to plot
    metrics=["loss", "dice", 'lr', '_base/lr']
)
In [ ]:
jovian.notify("20 epochs done")

Model training

In [14]:
# Fine-tuning round: fresh optimizer with much lower LRs than the first run
# (decoder 3e-4, encoder 1e-4), same plateau scheduler and loss.
num_epochs = 15

# model, criterion, optimizer
optimizer = torch.optim.Adam([
    {'params': model.decoder.parameters(), 'lr': 3e-4}, 
    {'params': model.encoder.parameters(), 'lr': 1e-4},  
])
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
In [15]:
# Second training run: resumes model/optimizer state from the best
# checkpoint of the previous run via CheckpointCallback(resume=...).
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    callbacks=[DiceCallback(),
               EarlyStoppingCallback(patience=5, min_delta=0.001),
              CheckpointCallback(resume='logs/eb2-320x480/checkpoints/best_full.pth')],
    logdir=logdir,
    num_epochs=num_epochs,
    fp16=True,
    verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods. Defaults for this optimization level are: enabled : True opt_level : O1 cast_model_type : None patch_torch_functions : True keep_batchnorm_fp32 : None master_weights : None loss_scale : dynamic Processing user overrides (additional kwargs that are not None)... After processing overrides, optimization options are: enabled : True opt_level : O1 cast_model_type : None patch_torch_functions : True keep_batchnorm_fp32 : None master_weights : None loss_scale : dynamic Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'") => loading checkpoint logs/eb2-320x480/checkpoints/best_full.pth loaded checkpoint logs/eb2-320x480/checkpoints/best_full.pth (epoch 12) 0/15 * Epoch (train): 100% 208/208 [02:35<00:00, 1.34it/s, _timers/_fps=274.846, dice=0.561, loss=0.733] 0/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.60it/s, _timers/_fps=497.088, dice=0.624, loss=0.623] [2019-11-14 10:45:23,708] 0/15 * Epoch 12 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=149.7818 | _timers/batch_time=0.3811 | _timers/data_time=0.2999 | _timers/model_time=0.0810 | dice=0.5649 | loss=0.7319 0/15 * Epoch 12 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=228.5129 | _timers/batch_time=0.5917 | _timers/data_time=0.5439 | _timers/model_time=0.0476 | dice=0.6201 | loss=0.6258 1/15 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=304.386, dice=0.576, loss=0.709] 1/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.62it/s, _timers/_fps=534.888, dice=0.605, loss=0.657] [2019-11-14 10:48:17,053] 1/15 * Epoch 13 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=138.4406 | _timers/batch_time=0.3427 | _timers/data_time=0.2429 | _timers/model_time=0.0996 | dice=0.5670 | 
loss=0.7255 1/15 * Epoch 13 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=234.1444 | _timers/batch_time=0.5858 | _timers/data_time=0.5299 | _timers/model_time=0.0558 | dice=0.6128 | loss=0.6360 2/15 * Epoch (train): 100% 208/208 [02:36<00:00, 1.33it/s, _timers/_fps=300.153, dice=0.600, loss=0.658] 2/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.62it/s, _timers/_fps=483.083, dice=0.622, loss=0.619] [2019-11-14 10:51:10,333] 2/15 * Epoch 14 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=128.0244 | _timers/batch_time=0.3320 | _timers/data_time=0.2250 | _timers/model_time=0.1067 | dice=0.5709 | loss=0.7184 2/15 * Epoch 14 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=228.2680 | _timers/batch_time=0.5875 | _timers/data_time=0.5331 | _timers/model_time=0.0543 | dice=0.6208 | loss=0.6242 3/15 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=292.036, dice=0.573, loss=0.764] 3/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.60it/s, _timers/_fps=526.666, dice=0.617, loss=0.635] [2019-11-14 10:54:03,823] 3/15 * Epoch 15 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=134.4778 | _timers/batch_time=0.3325 | _timers/data_time=0.2294 | _timers/model_time=0.1029 | dice=0.5775 | loss=0.7081 3/15 * Epoch 15 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=224.6018 | _timers/batch_time=0.5929 | _timers/data_time=0.5393 | _timers/model_time=0.0535 | dice=0.6235 | loss=0.6257 4/15 * Epoch (train): 100% 208/208 [02:36<00:00, 1.33it/s, _timers/_fps=300.307, dice=0.613, loss=0.611] 4/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.62it/s, _timers/_fps=535.953, dice=0.606, loss=0.643] [2019-11-14 10:56:56,388] 4/15 * Epoch 16 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=125.5049 | _timers/batch_time=0.3231 | _timers/data_time=0.2127 | _timers/model_time=0.1100 | dice=0.5801 | loss=0.7044 4/15 * Epoch 16 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | 
_timers/_fps=237.6184 | _timers/batch_time=0.5848 | _timers/data_time=0.5318 | _timers/model_time=0.0529 | dice=0.6221 | loss=0.6296 5/15 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=294.476, dice=0.543, loss=0.784] 5/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.62it/s, _timers/_fps=539.027, dice=0.631, loss=0.608] [2019-11-14 10:59:49,689] 5/15 * Epoch 17 (train): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=133.8733 | _timers/batch_time=0.3285 | _timers/data_time=0.2229 | _timers/model_time=0.1054 | dice=0.5817 | loss=0.7041 5/15 * Epoch 17 (valid): _base/lr=0.0015 | _base/momentum=0.9000 | _timers/_fps=236.0372 | _timers/batch_time=0.5888 | _timers/data_time=0.5364 | _timers/model_time=0.0523 | dice=0.6225 | loss=0.6291 6/15 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=271.316, dice=0.554, loss=0.742] 6/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.60it/s, _timers/_fps=485.159, dice=0.625, loss=0.622] [2019-11-14 11:02:43,892] 6/15 * Epoch 18 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=130.8651 | _timers/batch_time=0.3368 | _timers/data_time=0.2330 | _timers/model_time=0.1036 | dice=0.5857 | loss=0.6918 6/15 * Epoch 18 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=218.0487 | _timers/batch_time=0.5904 | _timers/data_time=0.5265 | _timers/model_time=0.0638 | dice=0.6263 | loss=0.6208 7/15 * Epoch (train): 100% 208/208 [02:38<00:00, 1.32it/s, _timers/_fps=283.434, dice=0.623, loss=0.633] 7/15 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=466.088, dice=0.622, loss=0.631] [2019-11-14 11:05:38,025] 7/15 * Epoch 19 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=134.3739 | _timers/batch_time=0.3344 | _timers/data_time=0.2347 | _timers/model_time=0.0994 | dice=0.5886 | loss=0.6882 7/15 * Epoch 19 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=228.7360 | _timers/batch_time=0.5986 | _timers/data_time=0.5487 | 
_timers/model_time=0.0497 | dice=0.6273 | loss=0.6212 8/15 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=287.971, dice=0.559, loss=0.731] 8/15 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=517.597, dice=0.616, loss=0.645] [2019-11-14 11:08:31,934] 8/15 * Epoch 20 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=134.5922 | _timers/batch_time=0.3223 | _timers/data_time=0.2176 | _timers/model_time=0.1045 | dice=0.5943 | loss=0.6790 8/15 * Epoch 20 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=234.5711 | _timers/batch_time=0.5976 | _timers/data_time=0.5466 | _timers/model_time=0.0510 | dice=0.6287 | loss=0.6228 9/15 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=261.653, dice=0.590, loss=0.725] 9/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.61it/s, _timers/_fps=528.869, dice=0.622, loss=0.646] [2019-11-14 11:11:25,853] 9/15 * Epoch 21 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=128.4523 | _timers/batch_time=0.3242 | _timers/data_time=0.2139 | _timers/model_time=0.1102 | dice=0.5955 | loss=0.6766 9/15 * Epoch 21 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=223.5535 | _timers/batch_time=0.5881 | _timers/data_time=0.5289 | _timers/model_time=0.0591 | dice=0.6307 | loss=0.6199 10/15 * Epoch (train): 100% 208/208 [02:38<00:00, 1.31it/s, _timers/_fps=268.347, dice=0.602, loss=0.697] 10/15 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.60it/s, _timers/_fps=521.064, dice=0.617, loss=0.653] [2019-11-14 11:14:20,474] 10/15 * Epoch 22 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=130.7441 | _timers/batch_time=0.3361 | _timers/data_time=0.2292 | _timers/model_time=0.1067 | dice=0.5947 | loss=0.6797 10/15 * Epoch 22 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=235.6346 | _timers/batch_time=0.5966 | _timers/data_time=0.5440 | _timers/model_time=0.0525 | dice=0.6303 | loss=0.6202 11/15 * Epoch (train): 100% 208/208 
[02:38<00:00, 1.32it/s, _timers/_fps=268.808, dice=0.568, loss=0.721]
11/15 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.61it/s, _timers/_fps=518.856, dice=0.609, loss=0.669] Early stop at 11 epoch [2019-11-14 11:17:14,366] 11/15 * Epoch 23 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=126.3406 | _timers/batch_time=0.3250 | _timers/data_time=0.2165 | _timers/model_time=0.1083 | dice=0.5926 | loss=0.6849 11/15 * Epoch 23 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=216.1596 | _timers/batch_time=0.5910 | _timers/data_time=0.5326 | _timers/model_time=0.0583 | dice=0.6293 | loss=0.6227 Top best models: logs/eb2-320x480/checkpoints//train.21.pth 0.6199
In [16]:
# Plot the training/validation curves recorded by catalyst in `logdir`.
metrics_to_plot = ["loss", "dice", 'lr', '_base/lr']
utils.plot_metrics(logdir=logdir, metrics=metrics_to_plot)
In [17]:
# Push a notification through Jovian that the 15-epoch run has finished.
jovian.notify('15 eps done')
[jovian] message_sent:True

Model training after the 13th epoch

In [11]:
# Number of additional fine-tuning epochs for this run.
num_epochs = 10

# Optimizer: separate parameter groups for decoder and encoder so their
# learning rates could be tuned independently (both 1e-5 here).
param_groups = [
    {'params': model.decoder.parameters(), 'lr': 1e-5},
    {'params': model.encoder.parameters(), 'lr': 1e-5},
]
optimizer = torch.optim.Adam(param_groups)

# Reduce LR by a factor of 0.15 after the monitored metric plateaus
# for 2 epochs.
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)

# Combined binary cross-entropy + Dice loss for segmentation.
criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
runner = SupervisedRunner()
In [12]:
# Fine-tune the model, resuming from the best full checkpoint of the
# previous run (CheckpointCallback restores model/optimizer state — the
# run output below confirms it loads epoch 21).
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    callbacks=[DiceCallback(),
               # Stop if valid loss fails to improve by >= 0.001 for 5 epochs.
               EarlyStoppingCallback(patience=5, min_delta=0.001),
              CheckpointCallback(resume='logs/eb2-320x480/checkpoints/best_full.pth')],
    logdir=logdir,
    num_epochs=num_epochs,
    fp16=True,  # mixed-precision via apex, opt level O1 (see run output)
    verbose=True
)
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods. Defaults for this optimization level are: enabled : True opt_level : O1 cast_model_type : None patch_torch_functions : True keep_batchnorm_fp32 : None master_weights : None loss_scale : dynamic Processing user overrides (additional kwargs that are not None)... After processing overrides, optimization options are: enabled : True opt_level : O1 cast_model_type : None patch_torch_functions : True keep_batchnorm_fp32 : None master_weights : None loss_scale : dynamic Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'") => loading checkpoint logs/eb2-320x480/checkpoints/best_full.pth loaded checkpoint logs/eb2-320x480/checkpoints/best_full.pth (epoch 21) 0/10 * Epoch (train): 100% 208/208 [02:35<00:00, 1.34it/s, _timers/_fps=257.365, dice=0.591, loss=0.674] 0/10 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.62it/s, _timers/_fps=476.601, dice=0.548, loss=0.809] [2019-11-14 11:39:44,045] 0/10 * Epoch 21 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=137.2908 | _timers/batch_time=0.3153 | _timers/data_time=0.2271 | _timers/model_time=0.0880 | dice=0.5845 | loss=0.6983 0/10 * Epoch 21 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=204.0633 | _timers/batch_time=0.5728 | _timers/data_time=0.5185 | _timers/model_time=0.0542 | dice=0.6369 | loss=0.6021 1/10 * Epoch (train): 100% 208/208 [02:34<00:00, 1.34it/s, _timers/_fps=234.981, dice=0.627, loss=0.605] 1/10 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.61it/s, _timers/_fps=452.334, dice=0.537, loss=0.843] [2019-11-14 11:42:34,660] 1/10 * Epoch 22 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=130.9066 | _timers/batch_time=0.3026 | _timers/data_time=0.2112 | _timers/model_time=0.0913 | dice=0.5846 | 
loss=0.6973 1/10 * Epoch 22 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=193.8036 | _timers/batch_time=0.5771 | _timers/data_time=0.5165 | _timers/model_time=0.0605 | dice=0.6360 | loss=0.6024 2/10 * Epoch (train): 100% 208/208 [02:33<00:00, 1.35it/s, _timers/_fps=241.046, dice=0.577, loss=0.689] 2/10 * Epoch (valid): 100% 24/24 [00:14<00:00, 1.61it/s, _timers/_fps=493.503, dice=0.546, loss=0.815] [2019-11-14 11:45:24,719] 2/10 * Epoch 23 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=137.2677 | _timers/batch_time=0.3112 | _timers/data_time=0.2263 | _timers/model_time=0.0847 | dice=0.5884 | loss=0.6890 2/10 * Epoch 23 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=202.1017 | _timers/batch_time=0.5751 | _timers/data_time=0.5203 | _timers/model_time=0.0547 | dice=0.6382 | loss=0.5992 3/10 * Epoch (train): 100% 208/208 [02:34<00:00, 1.35it/s, _timers/_fps=255.078, dice=0.587, loss=0.737] 3/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=445.074, dice=0.546, loss=0.810] [2019-11-14 11:48:15,959] 3/10 * Epoch 24 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=132.4858 | _timers/batch_time=0.3081 | _timers/data_time=0.2181 | _timers/model_time=0.0898 | dice=0.5880 | loss=0.6906 3/10 * Epoch 24 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=189.8330 | _timers/batch_time=0.5826 | _timers/data_time=0.5210 | _timers/model_time=0.0615 | dice=0.6392 | loss=0.5972 4/10 * Epoch (train): 100% 208/208 [02:37<00:00, 1.32it/s, _timers/_fps=246.836, dice=0.566, loss=0.726] 4/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=477.961, dice=0.547, loss=0.807] [2019-11-14 11:51:10,204] 4/10 * Epoch 25 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=128.6625 | _timers/batch_time=0.3163 | _timers/data_time=0.2237 | _timers/model_time=0.0924 | dice=0.5896 | loss=0.6858 4/10 * Epoch 25 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | 
_timers/_fps=198.0681 | _timers/batch_time=0.5819 | _timers/data_time=0.5341 | _timers/model_time=0.0477 | dice=0.6406 | loss=0.5957 5/10 * Epoch (train): 100% 208/208 [02:38<00:00, 1.31it/s, _timers/_fps=248.973, dice=0.582, loss=0.747] 5/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=489.403, dice=0.551, loss=0.797] [2019-11-14 11:54:05,529] 5/10 * Epoch 26 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=133.2627 | _timers/batch_time=0.3216 | _timers/data_time=0.2348 | _timers/model_time=0.0867 | dice=0.5897 | loss=0.6890 5/10 * Epoch 26 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=191.6245 | _timers/batch_time=0.5807 | _timers/data_time=0.5231 | _timers/model_time=0.0575 | dice=0.6387 | loss=0.5953 6/10 * Epoch (train): 100% 208/208 [02:35<00:00, 1.34it/s, _timers/_fps=245.941, dice=0.507, loss=0.818] 6/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.58it/s, _timers/_fps=487.405, dice=0.552, loss=0.803] [2019-11-14 11:56:57,377] 6/10 * Epoch 27 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=132.8474 | _timers/batch_time=0.3058 | _timers/data_time=0.2146 | _timers/model_time=0.0911 | dice=0.5917 | loss=0.6842 6/10 * Epoch 27 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=202.6300 | _timers/batch_time=0.5875 | _timers/data_time=0.5353 | _timers/model_time=0.0521 | dice=0.6398 | loss=0.5955 7/10 * Epoch (train): 100% 208/208 [02:33<00:00, 1.35it/s, _timers/_fps=249.808, dice=0.579, loss=0.730] 7/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.58it/s, _timers/_fps=461.348, dice=0.552, loss=0.800] [2019-11-14 11:59:47,906] 7/10 * Epoch 28 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=134.0294 | _timers/batch_time=0.3011 | _timers/data_time=0.2098 | _timers/model_time=0.0912 | dice=0.5937 | loss=0.6800 7/10 * Epoch 28 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=211.4497 | _timers/batch_time=0.5877 | _timers/data_time=0.5421 | 
_timers/model_time=0.0454 | dice=0.6409 | loss=0.5945 8/10 * Epoch (train): 100% 208/208 [02:41<00:00, 1.29it/s, _timers/_fps=229.911, dice=0.585, loss=0.707] 8/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=448.789, dice=0.537, loss=0.827] [2019-11-14 12:02:45,287] 8/10 * Epoch 29 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=128.6135 | _timers/batch_time=0.3302 | _timers/data_time=0.2365 | _timers/model_time=0.0934 | dice=0.5952 | loss=0.6791 8/10 * Epoch 29 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=192.2337 | _timers/batch_time=0.5815 | _timers/data_time=0.5272 | _timers/model_time=0.0542 | dice=0.6391 | loss=0.5967 9/10 * Epoch (train): 100% 208/208 [02:30<00:00, 1.38it/s, _timers/_fps=245.578, dice=0.606, loss=0.617] 9/10 * Epoch (valid): 100% 24/24 [00:15<00:00, 1.59it/s, _timers/_fps=470.115, dice=0.541, loss=0.815] [2019-11-14 12:05:32,110] 9/10 * Epoch 30 (train): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=132.4318 | _timers/batch_time=0.2460 | _timers/data_time=0.1450 | _timers/model_time=0.1008 | dice=0.5976 | loss=0.6742 9/10 * Epoch 30 (valid): _base/lr=0.0002 | _base/momentum=0.9000 | _timers/_fps=185.9768 | _timers/batch_time=0.5828 | _timers/data_time=0.5175 | _timers/model_time=0.0652 | dice=0.6409 | loss=0.5950 Top best models: logs/eb2-320x480/checkpoints//train.28.pth 0.5945
In [13]:
# Plot the curves for the fine-tuning run from the same logdir.
curve_names = ["loss", "dice", 'lr', '_base/lr']
utils.plot_metrics(logdir=logdir, metrics=curve_names)
In [14]:
# Notify via Jovian that fine-tuning is complete.
jovian.notify('done')
[jovian] message_sent:True

Exploring predictions

Let's make predictions on the validation dataset.

First, we need to optimize the per-class thresholds.

In [15]:
encoded_pixels = []
loaders = {"infer": valid_loader}

# Run inference with the best checkpoint. InferCallback accumulates the
# raw model outputs in runner.callbacks[0].predictions["logits"].
runner.infer(
    model=model,
    loaders=loaders,
    callbacks=[
        CheckpointCallback(
            resume=f"{logdir}/checkpoints/best.pth"),
        InferCallback()
    ],
)

valid_masks = []
# One map per (image, class) pair at the submission resolution 350x525.
# Size is derived from the dataset (4 classes per image) instead of the
# previous hard-coded 2220 (= 4 * 555), so this works for any split size.
probabilities = np.zeros((4 * len(valid_dataset), 350, 525))
for i, (batch, output) in enumerate(tqdm.tqdm(
        zip(valid_dataset, runner.callbacks[0].predictions["logits"]),
        total=len(valid_dataset))):  # zip hides the length; give tqdm a total
    image, mask = batch
    # Collect ground-truth masks, resized to 350x525 when necessary.
    # NOTE: cv2.resize takes dsize as (width, height), hence (525, 350).
    for m in mask:
        if m.shape != (350, 525):
            m = cv2.resize(m, dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
        valid_masks.append(m)

    # Store each class's predicted map (raw logits; sigmoid is applied
    # later during threshold search) at the same resolution.
    for j, probability in enumerate(output):
        if probability.shape != (350, 525):
            probability = cv2.resize(probability, dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
        probabilities[i * 4 + j, :, :] = probability
=> loading checkpoint ./logs/eb2-320x480/checkpoints/best.pth loaded checkpoint ./logs/eb2-320x480/checkpoints/best.pth (epoch 28)
1it [00:00, 8.03it/s]
Top best models:
555it [01:02, 8.92it/s]

Find optimal values

First of all, my thanks to @samusram for finding a mistake in my validation https://www.kaggle.com/c/understanding_cloud_organization/discussion/107711#622412

And now I find optimal values separately for each class.

In [16]:
# Grid-search the best (threshold, min component size) per class, scored
# by mean Dice against the validation masks.
class_params = {}
for class_id in range(4):
    print(class_id)

    # Hoist loop-invariant work: the sigmoid of each probability map and
    # the matching ground-truth slice are identical for every
    # (threshold, size) attempt, so compute them once per class instead
    # of once per attempt (the grid has 20 * 5 = 100 attempts).
    class_probs = [sigmoid(probabilities[i])
                   for i in range(class_id, len(probabilities), 4)]
    class_gt = valid_masks[class_id::4]

    attempts = []
    for t in range(0, 100, 5):
        t /= 100  # integer percent -> fraction in [0, 0.95]
        for ms in [0, 100, 1200, 5000, 10000]:
            masks = []
            d = []
            for prob, gt in zip(class_probs, class_gt):
                predict, num_predict = post_process(prob, t, ms)
                masks.append(predict)
                # Both prediction and ground truth empty -> perfect score
                # by convention (plain Dice would be 0/0).
                if (predict.sum() == 0) & (gt.sum() == 0):
                    d.append(1)
                else:
                    d.append(dice(predict, gt))

            attempts.append((t, ms, np.mean(d)))

    attempts_df = pd.DataFrame(attempts, columns=['threshold', 'size', 'dice'])

    # Best attempt first.
    attempts_df = attempts_df.sort_values('dice', ascending=False)
    print(attempts_df.head())
    best_threshold = attempts_df['threshold'].values[0]
    best_size = attempts_df['size'].values[0]

    class_params[class_id] = (best_threshold, best_size)
0 threshold size dice 64 0.60 10000 0.721111 59 0.55 10000 0.720943 54 0.50 10000 0.716097 69 0.65 10000 0.715335 44 0.40 10000 0.713688 1 threshold size dice 64 0.60 10000 0.796572 68 0.65 5000 0.796486 79 0.75 10000 0.795864 84 0.80 10000 0.795810 73 0.70 5000 0.795756 2 threshold size dice 64 0.60 10000 0.690839 74 0.70 10000 0.688239 69 0.65 10000 0.686557 59 0.55 10000 0.684478 79 0.75 10000 0.684445 3 threshold size dice 59 0.55 10000 0.640206 54 0.50 10000 0.640026 64 0.60 10000 0.639688 49 0.45 10000 0.638334 69 0.65 10000 0.637871
In [17]:
# Chosen (threshold, min_size) per class from the grid search.
print(class_params)
{0: (0.6, 10000), 1: (0.6, 10000), 2: (0.6, 10000), 3: (0.55, 10000)}
In [18]:
# NOTE: attempts_df still holds the grid for the LAST class searched (3),
# so this illustrates the threshold/size trade-off for one class only.
sns.lineplot(x='threshold', y='dice', hue='size', data=attempts_df);
plt.title('Threshold and min size vs dice for one of the classes');
Notebook Image

Now let's have a look at our masks.

In [19]:
# Visualize the first few validation images: ground truth vs post-processed
# prediction vs raw logits.
for i, (data, output) in enumerate(zip(
        valid_dataset, runner.callbacks[0].predictions["logits"])):
    image, mask = data  # renamed from `input`, which shadows the builtin

    image_vis = image.transpose(1, 2, 0)
    mask = mask.astype('uint8').transpose(1, 2, 0)
    pr_mask = np.zeros((350, 525, 4))
    for j in range(4):
        # BUG FIX: `output` is channel-first — the probabilities loop above
        # iterates `enumerate(output)` over classes, and `raw_mask` below is
        # transposed with (1, 2, 0). The original `output[:, :, j]` sliced
        # the wrong axis, producing a garbled (4, H) array for resizing.
        probability = cv2.resize(output[j, :, :], dsize=(525, 350), interpolation=cv2.INTER_LINEAR)
        pr_mask[:, :, j], _ = post_process(sigmoid(probability), class_params[j][0], class_params[j][1])

    visualize_with_raw(image=image_vis, mask=pr_mask, original_image=image_vis,
                       original_mask=mask, raw_image=image_vis,
                       raw_mask=output.transpose(1, 2, 0))

    # Only show the first three samples.
    if i >= 2:
        break
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Notebook Image
Notebook Image