In [ ]:
!pip install jovian --upgrade -q
In [ ]:
import os
import gc
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import jovian

import torch
import torch.nn as nn
import torch.nn.functional as F

from fastai import *
from fastai.vision import *
from fastai.metrics import accuracy, error_rate
from fastai.callbacks import *

from functools import partial
from PIL import Image
from tqdm.notebook import tqdm
from pathlib import Path
In [ ]:
# set up the jovian API key (read from the attached Kaggle dataset)
jvn = !cat ../input/sgr-jovian/jovian.txt
jovian.utils.credentials.write_api_key(jvn[0])
In [ ]:
# jovian.commit(notebook_id="6b427266339c470ba5d4d40b64504e56")
# jovian.commit(nb_filename="protein-location")

jovian.commit(nb_filename="__notebook__.ipynb")
In [ ]:
PATH = '../input/human-protein-atlas-image-classification/'
TRAIN = '../input/human-protein-atlas-image-classification/train/'
TEST = '../input/human-protein-atlas-image-classification/test/'
LABELS = '../input/human-protein-atlas-image-classification/train.csv'

path_working = Path('/kaggle/working/')
In [ ]:
channels = ['_yellow', '_red', '_green', '_blue']
In [ ]:
index_class_dict = {
0:  'Nucleoplasm',
1:  'Nuclear membrane',
2:  'Nucleoli',   
3:  'Nucleoli fibrillar center',
4:  'Nuclear speckles',
5:  'Nuclear bodies',
6:  'Endoplasmic reticulum',   
7:  'Golgi apparatus',
8:  'Peroxisomes',
9:  'Endosomes',
10:  'Lysosomes',
11:  'Intermediate filaments',
12:  'Actin filaments',
13:  'Focal adhesion sites',   
14:  'Microtubules',
15:  'Microtubule ends',  
16:  'Cytokinetic bridge',   
17:  'Mitotic spindle',
18:  'Microtubule organizing center',  
19:  'Centrosome',
20:  'Lipid droplets',
21:  'Plasma membrane',   
22:  'Cell junctions', 
23:  'Mitochondria',
24:  'Aggresome',
25:  'Cytosol',
26:  'Cytoplasmic bodies',   
27:  'Rods & rings' }
In [ ]:
# read the training data
train_df = pd.read_csv(LABELS)
train_df.head()
In [ ]:
# add a parsed label vector plus one 0/1 indicator column per class
train_df['target_vec'] = train_df['Target'].map(lambda x: list(map(int, x.strip().split())))
for i in range(28):
    train_df[index_class_dict[i]] = train_df['Target'].map(
        lambda x: 1 if str(i) in x.strip().split() else 0)
train_df.head()
In [ ]:
# from kernel: https://www.kaggle.com/kwentar/visualization-examples-of-each-class-in-rgb

def make_rgb_image_from_four_channels(channels: list, image_width=512, image_height=512) -> np.ndarray:
    """
    Combine the four source channels into a single RGB image:
    the yellow channel is split between red and green, while the
    red, green and blue channels map directly to their own colours.
    """
    rgb_image = np.zeros(shape=(image_height, image_width, 3), dtype=float)
    yellow = np.array(Image.open(channels[0]))
    # yellow is red + green
    rgb_image[:, :, 0] += yellow / 2
    rgb_image[:, :, 1] += yellow / 2
    # loop over the R, G and B channels
    for index, channel in enumerate(channels[1:]):
        current_image = Image.open(channel)
        rgb_image[:, :, index] += current_image
    # normalize to the 0-255 range
    rgb_image = rgb_image / rgb_image.max() * 255
    return rgb_image.astype(np.uint8)
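
As a quick sanity check (a minimal sketch, not part of the original pipeline), the composite for a single training image can be built and displayed directly:

In [ ]:
# build and show the RGB composite for the first training image
sample_id = train_df.Id.iloc[0]
sample_channels = [os.path.join(TRAIN, sample_id) + c + '.png' for c in channels]
plt.imshow(make_rgb_image_from_four_channels(sample_channels))
plt.axis('off')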
In [ ]:
def visualize_part(start_class_index=0, nrows=4, ncols=3):
    """
    Visualize a subset of the classes starting at start_class_index:
    one row per class (nrows classes), with ncols random examples of each.
    """
    fig, ax = plt.subplots(nrows = nrows, ncols=ncols, figsize=(15, 25))
    for class_index in range(nrows):
        current_index = class_index + start_class_index
        for sample in range(ncols):
            current_part = train_df[train_df[index_class_dict[current_index]] == 1] 
            # 0 index is id
            random_index = np.random.choice(current_part.values.shape[0], 1, replace=False)
            # random line from data with selected class
            current_line = current_part.values[random_index][0]
            image_names = [os.path.join(TRAIN, current_line[0]) 
                           + x + '.png' for x in channels]
            rgb_image = make_rgb_image_from_four_channels(image_names)
            # text annotations, main title and subclasses (may be empty in case one label)
            main_class = index_class_dict[current_index]+'\n'
            # 2 index is vector with classes, split version of Target col
            other_classes = [index_class_dict[x] for x in current_line[2] 
                             if x != (current_index)]
            subtitle = ', '.join(other_classes)
            # show image
            ax[class_index, sample].set_title(main_class, fontsize=18)
            ax[class_index, sample].text(250, -10, subtitle, 
                                         fontsize=14, horizontalalignment='center')
            ax[class_index, sample].imshow(rgb_image)
            ax[class_index, sample].set_xticklabels([])
            ax[class_index, sample].set_yticklabels([])
            ax[class_index, sample].tick_params(left=False, bottom=False)
In [ ]:
visualize_part(0)
In [ ]:
# remove the specified folder and contents if it exists
def remove_image_folder( path ):
    if path.exists(): 
        shutil.rmtree(path)   
In [ ]:
# convert the specified image to RGB, resize it to the given dimensions and save it
def convert_and_resize_image( image_name, source_path, target_path, size=256 ):
    image_names = [os.path.join(source_path, image_name) + x + '.png' for x in channels]
    
    # create the 512x512 RGB image
    rgb_image = make_rgb_image_from_four_channels(image_names)    
    im = Image.fromarray(rgb_image)    
    
    # resize to the defined size
    im = im.resize((size, size)) 
    
    # save the resized RGB image
    new_image = target_path/(image_name + '.png')
    im.save(new_image)     
In [ ]:
def create_resized_images( a_source_path, a_target_path, a_df, a_size ):
    """Create resized RGB images for every Id in a_df, unless the target folder already exists."""
    if not a_target_path.exists(): 
        a_target_path.mkdir(parents=True, exist_ok=True) 
        print(f"created folder {a_target_path}")    
        
        # resize all the images referenced in the dataframe
        for idx in tqdm(range(a_df.shape[0])):        
            image_name = a_df.iloc[idx].Id
            convert_and_resize_image( image_name, a_source_path, a_target_path, size=a_size )         
            
        gc.collect()
    else:
        print(f"folder {a_target_path} already exists")   

Resize images and convert to RGB

In [ ]:
size = 256
In [ ]:
# write to commit log
os.system('echo '+ 'Creating resized training images')

# creating 256 RGB training images
train_rgb_256 = path_working/'train-rgb-256'
create_resized_images( TRAIN, train_rgb_256, train_df, size )
In [ ]:
# read the submission file to get the names of the test images
test_df = pd.read_csv(PATH + 'sample_submission.csv')
test_df.head()
In [ ]:
# write to commit log
os.system('echo '+ 'Creating resized test images')

# creating 256 RGB test images
test_rgb_256 = path_working/'test-rgb-256'
create_resized_images( TEST, test_rgb_256, test_df, size )
In [ ]:
# create an image list from the resized image data
test = ImageList.from_folder(test_rgb_256)
len(test)
In [ ]:
# create the databunch from the resized data
batch_size = 32

data = ( ImageList.from_df(train_df,path_working,folder='train-rgb-256',suffix='.png')
       .split_by_rand_pct(0.2)
       .label_from_df(cols='Target',label_delim=' ')
       .add_test(test)
       .databunch(bs=batch_size)
       .normalize(imagenet_stats))
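
As a quick check of the multi-label setup (not part of the original flow), the databunch should report 28 classes parsed from the space-delimited Target column:

In [ ]:
# number of classes and the first few label values parsed by the data block
data.c, data.classes[:5]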
In [ ]:
data.show_batch( rows=3, figsize=(12,9) )

Train the model

In [ ]:
# write to commit log
os.system('echo '+ 'Creating and training model')

arch = models.resnet50

# multi-label problem, so use thresholded accuracy and F-beta as the metrics
acc_02 = partial(accuracy_thresh, thresh=0.2)
f_score = partial(fbeta, thresh=0.2)

learn = cnn_learner(data, arch, metrics=[acc_02, f_score])
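
To see what the 0.2 threshold does (a toy illustration with made-up numbers, not part of the training run), the thresholded metrics binarise the predicted probabilities before comparing them with the targets:

In [ ]:
# toy example of accuracy_thresh: probabilities above 0.2 count as predicted labels
toy_preds = torch.tensor([[0.90, 0.10, 0.30],
                          [0.25, 0.60, 0.15]])   # pretend these are already sigmoid outputs
toy_targets = torch.tensor([[1., 0., 1.],
                            [0., 1., 0.]])
accuracy_thresh(toy_preds, toy_targets, thresh=0.2, sigmoid=False)  # 5 of 6 label decisions correct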
In [ ]:
lr_find(learn)
learn.recorder.plot(suggestion=True)
In [ ]:
lr = 1e-2

# hyperparams = {
#     'arch_name': 'resnet50',
#     'lr': lr,
#     'image_size': size,
#     'batch_size': batch_size,
#     'threshold': 0.2
# }
# jovian.log_hyperparams(hyperparams)

# learn.fit_one_cycle(5, slice(lr))
In [ ]:
# metrics = {
#     'epoch': 5,
#     'train_loss': 0.111074,
#     'val_loss': 0.114668,
#     'acc': 0.948101,
#     'fbeta': 0.630454
# }
# jovian.log_metrics(metrics)
In [ ]:
from torch import Tensor
from fastai.basic_train import Learner
from fastai.callback import Callback

from jovian import log_hyperparams, log_metrics
from jovian.utils.logger import log
try:
    # reset() clears previously logged records when reset_tracking=True;
    # its import location may vary between jovian versions (an assumption)
    from jovian.utils.records import reset
except ImportError:
    from jovian import reset

class JovianFastaiCallback(Callback):
    """Fastai callback to automatically log hyperparameters and metrics.
    Args:
        learn (Learner): A learner object reference of your current model.
        arch_name (string): A name for the model you're training.
    Example
        .. code-block::
            jvn_cb = JovianFastaiCallback(learn, 'res18')
            learn.fit_one_cycle(5, callbacks=jvn_cb)
    .. admonition:: Tutorial
        Visit `this`_ for a detailed example of using the fastai callback, and see the *Records* tab
        of that notebook for all the logs captured by the callback.
    .. _this: https://jovian.ml/PrajwalPrashanth/7f16274fc3224d829941bc2553ef6061?utm_source=docs
    """

    def __init__(self, learn: Learner, arch_name=None, reset_tracking=True):
        self.learn = learn
        self.arch_name = arch_name
        self.met_names = ['epoch', 'train_loss']
        # existence of validation dataset
#         self.valid_set = self.learn.data.valid_dl.items.any()
        self.valid_set = (self.learn.data.valid_dl.items.size > 0)
        self.reset_tracking = reset_tracking
        if self.valid_set:
            self.met_names.append('valid_loss')

    def on_train_begin(self, n_epochs: int, metrics_names: list, **ka):
        if self.reset_tracking:
            reset('hyperparams')
            reset('metrics')
        hyp_dict = {
            'epochs': n_epochs,
            'batch_size': self.learn.data.batch_size,
            'loss_func': str(self.learn.loss_func.func),
            'opt_func': str(self.learn.opt_func.func).split("'")[1],
            'weight_decay': self.learn.wd,
            'learning_rate': str(self.learn.opt.lr)
        }
        if self.arch_name:
            hyp_dict['arch_name'] = self.arch_name
        log_hyperparams(hyp_dict)

        if self.valid_set:
            self.met_names.extend(metrics_names)

    def on_epoch_end(self, epoch: int, smooth_loss: Tensor, last_metrics: list, **ka):
        met_values = [epoch,
                      smooth_loss.item()]  # smoothened avg. train loss for the epoch

        if self.valid_set:
            # last_metrics is a list with first elem as valid_loss followed by all
            # the metrics of the learner
            met_values.extend([str(last_metrics[0])] + [i.item()
                                                        for i in last_metrics[1:]])
        log_metrics(dict(zip(self.met_names, met_values)))

    def on_train_end(self, **ka):
        if not self.valid_set:
            log('Metrics apart from train_loss are not calculated in fastai without a validation dataset')
In [ ]:
jvn_cb = JovianFastaiCallback(learn, 'resnet50-commit2', reset_tracking=False)
In [ ]:
learn.fit_one_cycle(5, callbacks = jvn_cb)
In [ ]:
learn.unfreeze()  
learn.fit_one_cycle(5, slice(1e-5, lr/5), callbacks = jvn_cb)  

Create the predictions

In [ ]:
# write to commit log
os.system('echo '+ 'Generating predictions')

preds,_ = learn.get_preds(DatasetType.Test)
In [ ]:
# keep every class whose predicted probability exceeds the threshold
thresh = 0.2
labelled_preds = [' '.join([learn.data.classes[i] for i,p in enumerate(pred) if p > thresh]) for pred in preds]
In [ ]:
labelled_preds[:5]
In [ ]:
# test image ids (file names with the .png extension stripped)
fnames = [f.name[:-4] for f in learn.data.test_ds.items]
In [ ]:
# sample_df = pd.read_csv(SAMPLE)
# order the predictions to match the sample submission
sample_list = list(test_df.Id)
pred_dic = dict(zip(fnames, labelled_preds))
pred_list_cor = [pred_dic[image_id] for image_id in sample_list]
df = pd.DataFrame({'Id':sample_list, 'Predicted':pred_list_cor})
df.to_csv('protein_classification.csv', header=True, index=False)
In [ ]:
df.head()
In [ ]:
# write to commit log
os.system('echo '+ 'Performing cleanup')

# remove the generated images (otherwise can have problems committing)
remove_image_folder( train_rgb_256 )
remove_image_folder( test_rgb_256 )
In [ ]:
# jovian.commit(notebook_id="6b427266339c470ba5d4d40b64504e56")
jovian.commit(nb_filename="__notebook__.ipynb")
In [ ]: