In [1]:
%store -r __imp
In [2]:
__imp
/home/ubuntu/anaconda3/envs/fastai/lib/python3.6/site-packages/matplotlib/__init__.py:1057: UserWarning: Duplicate key in file "/home/ubuntu/.config/matplotlib/matplotlibrc", line #2 (fname, cnt))
/home/ubuntu/anaconda3/envs/fastai/lib/python3.6/site-packages/matplotlib/__init__.py:1057: UserWarning: Duplicate key in file "/home/ubuntu/.config/matplotlib/matplotlibrc", line #3 (fname, cnt))
In [3]:
PATH = "data/myntra/myntra_extra/"
PATH_CROPPED = "data/myntra/cropped/"
train_path = "train_combined_merged"
test_path = "test"
In [4]:
cuda.set_device(4)
In [5]:
cuda.current_device()
Out[5]:
4
In [6]:
from utils.myntra import *
from utils.ensemble import *

Copy test images into the train folder (file names prefixed with "test_") so they can be trained on with pseudo labels

In [20]:
res = copy_with_prefix(PATH, train_path, test_path, dry_run=False)

res[0:4]
Out[20]:
['data/myntra/myntra_extra/train_combined_merged/test_12470.jpg',
 'data/myntra/myntra_extra/train_combined_merged/test_12387.jpg',
 'data/myntra/myntra_extra/train_combined_merged/test_2227.jpg',
 'data/myntra/myntra_extra/train_combined_merged/test_3479.jpg']
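
copy_with_prefix lives in utils.myntra and is not shown here. Judging by the returned paths, it copies every image from the test folder into the train folder, prefixing each file name with the source folder name. A minimal sketch, with all implementation details assumed:

# Hypothetical sketch of copy_with_prefix (the real helper is in utils.myntra).
import os, shutil

def copy_with_prefix(path, train_dir, src_dir, dry_run=True):
    copied = []
    for fname in os.listdir(f"{path}{src_dir}"):
        dst = f"{path}{train_dir}/{src_dir}_{fname}"  # e.g. .../train_combined_merged/test_12470.jpg
        if not dry_run:
            shutil.copy(f"{path}{src_dir}/{fname}", dst)
        copied.append(dst)
    return copied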

Read the pseudo labels for the test set

In [11]:
test_pl_60pct = pd.read_csv("data/myntra/en_dn_rn_rn_lc.csv")

test_pl_60pct.head()
Out[11]:
In [12]:
test_pl_60pct.shape
Out[12]:
(10995, 2)
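
The file name en_dn_rn_rn_lc.csv suggests these pseudo labels came from an earlier model ensemble, and the 10,995 rows (out of 14,723 test images) suggest only a confident subset was kept. A rough sketch of how such a CSV might be produced; the variable names, column name, and confidence threshold are all assumptions:

# Hypothetical sketch: turn averaged ensemble test probabilities into a pseudo-label CSV.
# ensemble_probs (n_test, n_classes), test_fnames, and classes are assumed to exist.
preds = np.argmax(ensemble_probs, axis=1)
keep = ensemble_probs.max(axis=1) > 0.6  # confidence threshold assumed
pl_df = pd.DataFrame({
    "fname": [f"test_{f.split('/')[-1].split('.')[0]}"  # must match the copied file names
              for f in np.array(test_fnames)[keep]],
    "Sub_category": [classes[i] for i in preds[keep]],
})
pl_df.to_csv("data/myntra/en_dn_rn_rn_lc.csv", index=False)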

Read the previous labels to combine with the pseudo labels

In [13]:
prev_labels_csv = f"{PATH}labels_solid_60_pct_oversampled.csv"
In [14]:
prev_labels = pd.read_csv(prev_labels_csv)

prev_labels.head()
Out[14]:
In [15]:
prev_labels.shape
Out[15]:
(169189, 2)
In [18]:
new_labels = prev_labels.append(test_pl_60pct)
new_labels.shape
Out[18]:
(180184, 2)
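
DataFrame.append works on the pandas version used here, but it was removed in pandas 2.0; on newer pandas the equivalent is:

new_labels = pd.concat([prev_labels, test_pl_60pct], ignore_index=True)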
In [21]:
new_labels["Sub_category"].value_counts()
Out[21]:
Solid                     17669
Typography                15363
Striped                   11626
Colourblocked              9963
Abstract                   8599
People_and_Places          7944
Geometric                  7889
Conversational             7601
Floral                     7558
Humour_and_Comic           7039
Superhero                  6704
Biker                      6638
Music                      6619
Graphic                    6412
Self_Design                6219
Sports                     6174
Varsity                    6134
Camouflage                 5839
Tie_and_Dye                5829
Sports_and_Team_Jersey     5736
Polka_Dots                 5677
Checked                    5591
Tribal                     5361
Name: Sub_category, dtype: int64
In [22]:
new_labels.to_csv(f"{PATH}labels_solid_60_pct_oversampled_pseudo_labels_1.csv", index=False)

Use the newly created labels file for training

In [23]:
base_labels = f"{PATH}labels_solid_60_pct.csv"
orig_fnames = get_fnames_orig(base_labels)

labels_csv = f"{PATH}labels_solid_60_pct_oversampled_pseudo_labels_1.csv"
val_idxs = get_val_idxs(labels_csv, orig_fnames)
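
get_fnames_orig and get_val_idxs come from utils.myntra. The intent is presumably to draw validation indices only from rows present in the original labels file, so that oversampled duplicates and pseudo-labeled test images never end up in the validation set. A minimal sketch under that assumption (helper bodies hypothetical):

# Hypothetical sketches of the validation-split helpers in utils.myntra.
def get_fnames_orig(base_labels_csv):
    return set(pd.read_csv(base_labels_csv).iloc[:, 0])

def get_val_idxs(labels_csv, orig_fnames, val_pct=0.1, seed=42):
    df = pd.read_csv(labels_csv)
    eligible = np.where(df.iloc[:, 0].isin(orig_fnames))[0]  # original images only
    np.random.seed(seed)
    return np.random.permutation(eligible)[:int(val_pct * len(eligible))]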
In [54]:
# n = len(list(open(base_labels)))
# val_idxs_orig = get_cv_idxs(n, val_pct=0.1)
# len(val_idxs_orig)

Generate loss function weights based on the base label distribution

In [25]:
df = pd.read_csv(base_labels)
In [26]:
df["Sub_category"].value_counts(normalize=True).keys()
Out[26]:
Index(['Typography', 'Solid', 'Striped', 'Graphic', 'Colourblocked',
       'Abstract', 'Geometric', 'People_and_Places', 'Floral',
       'Humour_and_Comic', 'Conversational', 'Superhero', 'Biker', 'Sports',
       'Varsity', 'Sports_and_Team_Jersey', 'Music', 'Self_Design',
       'Tie_and_Dye', 'Camouflage', 'Checked', 'Tribal', 'Polka_Dots'],
      dtype='object')
In [27]:
class_keys = df["Sub_category"].value_counts(normalize=True).keys().tolist()
class_pct = df["Sub_category"].value_counts(normalize=True).tolist()
class_freq = df["Sub_category"].value_counts(normalize=False).tolist()
In [28]:
class_freq_log = np.log(np.array(class_freq))
class_freq_sqrt = np.sqrt(np.array(class_freq))
In [29]:
class_freq_log
Out[29]:
array([9.47746, 9.4737 , 9.16094, 8.75163, 7.83479, 7.735  , 7.62315, 7.4378 , 7.37651, 7.31588, 7.21891,
       7.09423, 6.50877, 6.42811, 6.41017, 5.99146, 5.87774, 5.78383, 5.72685, 5.30827, 5.27811, 4.94876,
       4.84419])
In [30]:
class_wts_dist = class_freq_sqrt
class_wts_dist = class_wts_dist/np.sum(class_wts_dist)
class_wts_dist
Out[30]:
array([0.12138, 0.12115, 0.10361, 0.08443, 0.05339, 0.05079, 0.04803, 0.04377, 0.04245, 0.04119, 0.03924,
       0.03687, 0.02751, 0.02642, 0.02619, 0.02124, 0.02007, 0.01915, 0.01861, 0.01509, 0.01487, 0.01261,
       0.01197])
In [31]:
np.sum(class_wts_dist)
Out[31]:
1.0
In [33]:
class_wt_map = {}
for k, v in zip(class_keys, class_wts_dist):
    class_wt_map[k] = np.round(v, 8)
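
Square-root damping compresses the class imbalance: from the printed values, the largest-to-smallest weight ratio is 0.12138 / 0.01197 ≈ 10:1, versus roughly 103:1 (≈ 10.1²) for the raw class counts. A quick check:

raw_ratio = class_freq[0] / class_freq[-1]              # ≈ 103
damped_ratio = class_wts_dist[0] / class_wts_dist[-1]   # ≈ 10.1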
In [44]:
arch = resnet50
bs = 128
sz = 224
old_save_name = "labels_31_3_solid_50_oversampled"
save_name = "pl_labels_1_4_solid_50_oversampled"
In [35]:
tfms_aug = [RandomRotate(10), RandomLighting(0.05, 0.05), RandomFlip()]
In [36]:
def get_data(sz, bs, aug=True, test_name="test"):
    # Build transforms (with or without augmentation) and the data object.
    if aug:
        tfms = tfms_from_model(arch, sz, aug_tfms=tfms_aug, max_zoom=1.4)
    else:
        tfms = tfms_from_model(arch, sz)
    data = ImageClassifierData.from_csv(PATH, train_path, val_idxs=val_idxs, csv_fname=labels_csv,
                                        test_name=test_name, suffix=".jpg", tfms=tfms, bs=bs)
    return data
In [38]:
data = get_data(sz, bs, aug=True)
learn = ConvLearner.pretrained(arch, data, precompute=False)
In [39]:
def wnll_loss(wts=None):
    # NLL loss with fixed per-class weights, in the (preds, targs) shape fastai expects.
    return lambda preds, targs: F.nll_loss(preds, targs, weight=wts)
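
A quick self-contained sanity check of the criterion with synthetic tensors (the learner will call it with log-probabilities and integer class targets):

import torch

log_probs = F.log_softmax(torch.randn(4, 23), dim=1)  # 4 samples, 23 classes
targs = torch.randint(0, 23, (4,))
crit = wnll_loss(torch.ones(23))
loss = crit(log_probs, targs)  # scalar weighted NLL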
In [40]:
def acc_th_cuda(weights=None):
    # Accuracy metric that optionally rescales predictions by per-class weights.
    def accuracy_th(preds, targs, weights=weights):
        if weights is None:
            return accuracy(preds, targs)
        return accuracy(preds * weights, targs)
    return accuracy_th
In [41]:
class_wts = []
for k in data.classes:
    class_wts.append(class_wt_map[k])
In [42]:
base_wtt = np.square(np.array(class_wts)).tolist()
In [43]:
learn.crit=wnll_loss(VV_(class_wts))
In [185]:
learn.metrics = [accuracy]

Load the previous model

In [45]:
learn.precompute=False
learn.load(f"{old_save_name}all_3_cyc_0")
In [46]:
learn.fit(0.01, 2)
epoch      trn_loss   val_loss   accuracy
    0      0.472179   0.758915   0.737274
    1      0.479626   0.743488   0.74142
Out[46]:
[0.74348766, 0.7414195753158407]
In [47]:
learn.fit(0.01, 2, cycle_len=1, cycle_mult=2, cycle_save_name=f"{save_name}precompute_1")
epoch      trn_loss   val_loss   accuracy
    0      0.424223   0.740313   0.745556
    1      0.433786   0.739033   0.746915
    2      0.41079    0.745149   0.748391
Out[47]:
[0.7451486, 0.7483913416558123]
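
With cycle_len=1 and cycle_mult=2 this is SGDR: two cosine-annealed cycles of 1 and 2 epochs (3 epochs total, matching the three rows above), with the learning rate restarting at 0.01 at each cycle boundary; cycle_save_name checkpoints the weights at the end of each cycle.

# Schedule implied by fit(0.01, 2, cycle_len=1, cycle_mult=2):
#   cycle 1: 1 epoch,  LR annealed from 0.01 towards 0
#   cycle 2: 2 epochs, LR restarts at 0.01 and anneals again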

Predict results

In [48]:
log_preds, y = learn.TTA()
In [49]:
# learn.freeze()
# learn.precompute=False
In [52]:
probs = np.mean(np.exp(log_preds), 0)
In [53]:
accuracy_np(probs, y)
Out[53]:
0.7535363621234814
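
learn.TTA() returns log-probabilities with an extra leading axis, one slice per augmented variant of the validation set, so the average above is taken in probability space: exponentiate first, then mean over axis 0. The number of variants below is an assumption about the default TTA settings:

# log_preds: (n_variants, n_val, n_classes); ~5 variants with default settings (assumed)
probs = np.mean(np.exp(log_preds), 0)  # average probabilities, not log-probabilities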
In [187]:
log_preds_2 = learn.predict()
In [188]:
probs = np.exp(log_preds_2)
In [189]:
accuracy_np(probs, data.val_ds.y)
Out[189]:
0.6886337160925279
In [224]:
def acc(preds, targs, k):
    # Per-class accuracy (recall): share of class-k samples predicted as class k.
    preds = np.argmax(preds, 1)
    mask = np.where(targs == k)[0]
    res = (preds[mask] == targs[mask]).mean()
    return np.round(res * 100, 2)
In [225]:
index = 0
y = data.val_ds.y
for k in data.classes:
    print(k, acc(probs, y, data.classes.index(k)))
Abstract 40.83
Biker 66.67
Camouflage 87.5
Checked 47.06
Colourblocked 83.13
Conversational 55.0
Floral 66.23
Geometric 54.29
Graphic 47.69
Humour_and_Comic 62.94
Music 27.27
People_and_Places 47.37
Polka_Dots 33.33
Self_Design 25.81
Solid 88.8
Sports 32.81
Sports_and_Team_Jersey 60.87
Striped 92.39
Superhero 69.92
Tie_and_Dye 60.61
Tribal 58.82
Typography 84.32
Varsity 37.1
In [53]:
def acc_th_np(probs, targs, weights=None):
    if weights is None:
        return accuracy_np(probs, targs)
    else:
        return accuracy_np(probs - weights, targs)
In [192]:
from sklearn.metrics import confusion_matrix
def plot_cm(probs, y, normalize=False):
    # Plot a confusion matrix of argmax predictions against the true labels.
    preds = np.argmax(probs, axis=1)
    cm = confusion_matrix(y, preds)
    plot_confusion_matrix(cm, data.classes, figsize=(10, 10), normalize=normalize)
In [226]:
y = data.val_ds.y
In [227]:
plot_cm(probs, y, False)
[[ 89 2 1 0 4 5 8 14 11 5 1 13 0 0 16 1 0 4 5 3 2 34 0]
 [ 4 52 0 0 0 0 0 0 5 0 0 3 0 0 1 0 0 0 1 0 0 12 0]
 [ 1 0 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [ 0 0 0 8 1 0 0 5 0 0 0 0 0 0 2 0 0 0 0 0 0 1 0]
 [ 0 0 0 0 207 0 0 1 0 0 1 0 0 1 17 0 1 13 0 2 0 6 0]
 [ 3 1 0 0 0 77 5 17 5 5 0 1 0 0 6 0 0 2 0 0 0 18 0]
 [ 13 0 0 0 0 4 102 4 11 1 0 1 0 0 4 0 0 1 0 0 0 12 1]
 [ 17 0 0 2 4 7 4 95 1 0 0 0 2 1 19 0 0 7 0 0 4 12 0]
 [ 38 10 0 1 2 15 14 10 289 16 4 47 0 1 6 7 1 2 5 0 0 138 0]
 [ 4 1 0 0 0 5 3 0 12 90 1 1 0 0 1 2 0 2 10 0 0 11 0]
 [ 2 0 0 0 0 1 0 1 5 1 12 9 0 0 0 0 0 0 1 0 0 12 0]
 [ 13 1 0 0 0 1 1 1 17 1 4 81 0 0 4 0 0 1 1 0 0 43 2]
 [ 0 0 0 0 0 1 0 4 0 0 0 0 3 0 1 0 0 0 0 0 0 0 0]
 [ 1 0 0 0 1 0 0 1 0 0 0 1 0 8 14 0 0 3 0 0 0 2 0]
 [ 7 0 1 0 56 3 3 3 6 0 0 2 1 5 1150 0 0 15 0 2 0 40 1]
 [ 1 1 0 0 1 0 0 2 5 1 0 2 0 0 1 21 3 0 2 0 0 23 1]
 [ 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 2 28 0 0 0 0 13 1]
 [ 4 0 0 2 16 0 1 10 1 1 0 1 0 2 27 0 1 910 0 0 0 9 0]
 [ 4 1 0 0 0 1 0 1 9 11 1 2 0 0 2 1 0 0 86 0 0 4 0]
 [ 2 0 0 0 0 0 0 0 4 0 0 0 0 0 3 0 0 0 0 20 0 4 0]
 [ 1 0 0 0 0 0 0 3 0 0 0 0 0 0 2 0 0 1 0 0 10 0 0]
 [ 22 17 1 0 4 5 7 9 35 15 2 17 1 0 46 2 1 4 5 0 0 1124 16]
 [ 0 1 0 0 2 0 0 1 1 1 0 1 0 0 2 1 0 0 0 0 0 29 23]]
[Plot: confusion matrix over data.classes]

Create submission

In [54]:
# test_log_preds = learn.predict(is_test=True)
test_log_tta, _ = learn.TTA(is_test=True)
test_log_preds = np.mean(test_log_tta, 0)
In [55]:
test_probs = np.exp(test_log_preds)
In [56]:
def create_myntra_submission(probs, submission_file, preds_classes=None, default="Solid", fnames=data.test_ds.fnames):
    # Fill the sample submission with predicted classes, keyed by the numeric id in each file name.
    test_df = pd.read_csv(f'{PATH}myntra_test.csv')
    y_hat = np.argmax(probs, 1)
    test_df["Sub_category"] = default
    for y_pred, fname in tqdm(zip(y_hat, fnames)):
        loc = int(fname.split("/")[1].split(".")[0])
        data_c = data.classes[y_pred]
        klass = data_c.replace("_", " ")
        if preds_classes is None or data_c in preds_classes:
            test_df.loc[loc, "Sub_category"] = klass

    test_df.to_csv(f'{PATH}{submission_file}', index=False)
    return test_df
In [57]:
submission_file = "pl_1_rn50_7_27_precompute_1.csv"
In [58]:
test_df = create_myntra_submission(test_probs, submission_file)
14723it [00:27, 531.00it/s]
In [59]:
FileLink(f'{PATH}{submission_file}')
Out[59]:
In [60]:
ss = pd.read_csv(f'{PATH}{submission_file}')
In [62]:
from utils.ensemble import *
In [64]:
probs_df = get_test_df(test_probs, data.test_ds.fnames, data.classes, "pl_1_rn50_7_27_precompute_1")
read_probs_df = read_test_df("pl_1_rn50_7_27_precompute_1")
join_test_df([read_probs_df], data.classes).head()
Out[64]:
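
The utils.ensemble helpers used above are not shown; they apparently persist each model's per-class test probabilities so that several models can be averaged later. A rough sketch under that assumption (all signatures hypothetical):

# Hypothetical sketches of the utils.ensemble helpers.
def get_test_df(probs, fnames, classes, name):
    df = pd.DataFrame(probs, columns=classes)
    df.insert(0, "fname", fnames)
    df.to_csv(f"{PATH}{name}.csv", index=False)
    return df

def read_test_df(name):
    return pd.read_csv(f"{PATH}{name}.csv")

def join_test_df(dfs, classes):
    # Average class probabilities across models (rows assumed aligned by fname).
    avg = sum(df[classes].values for df in dfs) / len(dfs)
    out = pd.DataFrame(avg, columns=classes)
    out.insert(0, "fname", dfs[0]["fname"])
    return out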