Jovian
⭐️
Sign In
Learn data science and machine learning by building real-world projects on Jovian
In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 
In [2]:
import jovian
from fastai.vision import *
import pandas as pd
import numpy as np

Read Dataset Files

In [3]:
# Category names: the first space-delimited field of every line in the file,
# ignoring the two leading lines.
with open('list_category_cloth.txt', 'r') as f:
    categories = [row.split(' ')[0] for row in f.readlines()[2:]]
In [4]:
# get image category map
# Every line after the first two is split on runs of whitespace into a list of
# fields (later loaded as the image path and its category label).
# `str.split()` with no argument drops empty tokens and the trailing newline,
# so trailing spaces or tabs cannot leave a stray '' field behind; lines that
# contain no fields at all are skipped rather than stored as ragged rows.
with open('list_category_image.txt', 'r') as f:
    images = []
    for i, line in enumerate(f):
        if i < 2:
            continue
        fields = line.split()
        if fields:
            images.append(fields)
In [5]:
# get train, valid, test split
# Every line after the first two is split on runs of whitespace into a list of
# fields (later loaded as the image path and the name of its partition).
# `str.split()` with no argument drops empty tokens and the trailing newline,
# so trailing spaces or tabs cannot leave a stray '' field behind; lines that
# contain no fields at all are skipped rather than stored as ragged rows.
with open('list_eval_partition.txt', 'r') as f:
    images_partition = []
    for i, line in enumerate(f):
        if i < 2:
            continue
        fields = line.split()
        if fields:
            images_partition.append(fields)
In [6]:
# Tabulate the parsed rows: one frame pairs each image with its category
# label, the other pairs each image with the partition it belongs to.
image_columns = ['images', 'category_label']
partition_columns = ['images', 'dataset']
data_df = pd.DataFrame(data=images, columns=image_columns)
partition_df = pd.DataFrame(data=images_partition, columns=partition_columns)
In [7]:
# The labels were read from text, so convert that column to integers.
data_df = data_df.astype({'category_label': int})
In [8]:
# Attach the partition name to every image row, matching on the image path
# (inner join, i.e. the default behaviour of merge).
data_df = pd.merge(left=data_df, right=partition_df, how='inner', on='images')
In [9]:
# Number of images per partition, largest first.
data_df['dataset'].value_counts(sort=True, ascending=False)
Out[9]:
train    209222
val       40000
test      40000
Name: dataset, dtype: int64
In [10]:
# Translate each 1-based label id into its category name.
data_df['category'] = [categories[int(label) - 1] for label in data_df['category_label']]
In [11]:
# How many distinct label ids are actually used by the images.
# NOTE: it seems a few labels were merged into the Dress label.
len(data_df['category_label'].unique())
Out[11]:
46
In [12]:
# Peek at the first five rows of the assembled table.
data_df.head(n=5)
Out[12]:

Prepare DataBunch

In [13]:
import os
from pathlib import Path

# Root directory that the image paths in `data_df` are resolved against.
# Defaults to the original location; set the DEEPFASHION_DIR environment
# variable to run the notebook on another machine without editing this cell.
images_path = Path(os.environ.get('DEEPFASHION_DIR', '/home/jupyter/deepFashion'))
In [14]:
# Build the labelled train/validation item lists from the dataframe.
# `split_by_idxs` picks items by position, so the row positions are computed
# explicitly instead of relying on the index labels of `data_df` happening to
# be 0..n-1. Rows whose `dataset` is neither 'train' nor 'val' end up in
# neither split.
train_idx = np.flatnonzero((data_df['dataset'] == 'train').values)
valid_idx = np.flatnonzero((data_df['dataset'] == 'val').values)
data_source = (ImageList.from_df(df=data_df, path=images_path, cols='images')
                    .split_by_idxs(train_idx, valid_idx)
                    .label_from_df(cols='category')
              )
In [15]:
# Transforms produced by `get_transforms` with its default arguments.
tmfs = get_transforms()

# Apply the transforms at size 224, batch with bs=128 and normalise with
# `imagenet_stats`.
data = (
    data_source
    .transform(tmfs, size=224)
    .databunch(bs=128)
    .normalize(imagenet_stats)
)
In [16]:
# Register the rows marked 'test' as the test set of `data`.
test_rows = data_df.loc[data_df['dataset'] == 'test']
test_data = ImageList.from_df(df=test_rows, path=images_path, cols='images')
data.add_test(test_data)
In [17]:
# To maintain the order of images in train data, turning off shuffle
# data.train_dl = data.train_dl.new(shuffle=False)
In [23]:
# Visual sanity check: display a batch from `data` using the default
# arguments of `show_batch` (presumably sample images with their labels —
# confirm against the rendered output).
data.show_batch()