import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import warnings
import jovian
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook
%matplotlib inline
# Reproducibility and notebook hygiene.
np.random.seed(1337)
warnings.filterwarnings(action='ignore')

# Separator used further down to take the file name off a globbed path.
split_char = '/'

# Notebook-style inspection of the mounted datasets; the value is not stored.
os.listdir('../input')
# Recorded cell output: ['densenet-keras', 'petfinder-adoption-prediction']

# Competition tables: labelled train rows, unlabelled test rows and the
# submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/petfinder-adoption-prediction/train/train.csv',
        '../input/petfinder-adoption-prediction/test/test.csv',
        '../input/petfinder-adoption-prediction/test/sample_submission.csv',
    )
)
import cv2
import os
from keras.applications.densenet import preprocess_input, DenseNet121
# Output: Using TensorFlow backend.
# Commit the current notebook state through the jovian client.
jovian.commit()
# Output: [jovian] Saving notebook..
def resize_to_square(im, size=None):
    """Scale an image to fit a square canvas and pad the remainder with black.

    The aspect ratio is preserved: the longer side is scaled to the target
    edge length and the shorter side is centred between black borders.

    Args:
        im: Image array whose first two axes are (height, width).
        size: Edge length of the output square in pixels. Defaults to the
            module-level ``img_size`` so existing callers are unaffected.

    Returns:
        The padded image produced by ``cv2.copyMakeBorder``.
    """
    target = img_size if size is None else size
    old_height, old_width = im.shape[:2]
    ratio = float(target) / max(old_height, old_width)
    # Clamp to one pixel: a very elongated image would otherwise truncate to
    # a zero-sized side, which cv2.resize rejects.
    new_height = max(1, int(old_height * ratio))
    new_width = max(1, int(old_width * ratio))
    # cv2.resize takes the size as (width, height).
    im = cv2.resize(im, (new_width, new_height))
    delta_w = target - new_width
    delta_h = target - new_height
    # Split the padding as evenly as possible; the odd pixel goes to the
    # bottom / right border.
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)
    color = [0, 0, 0]
    return cv2.copyMakeBorder(im, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=color)
def load_image(path, pet_id):
    """Load the first photo of a pet and prepare it for the DenseNet backbone.

    Args:
        path: Directory prefix, including the trailing separator, that holds
            the images.
        pet_id: PetID; the file read is ``{path}{pet_id}-1.jpg``.

    Returns:
        The square-resized image after ``preprocess_input``.

    Raises:
        FileNotFoundError: If the file is missing or cannot be decoded.
    """
    filename = f'{path}{pet_id}-1.jpg'
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than an AttributeError later.
        raise FileNotFoundError(f'could not read image: {filename}')
    # cv2.imread returns channels in BGR order, while the DenseNet
    # preprocessing normalises them per channel in RGB order.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    new_image = resize_to_square(image)
    return preprocess_input(new_image)
# Edge length (pixels) of the square network input, and the number of images
# sent through the network per predict call.
img_size = 256
batch_size = 256
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K

# DenseNet121 without its classification head, used as a fixed feature
# extractor. The input shape follows img_size so that the batches built by
# the extraction loops always match the model.
inp = Input((img_size, img_size, 3))
backbone = DenseNet121(input_tensor=inp,
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top=False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
# Shrink the pooled vector by averaging every 4 consecutive entries: add a
# trailing axis so AveragePooling1D pools along the feature axis, then drop
# that axis again.
x = Lambda(lambda t: K.expand_dims(t, axis=-1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda t: t[:, :, 0])(x)
m = Model(inp, out)
def _extract_image_features(model, pet_ids, image_dir, batch_size, img_size):
    """Run ``model`` over the first photo of every pet and collect the outputs.

    Pets whose photo cannot be loaded keep an all-zero input image, so every
    pet still receives a feature row.

    Args:
        model: Object exposing ``predict(batch)``.
        pet_ids: Sequence of PetIDs to process, in output order.
        image_dir: Directory prefix handed to ``load_image``.
        batch_size: Number of images per ``predict`` call.
        img_size: Edge length of the square input images.

    Returns:
        DataFrame indexed by PetID with one ``pic_{i}`` column per model
        output.
    """
    features = {}
    # Ceil division. The previous "// batch_size + 1" scheduled an extra,
    # empty batch whenever the number of pets was an exact multiple of
    # batch_size.
    n_batches = -(-len(pet_ids) // batch_size)
    for b in tqdm(range(n_batches)):
        batch_pets = pet_ids[b * batch_size:(b + 1) * batch_size]
        batch_images = np.zeros((len(batch_pets), img_size, img_size, 3))
        for i, pet_id in enumerate(batch_pets):
            try:
                batch_images[i] = load_image(image_dir, pet_id)
            except Exception:
                # Best effort: a pet without a readable first photo keeps the
                # zero image. Unlike a bare ``except`` this still lets
                # KeyboardInterrupt / SystemExit through.
                continue
        batch_preds = model.predict(batch_images)
        for pet_id, pred in zip(batch_pets, batch_preds):
            features[pet_id] = pred
    feats = pd.DataFrame.from_dict(features, orient='index')
    feats.columns = [f'pic_{i}' for i in range(feats.shape[1])]
    return feats


train_feats = _extract_image_features(
    m, train['PetID'].values,
    "../input/petfinder-adoption-prediction/train_images/",
    batch_size, img_size)
test_feats = _extract_image_features(
    m, test['PetID'].values,
    "../input/petfinder-adoption-prediction/test_images/",
    batch_size, img_size)
# Expose the PetID index as a regular column so the frames can be merged on it.
train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)
test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)
all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']]
all_ids.shape

# Compress the CNN features of train and test together into 32 SVD components.
n_components = 32
svd_ = TruncatedSVD(n_components=n_components, random_state=1337)
features_df = pd.concat([train_feats, test_feats], axis=0)
# Pick the feature columns by prefix instead of assuming there are exactly
# 256 of them, so a change to the network head does not break this step.
pic_columns = [c for c in features_df.columns if str(c).startswith('pic_')]
features = features_df[pic_columns].values
svd_col = svd_.fit_transform(features)
svd_col = pd.DataFrame(svd_col)
svd_col = svd_col.add_prefix('IMG_SVD_')
# Label each row with the PetID its features were computed from. Pairing the
# components positionally with a separately built id frame would silently
# misalign rows if the two ever differed in order or length.
feature_ids = features_df[['PetID']].reset_index(drop=True)
img_features = pd.concat([feature_ids, svd_col], axis=1)
# Lookup tables shipped with the competition data. Each variable now reads
# the file matching its name (the color and state files were swapped).
labels_breed = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')
labels_color = pd.read_csv('../input/petfinder-adoption-prediction/color_labels.csv')
labels_state = pd.read_csv('../input/petfinder-adoption-prediction/state_labels.csv')
# Train split: alphabetically ordered listings of the image, metadata and
# sentiment files.
train_image_files = glob.glob('../input/petfinder-adoption-prediction/train_images/*.jpg')
train_image_files.sort()
train_metadata_files = glob.glob('../input/petfinder-adoption-prediction/train_metadata/*.json')
train_metadata_files.sort()
train_sentiment_files = glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json')
train_sentiment_files.sort()

print(f'num of train images files: {len(train_image_files)}')
print(f'num of train metadata files: {len(train_metadata_files)}')
print(f'num of train sentiment files: {len(train_sentiment_files)}')

# Test split: the same three listings.
test_image_files = glob.glob('../input/petfinder-adoption-prediction/test_images/*.jpg')
test_image_files.sort()
test_metadata_files = glob.glob('../input/petfinder-adoption-prediction/test_metadata/*.json')
test_metadata_files.sort()
test_sentiment_files = glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json')
test_sentiment_files.sort()

print(f'num of test images files: {len(test_image_files)}')
print(f'num of test metadata files: {len(test_metadata_files)}')
print(f'num of test sentiment files: {len(test_sentiment_files)}')
# Images:
train_df_ids = train[['PetID']]
print(train_df_ids.shape)

# Metadata: one row per metadata file, tagged with the PetID taken from the
# part of the file name in front of the first dash.
train_df_metadata = pd.DataFrame(train_metadata_files)
train_df_metadata.columns = ['metadata_filename']
train_metadata_pets = (
    train_df_metadata['metadata_filename']
    .str.split(split_char).str[-1]
    .str.split('-').str[0]
)
train_df_metadata['PetID'] = train_metadata_pets
train_metadata_pet_ids = train_metadata_pets.unique()
print(len(train_metadata_pet_ids))
pets_with_metadatas = len(
    np.intersect1d(train_metadata_pet_ids, train_df_ids['PetID'].unique()))
print(f'fraction of pets with metadata: {pets_with_metadatas / train_df_ids.shape[0]:.3f}')

# Sentiment: one row per sentiment file; here the PetID is the file name up
# to the first dot.
train_df_sentiment = pd.DataFrame(train_sentiment_files)
train_df_sentiment.columns = ['sentiment_filename']
train_sentiment_pets = (
    train_df_sentiment['sentiment_filename']
    .str.split(split_char).str[-1]
    .str.split('.').str[0]
)
train_df_sentiment['PetID'] = train_sentiment_pets
train_sentiment_pet_ids = train_sentiment_pets.unique()
print(len(train_sentiment_pet_ids))
pets_with_sentiments = len(
    np.intersect1d(train_sentiment_pet_ids, train_df_ids['PetID'].unique()))
print(f'fraction of pets with sentiment: {pets_with_sentiments / train_df_ids.shape[0]:.3f}')
# Images:
test_df_ids = test[['PetID']]
print(test_df_ids.shape)

# Metadata: one row per metadata file, tagged with the PetID taken from the
# part of the file name in front of the first dash.
test_df_metadata = pd.DataFrame(test_metadata_files)
test_df_metadata.columns = ['metadata_filename']
test_metadata_pets = (
    test_df_metadata['metadata_filename']
    .str.split(split_char).str[-1]
    .str.split('-').str[0]
)
test_df_metadata['PetID'] = test_metadata_pets
test_metadata_pet_ids = test_metadata_pets.unique()
print(len(test_metadata_pet_ids))
pets_with_metadatas = len(
    np.intersect1d(test_metadata_pet_ids, test_df_ids['PetID'].unique()))
print(f'fraction of pets with metadata: {pets_with_metadatas / test_df_ids.shape[0]:.3f}')

# Sentiment: one row per sentiment file; here the PetID is the file name up
# to the first dot.
test_df_sentiment = pd.DataFrame(test_sentiment_files)
test_df_sentiment.columns = ['sentiment_filename']
test_sentiment_pets = (
    test_df_sentiment['sentiment_filename']
    .str.split(split_char).str[-1]
    .str.split('.').str[0]
)
test_df_sentiment['PetID'] = test_sentiment_pets
test_sentiment_pet_ids = test_sentiment_pets.unique()
print(len(test_sentiment_pet_ids))
pets_with_sentiments = len(
    np.intersect1d(test_sentiment_pet_ids, test_df_ids['PetID'].unique()))
print(f'fraction of pets with sentiment: {pets_with_sentiments / test_df_ids.shape[0]:.3f}')
class PetFinderParser(object):
    """Turn sentiment and image-metadata JSON documents into one-row DataFrames."""

    def __init__(self, debug=False):
        self.debug = debug
        # Separator used when several entity names / label descriptions are
        # flattened into a single string.
        self.sentence_sep = ' '
        self.extract_sentiment_text = False

    def open_json_file(self, filename):
        """Read ``filename`` as UTF-8 and return the decoded JSON document."""
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _mean_field(items, key):
        """Return the mean of ``item[key]`` over the items carrying ``key``.

        Returns NaN when no item has the key (including an empty ``items``).
        """
        values = [item[key] for item in items if key in item]
        if not values:
            return np.nan
        return np.asarray(values).mean()

    def parse_sentiment_file(self, file):
        """
        Parse sentiment file. Output DF with sentiment features.

        Args:
            file: Decoded sentiment JSON. ``documentSentiment`` is required;
                ``entities`` and ``sentences`` may be missing or empty.

        Returns:
            One-row DataFrame with ``sentiment_``-prefixed columns: the
            document-level values, sum/mean/variance of the per-sentence
            magnitude and score, and the space-joined entity names.
        """
        file_sentiment = file['documentSentiment']
        entity_names = [x['name'] for x in file.get('entities', [])]
        file_entities = self.sentence_sep.join(entity_names)

        sentence_scores = [x['sentiment'] for x in file.get('sentences', [])]
        if sentence_scores:
            sentences_df = pd.DataFrame(sentence_scores)
        else:
            # A document without sentences used to fail with a KeyError on
            # 'magnitude'. Keep the expected columns so the statistics below
            # come out as 0 (sums) and NaN (mean / variance) instead.
            sentences_df = pd.DataFrame(
                {'magnitude': [], 'score': []}, dtype=float)

        sentence_stats = pd.DataFrame(
            {
                'magnitude_sum': sentences_df['magnitude'].sum(axis=0),
                'score_sum': sentences_df['score'].sum(axis=0),
                'magnitude_mean': sentences_df['magnitude'].mean(axis=0),
                'score_mean': sentences_df['score'].mean(axis=0),
                'magnitude_var': sentences_df['magnitude'].var(axis=0),
                'score_var': sentences_df['score'].var(axis=0),
            }, index=[0]
        )
        df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient='index').T
        df_sentiment = pd.concat([df_sentiment, sentence_stats], axis=1)
        df_sentiment['entities'] = file_entities
        return df_sentiment.add_prefix('sentiment_')

    def parse_metadata_file(self, file):
        """
        Parse metadata file. Output DF with metadata features.

        Args:
            file: Decoded image-annotation JSON. Every annotation section is
                optional; a missing section yields NaN for its features.

        Returns:
            One-row DataFrame with ``metadata_``-prefixed columns: mean label
            score, mean dominant-colour score and pixel fraction, mean crop
            confidence and importance, and the space-joined label
            descriptions.
        """
        if 'labelAnnotations' in file:
            file_annots = file['labelAnnotations']
            file_top_score = self._mean_field(file_annots, 'score')
            file_top_desc = [
                x['description'] for x in file_annots if 'description' in x]
        else:
            file_top_score = np.nan
            file_top_desc = ['']

        # Missing sections fall back to empty lists instead of raising.
        file_colors = (file.get('imagePropertiesAnnotation', {})
                       .get('dominantColors', {})
                       .get('colors', []))
        file_crops = file.get('cropHintsAnnotation', {}).get('cropHints', [])

        file_color_score = self._mean_field(file_colors, 'score')
        file_color_pixelfrac = self._mean_field(file_colors, 'pixelFraction')
        file_crop_conf = self._mean_field(file_crops, 'confidence')

        # As before, the first crop hint decides whether importance is
        # reported at all; an empty crop list no longer raises IndexError.
        if file_crops and 'importanceFraction' in file_crops[0]:
            file_crop_importance = self._mean_field(
                file_crops, 'importanceFraction')
        else:
            file_crop_importance = np.nan

        df_metadata = {
            'annots_score': file_top_score,
            'color_score': file_color_score,
            'color_pixelfrac': file_color_pixelfrac,
            'crop_conf': file_crop_conf,
            'crop_importance': file_crop_importance,
            'annots_top_desc': self.sentence_sep.join(file_top_desc)
        }
        df_metadata = pd.DataFrame.from_dict(df_metadata, orient='index').T
        return df_metadata.add_prefix('metadata_')
def extract_additional_features(pet_id, mode='train'):
    """Collect the parsed sentiment and metadata frames for one pet.

    Args:
        pet_id: PetID used to locate the JSON files.
        mode: Dataset split; selects the ``{mode}_sentiment`` and
            ``{mode}_metadata`` directories.

    Returns:
        ``[df_sentiment, dfs_metadata]``. Each element is a DataFrame, or an
        empty list when the pet has no such file(s).
    """
    base_dir = '../input/petfinder-adoption-prediction'

    sentiment_filename = f'{base_dir}/{mode}_sentiment/{pet_id}.json'
    try:
        sentiment_file = pet_parser.open_json_file(sentiment_filename)
    except FileNotFoundError:
        # Not every pet has a sentiment file; callers filter on the type.
        df_sentiment = []
    else:
        df_sentiment = pet_parser.parse_sentiment_file(sentiment_file)
        df_sentiment['PetID'] = pet_id

    # Metadata files are named "<PetID>-<n>.json". Anchoring the pattern on
    # the dash, and escaping the id, keeps the glob from also matching the
    # files of another pet whose id merely starts with this one.
    pattern = f'{base_dir}/{mode}_metadata/{glob.escape(str(pet_id))}-*.json'
    dfs_metadata = []
    for f in sorted(glob.glob(pattern)):
        metadata_file = pet_parser.open_json_file(f)
        df_metadata = pet_parser.parse_metadata_file(metadata_file)
        df_metadata['PetID'] = pet_id
        dfs_metadata.append(df_metadata)
    if dfs_metadata:
        dfs_metadata = pd.concat(dfs_metadata, ignore_index=True, sort=False)

    return [df_sentiment, dfs_metadata]
pet_parser = PetFinderParser()

# Set to True to run the extraction on a small subset of pets only.
debug = False
train_pet_ids = train['PetID'].unique()
test_pet_ids = test['PetID'].unique()
if debug:
    train_pet_ids, test_pet_ids = train_pet_ids[:1000], test_pet_ids[:500]

# Train: parse every pet's JSON files through joblib, then stack the
# per-pet frames. Pets without a file contribute a list, which is skipped.
dfs_train = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(pet_id, mode='train')
    for pet_id in train_pet_ids
)
train_dfs_sentiment = [
    sentiment for sentiment, _ in dfs_train
    if isinstance(sentiment, pd.DataFrame)
]
train_dfs_metadata = [
    metadata for _, metadata in dfs_train
    if isinstance(metadata, pd.DataFrame)
]
train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False)
train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False)
print(train_dfs_sentiment.shape, train_dfs_metadata.shape)

# Test: same procedure.
dfs_test = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(pet_id, mode='test')
    for pet_id in test_pet_ids
)
test_dfs_sentiment = [
    sentiment for sentiment, _ in dfs_test
    if isinstance(sentiment, pd.DataFrame)
]
test_dfs_metadata = [
    metadata for _, metadata in dfs_test
    if isinstance(metadata, pd.DataFrame)
]
test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False)
test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False)
print(test_dfs_sentiment.shape, test_dfs_metadata.shape)
aggregates = ['sum', 'mean', 'var']
sent_agg = ['sum']


def _aggregate_per_pet(frame, text_column, agg_funcs, suffix_with_agg):
    """Collapse a per-file feature frame to one row per PetID.

    Args:
        frame: DataFrame with a ``PetID`` column, one text column and
            otherwise numeric feature columns.
        text_column: Name of the text column.
        agg_funcs: Aggregation names applied to every numeric column.
        suffix_with_agg: If True, output columns are named
            ``<column>_<AGG>``; if False they keep the plain column name,
            which is only unambiguous with a single aggregation.

    Returns:
        ``(desc, grouped)``: ``desc`` holds the space-joined unique texts per
        PetID, ``grouped`` the aggregated numeric columns per PetID.
    """
    desc = frame.groupby(['PetID'])[text_column].unique().reset_index()
    desc[text_column] = desc[text_column].apply(lambda parts: ' '.join(parts))

    numeric = frame.drop([text_column], axis=1)
    for col in numeric.columns:
        if 'PetID' not in col:
            # Parsed values arrive as objects; aggregate them as floats.
            numeric[col] = numeric[col].astype(float)
    grouped = numeric.groupby(['PetID']).agg(agg_funcs)
    # Flatten the (column, aggregation) MultiIndex into plain names.
    if suffix_with_agg:
        names = [f'{c[0]}_{c[1].upper()}' for c in grouped.columns.tolist()]
    else:
        names = [f'{c[0]}' for c in grouped.columns.tolist()]
    grouped.columns = pd.Index(names)
    return desc, grouped.reset_index()


# Train
train_metadata_desc, train_metadata_gr = _aggregate_per_pet(
    train_dfs_metadata, 'metadata_annots_top_desc', aggregates, True)
train_sentiment_desc, train_sentiment_gr = _aggregate_per_pet(
    train_dfs_sentiment, 'sentiment_entities', sent_agg, False)

# Test
test_metadata_desc, test_metadata_gr = _aggregate_per_pet(
    test_dfs_metadata, 'metadata_annots_top_desc', aggregates, True)
test_sentiment_desc, test_sentiment_gr = _aggregate_per_pet(
    test_dfs_sentiment, 'sentiment_entities', sent_agg, False)
# Train merges: attach the aggregated sentiment / metadata features and the
# joined text columns to every train row (left joins keep all pets).
train_proc = train.copy()
for extra in (train_sentiment_gr, train_metadata_gr,
              train_metadata_desc, train_sentiment_desc):
    train_proc = train_proc.merge(extra, how='left', on='PetID')

# Test merges: same four joins.
test_proc = test.copy()
for extra in (test_sentiment_gr, test_metadata_gr,
              test_metadata_desc, test_sentiment_desc):
    test_proc = test_proc.merge(extra, how='left', on='PetID')

print(train_proc.shape, test_proc.shape)

# The joins must neither drop nor duplicate rows.
assert train_proc.shape[0] == train.shape[0]
assert test_proc.shape[0] == test.shape[0]
def _breed_label_columns(frame, breed_column, suffix, prefix):
    """Look up ``frame[breed_column]`` in ``labels_breed``.

    Returns the label columns that follow the two key columns of the merge
    result, renamed with ``prefix``. Row order follows ``frame``.
    """
    looked_up = frame[[breed_column]].merge(
        labels_breed, how='left',
        left_on=breed_column, right_on='BreedID',
        suffixes=('', suffix))
    # Columns 0 and 1 are the breed code and BreedID used for the join.
    return looked_up.iloc[:, 2:].add_prefix(prefix)


train_breed_main = _breed_label_columns(
    train_proc, 'Breed1', '_main_breed', 'main_breed_')
train_breed_second = _breed_label_columns(
    train_proc, 'Breed2', '_second_breed', 'second_breed_')
train_proc = pd.concat(
    [train_proc, train_breed_main, train_breed_second], axis=1)

test_breed_main = _breed_label_columns(
    test_proc, 'Breed1', '_main_breed', 'main_breed_')
test_breed_second = _breed_label_columns(
    test_proc, 'Breed2', '_second_breed', 'second_breed_')
test_proc = pd.concat(
    [test_proc, test_breed_main, test_breed_second], axis=1)

print(train_proc.shape, test_proc.shape)
# Stack train and test so every feature transform sees both splits.
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
X_temp = X.copy()
text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
to_drop_columns = ['PetID', 'Name', 'RescuerID']

# Number of pets listed by each rescuer, over train and test together.
rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']
X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

for i in categorical_columns:
    # Plain column assignment replaces the column together with its dtype.
    # ``.loc[:, i] = ...`` writes into the existing object column under
    # recent pandas, leaving integer codes stored as objects.
    X_temp[i] = pd.factorize(X_temp[i])[0]

# Missing texts become the literal 'none'. fillna returns a new frame, so
# nothing is written into a slice of X_temp.
X_text = X_temp[text_columns].fillna('none')

X_temp['Length_Description'] = X_text['Description'].map(len)
X_temp['Length_metadata_annots_top_desc'] = X_text['metadata_annots_top_desc'].map(len)
X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len)
n_components = 16
text_features = []
# Generate text features: TF-IDF over word 1- to 3-grams, reduced to
# n_components SVD dimensions per text column.
for i in X_text.columns:
    # Initialize decomposition methods:
    print(f'generating features from: {i}')
    # The flags are real booleans: recent scikit-learn validates parameter
    # types and rejects the integer 1 for them.
    tfv = TfidfVectorizer(min_df=2, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=r'(?u)\b\w+\b',
                          ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                          sublinear_tf=True)
    svd_ = TruncatedSVD(
        n_components=n_components, random_state=1337)
    tfidf_col = tfv.fit_transform(X_text.loc[:, i].values)
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('TFIDF_{}_'.format(i))
    text_features.append(svd_col)

text_features = pd.concat(text_features, axis=1)
X_temp = pd.concat([X_temp, text_features], axis=1)
# The raw text columns are replaced by their SVD components.
X_temp = X_temp.drop(list(X_text.columns), axis=1)
X_temp = X_temp.merge(img_features, how='left', on='PetID')
from PIL import Image
train_df_ids = train[['PetID']]
test_df_ids = test[['PetID']]

# One row per image file, tagged with the PetID taken from the part of the
# file name in front of the first dash.
train_df_imgs = pd.DataFrame(train_image_files)
train_df_imgs.columns = ['image_filename']
train_imgs_pets = (
    train_df_imgs['image_filename']
    .str.split(split_char).str[-1]
    .str.split('-').str[0]
)

test_df_imgs = pd.DataFrame(test_image_files)
test_df_imgs.columns = ['image_filename']
test_imgs_pets = (
    test_df_imgs['image_filename']
    .str.split(split_char).str[-1]
    .str.split('-').str[0]
)

train_df_imgs['PetID'] = train_imgs_pets
test_df_imgs['PetID'] = test_imgs_pets
def getSize(filename):
    """Return the size of ``filename`` in bytes."""
    return os.path.getsize(filename)
def getDimensions(filename):
    """Return the ``(width, height)`` of the image stored at ``filename``."""
    # Image.open keeps the underlying file open until the image is closed.
    # The context manager releases the handle right away, which matters when
    # this runs over every image of the dataset.
    with Image.open(filename) as img:
        return img.size
# Add the file size in bytes and the pixel width / height of every image,
# first for the train listing, then for the test listing.
for imgs in (train_df_imgs, test_df_imgs):
    imgs['image_size'] = imgs['image_filename'].apply(getSize)
    dimensions = imgs['image_filename'].apply(getDimensions)
    imgs['width'] = dimensions.apply(lambda wh: wh[0])
    imgs['height'] = dimensions.apply(lambda wh: wh[1])
# Per-pet sum, mean and variance of file size, width and height.
aggs = {
    'image_size': ['sum', 'mean', 'var'],
    'width': ['sum', 'mean', 'var'],
    'height': ['sum', 'mean', 'var'],
}
# Flat "<column>_<aggregation>" names, in the order the aggregation emits them.
new_columns = [f'{k}_{agg}' for k, funcs in aggs.items() for agg in funcs]

agg_train_imgs = train_df_imgs.groupby('PetID').agg(aggs)
agg_train_imgs.columns = new_columns
agg_train_imgs = agg_train_imgs.reset_index()

agg_test_imgs = test_df_imgs.groupby('PetID').agg(aggs)
agg_test_imgs.columns = new_columns
agg_test_imgs = agg_test_imgs.reset_index()

agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True)

X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')
# Identifier columns are not model features.
X_temp = X_temp.drop(to_drop_columns, axis=1)
# Rows with a finite AdoptionSpeed are the train rows; the rest are test rows.
has_label = np.isfinite(X_temp['AdoptionSpeed'])
X_train = X_temp.loc[has_label, :]
X_test = X_temp.loc[~has_label, :].drop(['AdoptionSpeed'], axis=1)

assert X_train.shape[0] == train.shape[0]
assert X_test.shape[0] == test.shape[0]

# Apart from the target, both splits must expose the same columns in the
# same order.
train_cols = X_train.columns.tolist()
train_cols.remove('AdoptionSpeed')
test_cols = X_test.columns.tolist()
assert np.all(train_cols == test_cols)

# Missing values are encoded as -1.
X_train_non_null = X_train.fillna(-1)
X_test_non_null = X_test.fillna(-1)

# Notebook-style checks; the values are displayed, not stored.
X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()
X_train_non_null.shape, X_test_non_null.shape
import scipy as sp
from collections import Counter
from functools import partial
from math import sqrt
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    Args:
        rater_a: Sequence (list or array) of integer ratings.
        rater_b: Sequence of integer ratings, same length as ``rater_a``.
        min_rating: Lowest possible rating; defaults to the smallest rating
            observed in either sequence.
        max_rating: Highest possible rating; defaults to the largest rating
            observed in either sequence.

    Returns:
        List of lists ``m`` where ``m[i][j]`` counts the items rated
        ``min_rating + i`` by rater a and ``min_rating + j`` by rater b.

    Raises:
        ValueError: If the sequences are empty and a bound is not supplied.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the bounds from each sequence separately. ``rater_a + rater_b``
    # concatenates lists but adds NumPy arrays element-wise, which produced
    # wrong bounds for array input.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat
def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating occurs.

    Args:
        ratings: Sequence of integer ratings.
        min_rating: Lowest possible rating; defaults to ``min(ratings)``.
        max_rating: Highest possible rating; defaults to ``max(ratings)``.

    Returns:
        List whose entry ``k`` is the number of ratings equal to
        ``min_rating + k``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts
def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa.

    The quadratic weighted kappa measures the agreement between two raters
    that provide discrete numeric ratings. Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement). A kappa value of 0 is expected if all agreement is due to
    chance.

    Args:
        y: Integer ratings of the first rater.
        y_pred: Integer ratings of the second rater; same length as ``y``.

    The rating range is taken from the data: the smallest and largest value
    found in either sequence.

    Returns:
        The kappa value as a float.

    Raises:
        ZeroDivisionError: If both sequences hold one single distinct rating,
            because the weight denominator is then zero.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))

    lowest = min(min(rater_a), min(rater_b))
    highest = max(max(rater_a), max(rater_b))

    conf_mat = confusion_matrix(rater_a, rater_b, lowest, highest)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))
    hist_rater_a = histogram(rater_a, lowest, highest)
    hist_rater_b = histogram(rater_b, lowest, highest)

    numerator = 0.0
    denominator = 0.0
    for i, observed_row in enumerate(conf_mat):
        for j, observed in enumerate(observed_row):
            # Count expected for this cell if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic penalty, scaled to 1 for the largest disagreement.
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * observed / num_scored_items
            denominator += d * expected_count / num_scored_items
    return (1.0 - numerator / denominator)
class OptimizedRounder(object):
    """Search thresholds that map continuous predictions to the classes 0-4.

    The four thresholds are chosen to maximise the quadratic weighted kappa
    between the resulting classes and the true labels.
    """

    def __init__(self):
        # Replaced by the optimiser result in ``fit``.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """Return the negated quadratic kappa obtained with thresholds ``coef``."""
        preds = self.predict(X, coef)
        return -cohen_kappa_score(y, preds, weights='quadratic')

    def fit(self, X, y):
        """Optimise the thresholds on predictions ``X`` against labels ``y``.

        The optimiser result is stored in ``self.coef_``.
        """
        # Import the submodule explicitly: ``import scipy`` alone does not
        # guarantee that ``scipy.optimize`` is loaded.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """Bin ``X`` into the classes 0-4.

        Args:
            X: Continuous predictions.
            coef: Four thresholds; they are sorted before use. Defaults to
                the thresholds found by ``fit``.

        Returns:
            The result of ``pd.cut`` with the labels 0-4.
        """
        if coef is None:
            coef = self.coefficients()
        bins = [-np.inf] + list(np.sort(coef)) + [np.inf]
        return pd.cut(X, bins, labels=[0, 1, 2, 3, 4])

    def coefficients(self):
        """Return the fitted thresholds (the ``'x'`` entry of the result)."""
        return self.coef_['x']
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
# Booster parameters handed to xgb.train by run_xgb.
xgb_params = {
    # The target is fitted as a regression; the continuous predictions are
    # thresholded into classes afterwards.
    'eval_metric': 'rmse',
    'seed': 1337,
    'eta': 0.0123,
    'subsample': 0.8,
    'colsample_bytree': 0.85,
    # NOTE(review): 'gpu_hist' and 'device' look like they come from
    # different XGBoost generations — confirm against the installed version.
    'tree_method': 'gpu_hist',
    'device': 'gpu',
    # NOTE(review): presumably the legacy quiet flag; newer releases use
    # 'verbosity' — confirm.
    'silent': 1,
}
def run_xgb(params, X_train, X_test):
    """Train XGBoost with 10-fold stratified cross-validation.

    Args:
        params: Booster parameters passed to ``xgb.train``.
        X_train: Training frame including the ``AdoptionSpeed`` target column.
        X_test: Test frame with the same feature columns and no target.

    Returns:
        ``(model, oof_train, oof_test)``: the booster of the last fold, the
        out-of-fold predictions for every training row, and an array of
        shape ``(n_test, n_splits)`` holding each fold's test predictions.
    """
    n_splits = 10
    verbose_eval = 1000
    num_rounds = 60000
    early_stop = 500

    def _predict_best(booster, dmatrix):
        # Predict with the trees up to the best early-stopping round.
        # ``iteration_range`` is the current API; ``ntree_limit`` and
        # ``best_ntree_limit`` were removed in XGBoost 2.0 and are kept only
        # as a fallback for releases that predate ``iteration_range``.
        try:
            return booster.predict(
                dmatrix, iteration_range=(0, booster.best_iteration + 1))
        except TypeError:
            return booster.predict(
                dmatrix, ntree_limit=booster.best_ntree_limit)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)
    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))
    # The test matrix is identical for every fold, so build it once.
    d_test = xgb.DMatrix(X_test, feature_names=list(X_test.columns))

    folds = kf.split(X_train, X_train['AdoptionSpeed'].values)
    for i, (train_idx, valid_idx) in enumerate(folds):
        X_tr = X_train.iloc[train_idx, :]
        X_val = X_train.iloc[valid_idx, :]
        y_tr = X_tr['AdoptionSpeed'].values
        X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)
        y_val = X_val['AdoptionSpeed'].values
        X_val = X_val.drop(['AdoptionSpeed'], axis=1)

        d_train = xgb.DMatrix(data=X_tr, label=y_tr,
                              feature_names=list(X_tr.columns))
        d_valid = xgb.DMatrix(data=X_val, label=y_val,
                              feature_names=list(X_val.columns))
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(dtrain=d_train, num_boost_round=num_rounds,
                          evals=watchlist, early_stopping_rounds=early_stop,
                          verbose_eval=verbose_eval, params=params)

        oof_train[valid_idx] = _predict_best(model, d_valid)
        oof_test[:, i] = _predict_best(model, d_test)
    return model, oof_train, oof_test
model, oof_train, oof_test = run_xgb(xgb_params, X_train_non_null, X_test_non_null)


def plot_pred(pred):
    """Plot the distribution of ``pred``: histogram over [0, 5] plus a KDE curve."""
    histogram_options = {'range': [0, 5]}
    sns.distplot(pred, kde=True, hist_kws=histogram_options)


# Out-of-fold train predictions, then the fold-averaged test predictions.
plot_pred(oof_train)
plot_pred(oof_test.mean(axis=1))
# Fit the rounding thresholds on the out-of-fold predictions and report the
# kappa they achieve.
y_true = X_train['AdoptionSpeed'].values
optR = OptimizedRounder()
optR.fit(oof_train, y_true)
coefficients = optR.coefficients()
valid_pred = optR.predict(oof_train, coefficients)
qwk = quadratic_weighted_kappa(y_true, valid_pred)
print("QWK = ", qwk)

# Override three of the four fitted thresholds with fixed values; the
# threshold at index 2 keeps its fitted value.
coefficients_ = coefficients.copy()
for position, threshold in ((0, 1.66), (1, 2.13), (3, 2.85)):
    coefficients_[position] = threshold

train_predictions = optR.predict(oof_train, coefficients_).astype(np.int8)
print(f'train pred distribution: {Counter(train_predictions)}')

# Test predictions: average the folds, then apply the same thresholds.
test_mean = oof_test.mean(axis=1)
test_predictions = optR.predict(test_mean, coefficients_).astype(np.int8)
print(f'test pred distribution: {Counter(test_predictions)}')

# Notebook-style displays; the values are not stored.
Counter(train_predictions)
Counter(test_predictions)

submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions})
submission.to_csv('submission.csv', index=False)
submission.head()