Jovian
⭐️
Sign In
In [4]:
import jovian
In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

import os
# Sanity check: show what is present in the competition data directory.
data_dir_entries = os.listdir("/storage/santander_comp/")
print(data_dir_entries)
['.ipynb_checkpoints', 'submissions', 'test.csv', 'models', 'train.csv', 'nbs']
In [2]:
# Load the test set and keep only the feature columns as a plain NumPy array.
test_path = "/storage/santander_comp/test.csv"

# NOTE: despite the `df_` prefix, `df_test` ends up as a 2-D ndarray
# (the `.values` of the frame with the 'ID_code' column dropped).
df_test = pd.read_csv(test_path).drop(['ID_code'], axis=1).values
In [4]:
# Not used by any visible cell; kept so the notebook namespace stays unchanged.
unique_samples = []

# unique_count[row, col] becomes 1 when the value at (row, col) occurs exactly
# once in column `col` of the test set, and stays 0 otherwise.
unique_count = np.zeros_like(df_test)
n_features = df_test.shape[1]
for col in tqdm(range(n_features)):
    column_values = df_test[:, col]
    # np.unique returns (values, first-occurrence indexes, counts) in that order.
    _, first_pos, occurrences = np.unique(column_values,
                                          return_index=True,
                                          return_counts=True)
    # For a value seen once, its first occurrence is its only occurrence.
    singleton_rows = first_pos[occurrences == 1]
    unique_count[singleton_rows, col] += 1
HBox(children=(IntProgress(value=0, max=200), HTML(value='')))
In [8]:
# Number of column-unique values each row holds.
unique_values_per_row = unique_count.sum(axis=1)

# Rows with at least one column-unique value are labelled "real";
# rows with none are labelled "synthetic".
real_samples_indexes = np.flatnonzero(unique_values_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(unique_values_per_row == 0)
In [9]:
# Report the size of each group: real rows first, then synthetic rows.
for index_group in (real_samples_indexes, synthetic_samples_indexes):
    print(len(index_group))
100000 100000
In [10]:
# Only rows labelled "real" are considered as possible generators.
df_test_real = df_test[real_samples_indexes].copy()

# One entry per processed synthetic row: the set of real-row indexes
# (indexes into df_test) verified as generators of that row.
generator_for_each_synthetic_sample = []

# Only the first 20,000 synthetic rows are processed; per the original author
# this should be enough, and using all 100,000 gives the same results
# (but 5 times slower).
for synthetic_row_index in tqdm(synthetic_samples_indexes[:20000]):
    # Element-wise equality of the synthetic row against every real row
    # (broadcast over rows): shape (n_real_rows, n_features), dtype bool.
    value_matches = df_test_real == df_test[synthetic_row_index]

    # A match only counts as verified when, for that feature, the synthetic
    # row's value appears exactly once in the entire real samples set.
    single_match_features = value_matches.sum(axis=0) == 1
    is_verified_generator = value_matches[:, single_match_features].any(axis=1)

    # Translate positions within df_test_real back to df_test row indexes.
    generator_for_each_synthetic_sample.append(
        set(real_samples_indexes[is_verified_generator])
    )
HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))
In [11]:
def _grow_group(seed, candidate_sets):
    """Return the union of all sets in `candidate_sets` linked to `seed`.

    A set is linked when it shares at least one element with the group built
    so far. Passes over the not-yet-absorbed sets are repeated until a pass
    absorbs nothing, so the result does not depend on the order of
    `candidate_sets` (a single pass can miss a set that only becomes linked
    through a set visited later).

    Parameters
    ----------
    seed : set
        Starting members of the group. It is copied, never mutated.
    candidate_sets : iterable of set
        Sets that may be merged into the group.

    Returns
    -------
    set
        The grown group. Empty if `seed` is empty, since nothing can link
        to an empty group.
    """
    group = set(seed)  # copy: the in-place update below must not touch the input
    pending = list(candidate_sets)
    while pending:
        still_unlinked = []
        for candidate in tqdm(pending):
            if group.isdisjoint(candidate):
                still_unlinked.append(candidate)
            else:
                # In-place merge avoids rebuilding the whole group on every hit.
                group.update(candidate)
        if len(still_unlinked) == len(pending):
            # Nothing was absorbed in this pass: the group is stable.
            break
        pending = still_unlinked
    return group


# The two groups are named public_LB / private_LB by the original author; this
# code only establishes that they are two separately grown groups of real-row
# indexes, not which one corresponds to which leaderboard.
public_LB = _grow_group(generator_for_each_synthetic_sample[0],
                        generator_for_each_synthetic_sample)

# Seed the second group with the first non-empty generator set that shares
# nothing with the first group. When the set at index 1 already satisfies this
# it is the one selected (same as the original hard-coded choice); otherwise
# this avoids growing an empty or duplicate group. Falls back to index 1 when
# no such set exists.
private_seed = next(
    (candidate for candidate in generator_for_each_synthetic_sample[1:]
     if candidate and public_LB.isdisjoint(candidate)),
    generator_for_each_synthetic_sample[1],
)
private_LB = _grow_group(private_seed, generator_for_each_synthetic_sample)

print(len(public_LB))
print(len(private_LB))
HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))
HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))
50000 50000
In [12]:
# Persist each index collection via np.save (which appends ".npy" to the name).
# Sets are converted to lists first so they are stored as regular arrays.
for file_stem, index_collection in (
    ('public_LB', public_LB),
    ('private_LB', private_LB),
    ('synthetic_samples_indexes', synthetic_samples_indexes),
):
    np.save(file_stem, list(index_collection))
In [ ]:
# Save this notebook through the jovian library (the recorded cell output
# reads "[jovian] Saving notebook..").
jovian.commit()
[jovian] Saving notebook..
In [ ]: