Learn data science and machine learning by building real-world projects on Jovian
In [3]:
jovian.commit()
[jovian] Detected Colab notebook... [jovian] Please enter your API key ( from https://jovian.ai/ ): API KEY: ·········· [jovian] Uploading colab notebook to Jovian... [jovian] Capturing environment.. [jovian] Committed successfully! https://jovian.ai/abdlquadri/customer-purchase-pattern-analysis

Customer Purchase Pattern Analysis - Chip Industry

Load required libraries and datasets

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.tight_layout()
#import seaborn as sns

pd.pandas.set_option('display.max_columns', None)

transaction_data = pd.read_excel('https://cdn.theforage.com/vinternships/companyassets/32A6DqtsbF7LbKdcq/QVI_transaction_data.xlsx')
customer_data = pd.read_csv('https://cdn.theforage.com/vinternships/companyassets/32A6DqtsbF7LbKdcq/QVI_purchase_behaviour.csv')
<Figure size 432x288 with 0 Axes>

Exploratory Data Analysis

Examining Transaction Data

In [5]:
print('Transaction Data', transaction_data.shape)
transaction_data.head(10)
Transaction Data (264836, 8)
Out[5]:
In [6]:
transaction_data.describe()
Out[6]:
In [7]:
transaction_data.dtypes
Out[7]:
DATE                int64
STORE_NBR           int64
LYLTY_CARD_NBR      int64
TXN_ID              int64
PROD_NBR            int64
PROD_NAME          object
PROD_QTY            int64
TOT_SALES         float64
dtype: object

Fix Date Column. Should be datetime type not forgetting Excel date format specifics.

In [8]:
import datetime as dt
transaction_data['DATE_CONVERTED'] = pd.TimedeltaIndex(transaction_data['DATE'], unit='d') + dt.datetime(1899, 12, 30)
transaction_data['DATE_CONVERTED']
Out[8]:
0        2018-10-17
1        2019-05-14
2        2019-05-20
3        2018-08-17
4        2018-08-18
            ...    
264831   2019-03-09
264832   2018-08-13
264833   2018-11-06
264834   2018-12-27
264835   2018-09-22
Name: DATE_CONVERTED, Length: 264836, dtype: datetime64[ns]

Examine PROD_NAME

In [9]:
transaction_data['PROD_NAME'].describe()
Out[9]:
count                                     264836
unique                                       114
top       Kettle Mozzarella   Basil & Pesto 175g
freq                                        3304
Name: PROD_NAME, dtype: object
In [10]:
prod_names = transaction_data['PROD_NAME'].unique()
prod_names
Out[10]:
array(['Natural Chip        Compny SeaSalt175g',
       'CCs Nacho Cheese    175g',
       'Smiths Crinkle Cut  Chips Chicken 170g',
       'Smiths Chip Thinly  S/Cream&Onion 175g',
       'Kettle Tortilla ChpsHny&Jlpno Chili 150g',
       'Old El Paso Salsa   Dip Tomato Mild 300g',
       'Smiths Crinkle Chips Salt & Vinegar 330g',
       'Grain Waves         Sweet Chilli 210g',
       'Doritos Corn Chip Mexican Jalapeno 150g',
       'Grain Waves Sour    Cream&Chives 210G',
       'Kettle Sensations   Siracha Lime 150g',
       'Twisties Cheese     270g', 'WW Crinkle Cut      Chicken 175g',
       'Thins Chips Light&  Tangy 175g', 'CCs Original 175g',
       'Burger Rings 220g', 'NCC Sour Cream &    Garden Chives 175g',
       'Doritos Corn Chip Southern Chicken 150g',
       'Cheezels Cheese Box 125g', 'Smiths Crinkle      Original 330g',
       'Infzns Crn Crnchers Tangy Gcamole 110g',
       'Kettle Sea Salt     And Vinegar 175g',
       'Smiths Chip Thinly  Cut Original 175g', 'Kettle Original 175g',
       'Red Rock Deli Thai  Chilli&Lime 150g',
       'Pringles Sthrn FriedChicken 134g', 'Pringles Sweet&Spcy BBQ 134g',
       'Red Rock Deli SR    Salsa & Mzzrlla 150g',
       'Thins Chips         Originl saltd 175g',
       'Red Rock Deli Sp    Salt & Truffle 150G',
       'Smiths Thinly       Swt Chli&S/Cream175G', 'Kettle Chilli 175g',
       'Doritos Mexicana    170g',
       'Smiths Crinkle Cut  French OnionDip 150g',
       'Natural ChipCo      Hony Soy Chckn175g',
       'Dorito Corn Chp     Supreme 380g', 'Twisties Chicken270g',
       'Smiths Thinly Cut   Roast Chicken 175g',
       'Smiths Crinkle Cut  Tomato Salsa 150g',
       'Kettle Mozzarella   Basil & Pesto 175g',
       'Infuzions Thai SweetChili PotatoMix 110g',
       'Kettle Sensations   Camembert & Fig 150g',
       'Smith Crinkle Cut   Mac N Cheese 150g',
       'Kettle Honey Soy    Chicken 175g',
       'Thins Chips Seasonedchicken 175g',
       'Smiths Crinkle Cut  Salt & Vinegar 170g',
       'Infuzions BBQ Rib   Prawn Crackers 110g',
       'GrnWves Plus Btroot & Chilli Jam 180g',
       'Tyrrells Crisps     Lightly Salted 165g',
       'Kettle Sweet Chilli And Sour Cream 175g',
       'Doritos Salsa       Medium 300g', 'Kettle 135g Swt Pot Sea Salt',
       'Pringles SourCream  Onion 134g',
       'Doritos Corn Chips  Original 170g',
       'Twisties Cheese     Burger 250g',
       'Old El Paso Salsa   Dip Chnky Tom Ht300g',
       'Cobs Popd Swt/Chlli &Sr/Cream Chips 110g',
       'Woolworths Mild     Salsa 300g',
       'Natural Chip Co     Tmato Hrb&Spce 175g',
       'Smiths Crinkle Cut  Chips Original 170g',
       'Cobs Popd Sea Salt  Chips 110g',
       'Smiths Crinkle Cut  Chips Chs&Onion170g',
       'French Fries Potato Chips 175g',
       'Old El Paso Salsa   Dip Tomato Med 300g',
       'Doritos Corn Chips  Cheese Supreme 170g',
       'Pringles Original   Crisps 134g',
       'RRD Chilli&         Coconut 150g',
       'WW Original Corn    Chips 200g',
       'Thins Potato Chips  Hot & Spicy 175g',
       'Cobs Popd Sour Crm  &Chives Chips 110g',
       'Smiths Crnkle Chip  Orgnl Big Bag 380g',
       'Doritos Corn Chips  Nacho Cheese 170g',
       'Kettle Sensations   BBQ&Maple 150g',
       'WW D/Style Chip     Sea Salt 200g',
       'Pringles Chicken    Salt Crips 134g',
       'WW Original Stacked Chips 160g',
       'Smiths Chip Thinly  CutSalt/Vinegr175g', 'Cheezels Cheese 330g',
       'Tostitos Lightly    Salted 175g',
       'Thins Chips Salt &  Vinegar 175g',
       'Smiths Crinkle Cut  Chips Barbecue 170g', 'Cheetos Puffs 165g',
       'RRD Sweet Chilli &  Sour Cream 165g',
       'WW Crinkle Cut      Original 175g',
       'Tostitos Splash Of  Lime 175g', 'Woolworths Medium   Salsa 300g',
       'Kettle Tortilla ChpsBtroot&Ricotta 150g',
       'CCs Tasty Cheese    175g', 'Woolworths Cheese   Rings 190g',
       'Tostitos Smoked     Chipotle 175g', 'Pringles Barbeque   134g',
       'WW Supreme Cheese   Corn Chips 200g',
       'Pringles Mystery    Flavour 134g',
       'Tyrrells Crisps     Ched & Chives 165g',
       'Snbts Whlgrn Crisps Cheddr&Mstrd 90g',
       'Cheetos Chs & Bacon Balls 190g', 'Pringles Slt Vingar 134g',
       'Infuzions SourCream&Herbs Veg Strws 110g',
       'Kettle Tortilla ChpsFeta&Garlic 150g',
       'Infuzions Mango     Chutny Papadums 70g',
       'RRD Steak &         Chimuchurri 150g',
       'RRD Honey Soy       Chicken 165g',
       'Sunbites Whlegrn    Crisps Frch/Onin 90g',
       'RRD Salt & Vinegar  165g', 'Doritos Cheese      Supreme 330g',
       'Smiths Crinkle Cut  Snag&Sauce 150g',
       'WW Sour Cream &OnionStacked Chips 160g',
       'RRD Lime & Pepper   165g',
       'Natural ChipCo Sea  Salt & Vinegr 175g',
       'Red Rock Deli Chikn&Garlic Aioli 150g',
       'RRD SR Slow Rst     Pork Belly 150g', 'RRD Pc Sea Salt     165g',
       'Smith Crinkle Cut   Bolognese 150g', 'Doritos Salsa Mild  300g'],
      dtype=object)

Text Analysis of PROD_NAME fields

Remove digits, special characters and numbers

In [11]:
import re
import string

def clean_up_text(text):
    result = text.lower()
    result = re.sub('[%s]' % re.escape(string.punctuation), '', result)
    result = re.sub('\w*\d\w*', '', result)
    return result

clean_prod_names = []
for prod_name in prod_names:
    clean_prod_names.append(clean_up_text(prod_name))
clean_prod_names
Out[11]:
['natural chip        compny ',
 'ccs nacho cheese    ',
 'smiths crinkle cut  chips chicken ',
 'smiths chip thinly  screamonion ',
 'kettle tortilla chpshnyjlpno chili ',
 'old el paso salsa   dip tomato mild ',
 'smiths crinkle chips salt  vinegar ',
 'grain waves         sweet chilli ',
 'doritos corn chip mexican jalapeno ',
 'grain waves sour    creamchives ',
 'kettle sensations   siracha lime ',
 'twisties cheese     ',
 'ww crinkle cut      chicken ',
 'thins chips light  tangy ',
 'ccs original ',
 'burger rings ',
 'ncc sour cream     garden chives ',
 'doritos corn chip southern chicken ',
 'cheezels cheese box ',
 'smiths crinkle      original ',
 'infzns crn crnchers tangy gcamole ',
 'kettle sea salt     and vinegar ',
 'smiths chip thinly  cut original ',
 'kettle original ',
 'red rock deli thai  chillilime ',
 'pringles sthrn friedchicken ',
 'pringles sweetspcy bbq ',
 'red rock deli sr    salsa  mzzrlla ',
 'thins chips         originl saltd ',
 'red rock deli sp    salt  truffle ',
 'smiths thinly       swt ',
 'kettle chilli ',
 'doritos mexicana    ',
 'smiths crinkle cut  french oniondip ',
 'natural chipco      hony soy ',
 'dorito corn chp     supreme ',
 'twisties ',
 'smiths thinly cut   roast chicken ',
 'smiths crinkle cut  tomato salsa ',
 'kettle mozzarella   basil  pesto ',
 'infuzions thai sweetchili potatomix ',
 'kettle sensations   camembert  fig ',
 'smith crinkle cut   mac n cheese ',
 'kettle honey soy    chicken ',
 'thins chips seasonedchicken ',
 'smiths crinkle cut  salt  vinegar ',
 'infuzions bbq rib   prawn crackers ',
 'grnwves plus btroot  chilli jam ',
 'tyrrells crisps     lightly salted ',
 'kettle sweet chilli and sour cream ',
 'doritos salsa       medium ',
 'kettle  swt pot sea salt',
 'pringles sourcream  onion ',
 'doritos corn chips  original ',
 'twisties cheese     burger ',
 'old el paso salsa   dip chnky tom ',
 'cobs popd swtchlli srcream chips ',
 'woolworths mild     salsa ',
 'natural chip co     tmato hrbspce ',
 'smiths crinkle cut  chips original ',
 'cobs popd sea salt  chips ',
 'smiths crinkle cut  chips ',
 'french fries potato chips ',
 'old el paso salsa   dip tomato med ',
 'doritos corn chips  cheese supreme ',
 'pringles original   crisps ',
 'rrd chilli         coconut ',
 'ww original corn    chips ',
 'thins potato chips  hot  spicy ',
 'cobs popd sour crm  chives chips ',
 'smiths crnkle chip  orgnl big bag ',
 'doritos corn chips  nacho cheese ',
 'kettle sensations   bbqmaple ',
 'ww dstyle chip     sea salt ',
 'pringles chicken    salt crips ',
 'ww original stacked chips ',
 'smiths chip thinly  ',
 'cheezels cheese ',
 'tostitos lightly    salted ',
 'thins chips salt   vinegar ',
 'smiths crinkle cut  chips barbecue ',
 'cheetos puffs ',
 'rrd sweet chilli   sour cream ',
 'ww crinkle cut      original ',
 'tostitos splash of  lime ',
 'woolworths medium   salsa ',
 'kettle tortilla chpsbtrootricotta ',
 'ccs tasty cheese    ',
 'woolworths cheese   rings ',
 'tostitos smoked     chipotle ',
 'pringles barbeque   ',
 'ww supreme cheese   corn chips ',
 'pringles mystery    flavour ',
 'tyrrells crisps     ched  chives ',
 'snbts whlgrn crisps cheddrmstrd ',
 'cheetos chs  bacon balls ',
 'pringles slt vingar ',
 'infuzions sourcreamherbs veg strws ',
 'kettle tortilla chpsfetagarlic ',
 'infuzions mango     chutny papadums ',
 'rrd steak          chimuchurri ',
 'rrd honey soy       chicken ',
 'sunbites whlegrn    crisps frchonin ',
 'rrd salt  vinegar  ',
 'doritos cheese      supreme ',
 'smiths crinkle cut  snagsauce ',
 'ww sour cream onionstacked chips ',
 'rrd lime  pepper   ',
 'natural chipco sea  salt  vinegr ',
 'red rock deli chikngarlic aioli ',
 'rrd sr slow rst     pork belly ',
 'rrd pc sea salt     ',
 'smith crinkle cut   bolognese ',
 'doritos salsa mild  ']

Most Common Word using CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

def most_common_word(data):
    cv = CountVectorizer(stop_words='english')
    bag_of_words = cv.fit_transform(data)
    sum_word_freq = bag_of_words.sum(axis=0)
    word_freq = [(name, sum_word_freq[0, idx]) for name, idx in cv.vocabulary_.items()]
    word_freq = sorted(word_freq, key = lambda x:x[1], reverse=True)
    return word_freq
most_common_word(clean_prod_names)
Out[12]:
[('chips', 21),
 ('smiths', 16),
 ('crinkle', 14),
 ('cut', 14),
 ('kettle', 13),
 ('cheese', 12),
 ('salt', 12),
 ('original', 10),
 ('chip', 9),
 ('salsa', 9),
 ('doritos', 9),
 ('corn', 8),
 ('pringles', 8),
 ('rrd', 8),
 ('chicken', 7),
 ('ww', 7),
 ('chilli', 6),
 ('sour', 6),
 ('sea', 6),
 ('thinly', 5),
 ('vinegar', 5),
 ('thins', 5),
 ('crisps', 5),
 ('natural', 4),
 ('cream', 4),
 ('red', 4),
 ('rock', 4),
 ('deli', 4),
 ('supreme', 4),
 ('infuzions', 4),
 ('ccs', 3),
 ('tortilla', 3),
 ('old', 3),
 ('el', 3),
 ('paso', 3),
 ('dip', 3),
 ('tomato', 3),
 ('mild', 3),
 ('sweet', 3),
 ('sensations', 3),
 ('lime', 3),
 ('twisties', 3),
 ('chives', 3),
 ('soy', 3),
 ('cobs', 3),
 ('popd', 3),
 ('woolworths', 3),
 ('tostitos', 3),
 ('nacho', 2),
 ('grain', 2),
 ('waves', 2),
 ('tangy', 2),
 ('burger', 2),
 ('rings', 2),
 ('cheezels', 2),
 ('thai', 2),
 ('bbq', 2),
 ('sr', 2),
 ('swt', 2),
 ('french', 2),
 ('chipco', 2),
 ('smith', 2),
 ('honey', 2),
 ('tyrrells', 2),
 ('lightly', 2),
 ('salted', 2),
 ('medium', 2),
 ('potato', 2),
 ('cheetos', 2),
 ('compny', 1),
 ('screamonion', 1),
 ('chpshnyjlpno', 1),
 ('chili', 1),
 ('mexican', 1),
 ('jalapeno', 1),
 ('creamchives', 1),
 ('siracha', 1),
 ('light', 1),
 ('ncc', 1),
 ('garden', 1),
 ('southern', 1),
 ('box', 1),
 ('infzns', 1),
 ('crn', 1),
 ('crnchers', 1),
 ('gcamole', 1),
 ('chillilime', 1),
 ('sthrn', 1),
 ('friedchicken', 1),
 ('sweetspcy', 1),
 ('mzzrlla', 1),
 ('originl', 1),
 ('saltd', 1),
 ('sp', 1),
 ('truffle', 1),
 ('mexicana', 1),
 ('oniondip', 1),
 ('hony', 1),
 ('dorito', 1),
 ('chp', 1),
 ('roast', 1),
 ('mozzarella', 1),
 ('basil', 1),
 ('pesto', 1),
 ('sweetchili', 1),
 ('potatomix', 1),
 ('camembert', 1),
 ('fig', 1),
 ('mac', 1),
 ('seasonedchicken', 1),
 ('rib', 1),
 ('prawn', 1),
 ('crackers', 1),
 ('grnwves', 1),
 ('plus', 1),
 ('btroot', 1),
 ('jam', 1),
 ('pot', 1),
 ('sourcream', 1),
 ('onion', 1),
 ('chnky', 1),
 ('tom', 1),
 ('swtchlli', 1),
 ('srcream', 1),
 ('tmato', 1),
 ('hrbspce', 1),
 ('fries', 1),
 ('med', 1),
 ('coconut', 1),
 ('hot', 1),
 ('spicy', 1),
 ('crm', 1),
 ('crnkle', 1),
 ('orgnl', 1),
 ('big', 1),
 ('bag', 1),
 ('bbqmaple', 1),
 ('dstyle', 1),
 ('crips', 1),
 ('stacked', 1),
 ('barbecue', 1),
 ('puffs', 1),
 ('splash', 1),
 ('chpsbtrootricotta', 1),
 ('tasty', 1),
 ('smoked', 1),
 ('chipotle', 1),
 ('barbeque', 1),
 ('mystery', 1),
 ('flavour', 1),
 ('ched', 1),
 ('snbts', 1),
 ('whlgrn', 1),
 ('cheddrmstrd', 1),
 ('chs', 1),
 ('bacon', 1),
 ('balls', 1),
 ('slt', 1),
 ('vingar', 1),
 ('sourcreamherbs', 1),
 ('veg', 1),
 ('strws', 1),
 ('chpsfetagarlic', 1),
 ('mango', 1),
 ('chutny', 1),
 ('papadums', 1),
 ('steak', 1),
 ('chimuchurri', 1),
 ('sunbites', 1),
 ('whlegrn', 1),
 ('frchonin', 1),
 ('snagsauce', 1),
 ('onionstacked', 1),
 ('pepper', 1),
 ('vinegr', 1),
 ('chikngarlic', 1),
 ('aioli', 1),
 ('slow', 1),
 ('rst', 1),
 ('pork', 1),
 ('belly', 1),
 ('pc', 1),
 ('bolognese', 1)]

Remove Salsa Products (they are not chips)

In [13]:
transaction_data[~transaction_data['PROD_NAME'].isin(['salsa'])]
# transaction_data.describe()
Out[13]:

Transaction Data Summary

In [14]:
transaction_data.describe()
Out[14]:

Investigate Case of 200 packs

In [16]:
qty_mask = transaction_data['PROD_QTY'] == 200
transaction_data[qty_mask]
Out[16]:

Which is the outlier customer that bought 200 packs

In [17]:
cust_mask = transaction_data['LYLTY_CARD_NBR'] == 226000
transaction_data[cust_mask]
Out[17]:

Remove Outlier Customer (most likely not a retail customer)

In [18]:
transaction_data_filtered = transaction_data[transaction_data['LYLTY_CARD_NBR'] != 226000]
transaction_data_filtered.describe()
Out[18]:

Transaction Frequency by day

In [19]:
transaction_data_by_day_count = pd.DataFrame(transaction_data_filtered.groupby('DATE_CONVERTED')['STORE_NBR'].count())
transaction_data_by_day_count
Out[19]:
In [20]:
dates = pd.date_range(start='2018-07-01', end='2019-06-30', freq='D')
days = pd.DataFrame(index=dates)
print(type(days))
days
<class 'pandas.core.frame.DataFrame'>
Out[20]:

Prepare Data for plotting

In [21]:
date_count_joined = days.join(transaction_data_by_day_count).rename(columns={'STORE_NBR':'COUNT'})
date_count_joined
Out[21]:

Plot Transactions over time

In [22]:
def plot_transaction_time(data, title):
    plt.xticks(rotation=90)
    plt.autoscale()
    plt.title(title)
    plt.xlabel('Day')
    plt.ylabel('Number of transactions')
    plt.plot(data)
    
plot_transaction_time(date_count_joined, 'Transaction over time')
Notebook Image

Zoom In to December

There seem to be missing data

In [23]:
december_sales = date_count_joined['20181201':'20181231']
december_sales
Out[23]:
In [24]:
plot_transaction_time(december_sales, 'December Transactions')
Notebook Image

Ok. It is most like during holidays

Feature Engineering

Create PACK_SIZE feature

In [25]:
transaction_data_filtered['PACK_SIZE'] = transaction_data_filtered['PROD_NAME'].str.extract('(\d+)')
transaction_data_filtered.head()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
Out[25]:
In [26]:
transaction_data_filtered['PACK_SIZE']
Out[26]:
0         175
1         175
2         170
3         175
4         150
         ... 
264831    175
264832    175
264833    170
264834    150
264835    175
Name: PACK_SIZE, Length: 264834, dtype: object
In [27]:
transaction_data_filtered['PACK_SIZE'].describe()
Out[27]:
count     264834
unique        21
top          175
freq       66390
Name: PACK_SIZE, dtype: object
In [28]:
transaction_data_filtered['PACK_SIZE'].unique()
Out[28]:
array(['175', '170', '150', '300', '330', '210', '270', '220', '125',
       '110', '134', '380', '180', '165', '135', '250', '200', '160',
       '190', '90', '70'], dtype=object)
In [29]:
pack_sizes = transaction_data_filtered['PACK_SIZE']
print(transaction_data.shape)
print(type(pack_sizes))
plt.title('Pack Sizes')
plt.hist(pack_sizes)
plt.xticks(rotation=90)
plt.ylabel('Frequency')
plt.xlabel('Pack Sizes')
(264836, 9) <class 'pandas.core.series.Series'>
Out[29]:
Text(0.5, 0, 'Pack Sizes')
Notebook Image

It seems the most bought pack size is 175Kg

Create BRAND feature

In [30]:
import string
def extract_brand(prod_name):
    return str.split(clean_up_text(prod_name))[0].upper()
In [31]:
transaction_data_filtered['BRAND'] = transaction_data_filtered['PROD_NAME'].apply(extract_brand)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
In [ ]:
transaction_data_filtered['BRAND'].unique()
Out[]:
array(['NATURAL', 'CCS', 'SMITHS', 'KETTLE', 'OLD', 'GRAIN', 'DORITOS',
       'TWISTIES', 'WW', 'THINS', 'BURGER', 'NCC', 'CHEEZELS', 'INFZNS',
       'RED', 'PRINGLES', 'DORITO', 'INFUZIONS', 'SMITH', 'GRNWVES',
       'TYRRELLS', 'COBS', 'WOOLWORTHS', 'FRENCH', 'RRD', 'TOSTITOS',
       'CHEETOS', 'SNBTS', 'SUNBITES'], dtype=object)

Clean Brand Name

In [32]:
transaction_data_filtered['BRAND']=transaction_data_filtered['BRAND'].replace('RED','RRD').replace('SNBTS','SUNBITES').replace('INFZNS','INFUZIONS').replace('DORITO','DORITOS').replace('WW','WOOLWORTHS').replace('SMITH','SMITHS')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
In [33]:
transaction_data_filtered['BRAND'].unique()
Out[33]:
array(['NATURAL', 'CCS', 'SMITHS', 'KETTLE', 'OLD', 'GRAIN', 'DORITOS',
       'TWISTIES', 'WOOLWORTHS', 'THINS', 'BURGER', 'NCC', 'CHEEZELS',
       'INFUZIONS', 'RRD', 'PRINGLES', 'GRNWVES', 'TYRRELLS', 'COBS',
       'FRENCH', 'TOSTITOS', 'CHEETOS', 'SUNBITES'], dtype=object)
In [34]:
transaction_data_filtered
Out[34]:

Exploratory Data Analysis - Customer Data

Examining Customer Data

In [35]:
print('Customer Data', customer_data.shape)
customer_data.head(10)
Customer Data (72637, 3)
Out[35]:

How many unique Loyalty Cards

In [36]:
customer_data['LYLTY_CARD_NBR'].unique().shape
Out[36]:
(72637,)

Unique Lifestlye classes

In [37]:
customer_data['LIFESTAGE'].unique()
Out[37]:
array(['YOUNG SINGLES/COUPLES', 'YOUNG FAMILIES', 'OLDER SINGLES/COUPLES',
       'MIDAGE SINGLES/COUPLES', 'NEW FAMILIES', 'OLDER FAMILIES',
       'RETIREES'], dtype=object)

Unique Customer Types

In [38]:
customer_data['PREMIUM_CUSTOMER'].unique()
Out[38]:
array(['Premium', 'Mainstream', 'Budget'], dtype=object)

Merge Transaction and Customer Data

In [39]:
transaction_data_filtered
Out[39]:
In [40]:
customer_data.isnull().sum()
Out[40]:
LYLTY_CARD_NBR      0
LIFESTAGE           0
PREMIUM_CUSTOMER    0
dtype: int64
In [41]:
transaction_data_filtered.isnull().sum()
Out[41]:
DATE              0
STORE_NBR         0
LYLTY_CARD_NBR    0
TXN_ID            0
PROD_NBR          0
PROD_NAME         0
PROD_QTY          0
TOT_SALES         0
DATE_CONVERTED    0
PACK_SIZE         0
BRAND             0
dtype: int64

use merge. Tip!

In [42]:
data = transaction_data_filtered.merge(customer_data, on='LYLTY_CARD_NBR', how='left')
data
Out[42]:
In [43]:
data.isnull().sum()
Out[43]:
DATE                0
STORE_NBR           0
LYLTY_CARD_NBR      0
TXN_ID              0
PROD_NBR            0
PROD_NAME           0
PROD_QTY            0
TOT_SALES           0
DATE_CONVERTED      0
PACK_SIZE           0
BRAND               0
LIFESTAGE           0
PREMIUM_CUSTOMER    0
dtype: int64

Save Combined Data for future tasks

In [ ]:
data.to_csv('./QVI_data.csv')

Data Analysis Customer Segments

Total Sales by LIFESTAGE and PREMIUM_CUSTOMER

In [44]:
sales_by_lifestage = pd.DataFrame(data.groupby('LIFESTAGE')['TOT_SALES'].count()).rename(columns={'TOT_SALES':'COUNT'})
sales_by_lifestage
Out[44]:
In [45]:
plt.plot(sales_by_lifestage)
plt.title('Total Sales by Life Stage')
plt.xlabel('Life Stage')
plt.ylabel('Number of Sales')
plt.xticks(rotation=90)
Out[45]:
([0, 1, 2, 3, 4, 5, 6], <a list of 7 Text major ticklabel objects>)
Notebook Image

It seems older sngles and couples eat more chips

In [ ]:
sales_by_customer_segment = pd.DataFrame(data.groupby('PREMIUM_CUSTOMER')['TOT_SALES'].count()).rename(columns={'TOT_SALES':'COUNT'})
sales_by_customer_segment
Out[]:
In [46]:
plt.plot(sales_by_customer_segment)
plt.title('Total Sales by Customer Segment')
plt.xlabel('Customer Segment')
plt.ylabel('Number of Sales')
plt.xticks(rotation=90)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-46-498de5382357> in <module>() ----> 1 plt.plot(sales_by_customer_segment) 2 plt.title('Total Sales by Customer Segment') 3 plt.xlabel('Customer Segment') 4 plt.ylabel('Number of Sales') 5 plt.xticks(rotation=90) NameError: name 'sales_by_customer_segment' is not defined

How many customers by LIFESTAGE

In [47]:
customers_by_lifestage = pd.DataFrame(customer_data.groupby('LIFESTAGE')['PREMIUM_CUSTOMER'].count()).rename(columns={'PREMIUM_CUSTOMER':'COUNT'})
customers_by_lifestage
Out[47]:
In [48]:
plt.plot(customers_by_lifestage)
plt.title('Number of Customers per Life Stage')
plt.xlabel('Life Stage')
plt.ylabel('Number of Customers')
plt.xticks(rotation=90)
Out[48]:
([0, 1, 2, 3, 4, 5, 6], <a list of 7 Text major ticklabel objects>)
Notebook Image

How many customers by PREMIUM_CUSTOMER

In [49]:
customers_by_segment = pd.DataFrame(customer_data.groupby('PREMIUM_CUSTOMER')['LIFESTAGE'].count()).rename(columns={'LIFESTAGE':'COUNT'})
customers_by_segment
Out[49]:
In [50]:
plt.plot(customers_by_segment)
plt.title('Number of Customers per Segment')
plt.xlabel('Customer Segment')
plt.ylabel('Number of Customers')
plt.xticks(rotation=90)
Out[50]:
([0, 1, 2], <a list of 3 Text major ticklabel objects>)
Notebook Image

Average Number of Uits per customer by LIFESTAGE and PREMIUM_CUSTOMER

Missing Transaction Values

In [52]:
missing_features = [feature for feature in transaction_data.columns if transaction_data[feature].isnull().sum() > 1]

for feature in missing_features:
    print(feature, np.round(transaction_data[feature].isnull().mean(), 4), '% missing')
In [53]:
missing_features_count = transaction_data.isnull().sum()

missing_features_count[0:10]
Out[53]:
DATE              0
STORE_NBR         0
LYLTY_CARD_NBR    0
TXN_ID            0
PROD_NBR          0
PROD_NAME         0
PROD_QTY          0
TOT_SALES         0
DATE_CONVERTED    0
dtype: int64
In [54]:
total_data_point = np.product(transaction_data.shape)
total_missing = missing_features_count.sum()
percent_missing = (total_missing/total_data_point)*100
print(percent_missing)
0.0

Numerical Transaction Values

In [ ]:
numerical_features = [feature for feature in transaction_data.columns if transaction_data[feature].dtype != 'O']

print('Number of Numerical Features are:', len(numerical_features))
Number of Numerical Features are: 8
In [ ]:
categorical_features = [feature for feature in transaction_data.columns if transaction_data[feature].dtype == 'O']

print('Number of Numerical Values are:', len(categorical_features))
Number of Numerical Values are: 1