Learn practical skills, build real-world projects, and advance your career
Updated 3 years ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import missingno as msno
# Load the Black Friday training set from the working directory into a
# DataFrame bound to the module-level name `data`.
data = pd.read_csv("blackFriday_train.csv")
# Preview the first rows (the value is only displayed when this is the last
# expression of a notebook cell; in a plain script the result is discarded).
data.head()
# Print the column names, non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 550068 non-null int64
1 Product_ID 550068 non-null object
2 Gender 550068 non-null object
3 Age 550068 non-null object
4 Occupation 550068 non-null int64
5 City_Category 550068 non-null object
6 Stay_In_Current_City_Years 550068 non-null object
7 Marital_Status 550068 non-null int64
8 Product_Category_1 550068 non-null int64
9 Product_Category_2 376430 non-null float64
10 Product_Category_3 166821 non-null float64
11 Purchase 550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB
# Plot the distribution of the Purchase column; `fit=stats.norm` asks seaborn
# to overlay a fitted normal curve for visual comparison.
# NOTE(review): `distplot` is deprecated in seaborn >= 0.11 in favour of
# `histplot`/`displot`, which have no `fit=` argument — confirm the installed
# seaborn version before re-running this cell.
sns.distplot(data['Purchase'], fit=stats.norm)
<matplotlib.axes._subplots.AxesSubplot at 0x17a8c5eb2e0>
def unique_category(df, features=None):
    """Print a summary of the distinct values in selected columns of ``df``.

    For every column name in ``features`` the following is written to stdout:
    the column name, the frequency of each value (``value_counts``), the
    array of distinct values, the number of distinct values, and a line of
    50 underscores as a separator.

    Args:
        df: DataFrame whose columns are inspected.
        features: Iterable of column names to summarise. ``None`` (the
            default) or an empty iterable prints nothing.

    Returns:
        None. The function only prints.

    Raises:
        KeyError: Propagated from pandas when a name in ``features`` is not
            a column of ``df``.
    """
    # ``None`` replaces the former mutable ``[]`` default; both mean
    # "no columns requested".
    if features is None:
        return

    for i in features:
        # Read from the ``df`` argument. The previous body looked every
        # column up on the module-level ``data`` frame, silently ignoring
        # whatever frame the caller passed in.
        column = df[i]
        # Compute the distinct values once and reuse them for the count.
        uniques = column.unique()

        print('feature :',i)
        print('Count of unique categories :\n',column.value_counts())
        print('unique categories :',uniques)
        print('toal number of unique cat :', len(uniques))
        print("_"*50)