Jovian
⭐️
Sign In
In [1]:
!pip install jovian --upgrade
Requirement already up-to-date: jovian in c:\users\securelyshare\anaconda3\lib\site-packages (0.1.88) Requirement already satisfied, skipping upgrade: requests in c:\users\securelyshare\anaconda3\lib\site-packages (from jovian) (2.22.0) Requirement already satisfied, skipping upgrade: pyyaml in c:\users\securelyshare\anaconda3\lib\site-packages (from jovian) (5.1.1) Requirement already satisfied, skipping upgrade: uuid in c:\users\securelyshare\anaconda3\lib\site-packages (from jovian) (1.30) Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\securelyshare\anaconda3\lib\site-packages (from requests->jovian) (1.24.2) Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in c:\users\securelyshare\anaconda3\lib\site-packages (from requests->jovian) (3.0.4) Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in c:\users\securelyshare\anaconda3\lib\site-packages (from requests->jovian) (2.8) Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in c:\users\securelyshare\anaconda3\lib\site-packages (from requests->jovian) (2019.6.16)
In [2]:
import jovian
In [ ]:
# Save a snapshot of this notebook through the Jovian client.
jovian.commit()
[jovian] Saving notebook..
In [4]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
In [5]:
# Load the training data; 'train.csv' is resolved relative to the kernel's working directory.
df=pd.read_csv('train.csv')

In [6]:
# Check the number of rows and columns.
df.shape
Out[6]:
(550068, 12)
In [7]:
# Check each column's dtype and non-null count.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 12 columns): User_ID 550068 non-null int64 Product_ID 550068 non-null object Gender 550068 non-null object Age 550068 non-null object Occupation 550068 non-null int64 City_Category 550068 non-null object Stay_In_Current_City_Years 550068 non-null object Marital_Status 550068 non-null int64 Product_Category_1 550068 non-null int64 Product_Category_2 376430 non-null float64 Product_Category_3 166821 non-null float64 Purchase 550068 non-null int64 dtypes: float64(2), int64(5), object(5) memory usage: 50.4+ MB
In [8]:
# Count the missing values in every column.
df.isna().sum()
Out[8]:
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64
In [9]:
# Heatmap of the null mask: highlighted cells are missing values, so the
# columns with the most gaps stand out.
sns.heatmap(df.isna(), cbar=False, yticklabels=False)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f6316265f8>
Notebook Image
In [10]:
# Drop Product_Category_2, one of the two columns that contain missing values.
df.drop(columns=['Product_Category_2'], inplace=True)
In [11]:
# Drop Product_Category_3, the column with the most missing values.
df.drop(columns=['Product_Category_3'], inplace=True)
In [12]:
# Redraw the null-mask heatmap for the frame as it stands now.
sns.heatmap(df.isna(), cbar=False, yticklabels=False)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f6323a4ef0>
Notebook Image
In [13]:
# Recount the missing values per column.
df.isna().sum()
Out[13]:
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Purchase                      0
dtype: int64
In [14]:
# Preview the first rows of the frame.
df.head()
Out[14]:
In [15]:
# Re-check dtypes and non-null counts for the remaining columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 10 columns): User_ID 550068 non-null int64 Product_ID 550068 non-null object Gender 550068 non-null object Age 550068 non-null object Occupation 550068 non-null int64 City_Category 550068 non-null object Stay_In_Current_City_Years 550068 non-null object Marital_Status 550068 non-null int64 Product_Category_1 550068 non-null int64 Purchase 550068 non-null int64 dtypes: int64(5), object(5) memory usage: 42.0+ MB
In [16]:
# Encode the two-category Gender column as an integer flag: 1 where the value
# is 'M', 0 otherwise.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would compare the already-encoded integers with 'M' and silently set every
# row to 0.
if not pd.api.types.is_integer_dtype(df['Gender']):
    df['Gender'] = (df['Gender'] == 'M').astype(int)
df['Gender'].value_counts()
Out[16]:
1    414259
0    135809
Name: Gender, dtype: int64
In [17]:
# Check the frequency of each City_Category value.
df['City_Category'].value_counts()
Out[17]:
B    231173
C    171175
A    147720
Name: City_Category, dtype: int64
In [18]:
# Disabled alternative: one-hot encode City_Category with pd.get_dummies.
# NOTE(review): if this is ever re-enabled, the last line must not combine an
# assignment with inplace=True -- drop(..., inplace=True) returns None, so
# `df = ...` would rebind df to None.
#dummies = pd.get_dummies(df['City_Category'])
#df = pd.concat([df, dummies], axis=1)
#df = df.drop(['City_Category'], inplace=True, axis=1)
    
In [19]:
# Build 0/1 indicator columns for city categories 'A' and 'B'; rows with both
# flags at 0 are the rows whose category is neither. The source column is then
# removed.
df['City_A'] = (df['City_Category'] == 'A').astype(int)
df['City_B'] = (df['City_Category'] == 'B').astype(int)
df.drop(columns=['City_Category'], inplace=True)
In [ ]:
 
In [20]:
# Show the DataFrame as it stands after the City_Category encoding.
df
Out[20]:
In [21]:
# Row and column counts after adding City_A / City_B and dropping City_Category.
df.shape
Out[21]:
(550068, 11)
In [22]:
# Preview the first rows of the frame.
df.head()
Out[22]:
In [23]:
# Re-check dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 11 columns): User_ID 550068 non-null int64 Product_ID 550068 non-null object Gender 550068 non-null int32 Age 550068 non-null object Occupation 550068 non-null int64 Stay_In_Current_City_Years 550068 non-null object Marital_Status 550068 non-null int64 Product_Category_1 550068 non-null int64 Purchase 550068 non-null int64 City_A 550068 non-null int32 City_B 550068 non-null int32 dtypes: int32(3), int64(5), object(3) memory usage: 39.9+ MB
In [24]:
# Remove the identifier columns so they do not end up in the feature matrix built later.
df.drop(columns=['Product_ID', 'User_ID'], inplace=True)
In [25]:
# Check which labels Stay_In_Current_City_Years takes and how often.
df['Stay_In_Current_City_Years'].value_counts()
Out[25]:
1     193821
2     101838
3      95285
4+     84726
0      74398
Name: Stay_In_Current_City_Years, dtype: int64
In [26]:
#df['Stay_In_Current_City_Years']=np.where(df['Stay_In_Current_City_Years'].str[:1]=='4',4,df['Stay_In_Current_City_Years'])
In [27]:
# Turn the stay duration into numbers: the open-ended '4+' label is read as 4,
# then the whole column is parsed as numeric.
df['Stay_In_Current_City_Years'] = pd.to_numeric(
    df['Stay_In_Current_City_Years'].replace('4+', '4')
)
In [28]:
#to_numeric()
In [29]:
# Check which age-bracket labels occur and how often.
df['Age'].value_counts()
Out[29]:
26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
Name: Age, dtype: int64
In [30]:
#df['Age']=np.where(df['Age']=='55+', 55, df['Age'])
#df['Age']=pd.to_numeric(df['Age'])
In [31]:
# Split each age label on '-' into separate columns: column 0 holds the part
# before the dash, column 1 the part after it. Labels without a dash (the
# '55+' bracket) leave column 1 empty.
k=df['Age'].str.split('-',expand=True)
In [32]:
# Inspect the split age-bracket columns.
k
Out[32]:
In [33]:

# The open-ended '55+' label is not a number; use 55 as its lower bound.
k[0] = k[0].mask(k[0] == '55+', 55)
In [34]:
# Inspect the lower bounds after replacing '55+'.
k[0]
Out[34]:
0          0
1          0
2          0
3          0
4         55
5         26
6         46
7         46
8         46
9         26
10        26
11        26
12        26
13        26
14        51
15        51
16        51
17        51
18        36
19        26
20        26
21        26
22        26
23        26
24        26
25        26
26        26
27        26
28        26
29        36
          ..
550038    36
550039    26
550040    26
550041    46
550042    51
550043    46
550044    26
550045    26
550046     0
550047    26
550048    36
550049    36
550050    36
550051    26
550052    46
550053    36
550054    36
550055    26
550056    26
550057    26
550058    26
550059    26
550060    36
550061    26
550062    46
550063    51
550064    26
550065    26
550066    55
550067    46
Name: 0, Length: 550068, dtype: object
In [35]:
# Fill the missing upper bounds (rows whose label had no '-') with NaN.
# `pd.np` was only an alias for NumPy; it was deprecated in pandas 1.0 and
# removed in 2.0, so the notebook's own `np` import is used instead.
# Assigning the result back avoids depending on an in-place fill of a column
# selection propagating to the parent frame.
k[1] = k[1].fillna(value=np.nan)
In [36]:
# Inspect the upper bounds; rows from the '55+' bracket show up as NaN.
k[1]
Out[36]:
0          17
1          17
2          17
3          17
4         NaN
5          35
6          50
7          50
8          50
9          35
10         35
11         35
12         35
13         35
14         55
15         55
16         55
17         55
18         45
19         35
20         35
21         35
22         35
23         35
24         35
25         35
26         35
27         35
28         35
29         45
         ... 
550038     45
550039     35
550040     35
550041     50
550042     55
550043     50
550044     35
550045     35
550046     17
550047     35
550048     45
550049     45
550050     45
550051     35
550052     50
550053     45
550054     45
550055     35
550056     35
550057     35
550058     35
550059     35
550060     45
550061     35
550062     50
550063     55
550064     35
550065     35
550066    NaN
550067     50
Name: 1, Length: 550068, dtype: object
In [37]:
# Rows with no upper bound (the '55+' bracket) get 55 as their upper bound.
k[1] = k[1].mask(k[1].isna(), 55)
In [38]:
# Frequency of each upper bound. 55 is listed twice -- presumably because the
# column still mixes the string '55' from the split with the integer 55 that
# was just filled in; the numeric conversion that follows merges them.
k[1].value_counts()
Out[38]:
35    219587
45    110013
25     99660
50     45701
55     38501
55     21504
17     15102
Name: 1, dtype: int64
In [39]:
# Parse the upper bounds as numbers so they can be used in arithmetic.
k[1]=pd.to_numeric(k[1])
In [40]:
# Parse the lower bounds as numbers so they can be used in arithmetic.
k[0]=pd.to_numeric(k[0])
In [41]:
# Represent each age bracket by the midpoint of its lower and upper bounds.
df['Age1'] = k[0].add(k[1]).div(2)

In [42]:
# Distribution of the bracket midpoints; dropna=False would also list NaN if any were present.
df['Age1'].value_counts(dropna=False)
Out[42]:
30.5    219587
40.5    110013
21.5     99660
48.0     45701
53.0     38501
55.0     21504
8.5      15102
Name: Age1, dtype: int64
In [43]:
# Check column dtypes; the original Age label column is still present at this point.
df.dtypes
Out[43]:
Gender                          int32
Age                            object
Occupation                      int64
Stay_In_Current_City_Years      int64
Marital_Status                  int64
Product_Category_1              int64
Purchase                        int64
City_A                          int32
City_B                          int32
Age1                          float64
dtype: object
In [44]:
# The numeric Age1 column now stands in for the bracket labels; drop the label column.
df.drop(columns=['Age'], inplace=True)
In [45]:
# Re-check dtypes and non-null counts after dropping Age.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 9 columns): Gender 550068 non-null int32 Occupation 550068 non-null int64 Stay_In_Current_City_Years 550068 non-null int64 Marital_Status 550068 non-null int64 Product_Category_1 550068 non-null int64 Purchase 550068 non-null int64 City_A 550068 non-null int32 City_B 550068 non-null int32 Age1 550068 non-null float64 dtypes: float64(1), int32(3), int64(5) memory usage: 31.5 MB
In [46]:
# Row and column counts of the frame used for modelling.
df.shape
Out[46]:
(550068, 9)
In [47]:
# Purchase is the regression target; every other column is a feature.
y_train1 = df['Purchase']
x_train1 = df.drop(columns='Purchase')
In [48]:
from sklearn.linear_model import LinearRegression
In [49]:
# Linear regression estimator with scikit-learn's default settings.
lm=LinearRegression()
In [50]:
# Fit on every row of the frame (x_train1 / y_train1 are not a subset).
lm.fit(x_train1,y_train1)
Out[50]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [51]:
# NOTE(review): these are the same rows the model was fitted on (x_train1 /
# y_train1 are built from df by the identical expressions), so any metric
# computed from them is an in-sample error, not held-out performance.
y_train2 = df['Purchase']
x_train2 = df.drop(columns='Purchase')
In [52]:
# Predict Purchase for every row of x_train2 with the fitted model.
predict_ir=lm.predict(x_train2)
In [53]:
#Performance Measure
from sklearn.metrics import mean_absolute_error
In [54]:
# Mean absolute error between the actual and predicted Purchase values.
mean_absolute_error(y_train2,predict_ir)
Out[54]:
3598.764302632832
In [55]:
from sklearn.metrics import mean_squared_error
In [56]:
# Mean squared error between the actual and predicted Purchase values.
mean_squared_error(y_train2,predict_ir)
Out[56]:
22082060.135549378
In [57]:
# Root mean squared error: the square root of the MSE, in the same units as Purchase.
np.sqrt(mean_squared_error(y_train2,predict_ir))
Out[57]:
4699.155257655292
In [ ]: