Learn practical skills, build real-world projects, and advance your career
Created 2 years ago
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Read the training set from the CSV file in the working directory.
data = pd.read_csv("train.csv")

# Report the (rows, columns) dimensions, then preview the first rows.
# `data.head()` stays last so a notebook cell renders it as the cell output.
print(data.shape)
data.head()
(1460, 81)
# Columns that contain at least one missing value.  The previous threshold
# (`sum() > 1`) silently skipped any column with exactly one missing entry.
vars_with_na = [var for var in data.columns if data[var].isnull().any()]

# Report the share of missing entries per column.  The mean of the boolean
# null-mask is a fraction in [0, 1]; it is scaled by 100 so that the number
# agrees with the '%' label printed next to it.
for var in vars_with_na:
    print(var, np.round(data[var].isnull().mean() * 100, 1), '% missing values')
LotFrontage 0.177 % missing values
Alley 0.938 % missing values
MasVnrType 0.005 % missing values
MasVnrArea 0.005 % missing values
BsmtQual 0.025 % missing values
BsmtCond 0.025 % missing values
BsmtExposure 0.026 % missing values
BsmtFinType1 0.025 % missing values
BsmtFinType2 0.026 % missing values
FireplaceQu 0.473 % missing values
GarageType 0.055 % missing values
GarageYrBlt 0.055 % missing values
GarageFinish 0.055 % missing values
GarageQual 0.055 % missing values
GarageCond 0.055 % missing values
PoolQC 0.995 % missing values
Fence 0.808 % missing values
MiscFeature 0.963 % missing values
def analyse_na_value(df, var):
    """Plot the median SalePrice of rows where `var` is missing vs. present.

    The column `var` is replaced, on a copy of `df`, by an indicator that is
    1 where the original value was null and 0 otherwise.  The rows are then
    grouped by that indicator and the median of 'SalePrice' in each group is
    drawn as a bar chart titled with the column name.

    Parameters:
        df: DataFrame containing the column `var` and a 'SalePrice' column.
            It is not modified.
        var: Name of the column whose missing values are analysed.

    Returns:
        None.  The chart is displayed with `plt.show()`.
    """
    # Copy the frame that was actually passed in, so the caller's data is left
    # untouched.  (Previously the global `data` was copied and the `df`
    # argument was ignored.)
    df = df.copy()

    # 1 = value was missing, 0 = value was present.
    df[var] = np.where(df[var].isnull(), 1, 0)

    df.groupby(var)['SalePrice'].median().plot.bar()
    plt.title(var)
    plt.show()
# Draw the missing-vs-present median SalePrice chart for each column that was
# collected in `vars_with_na`.
for var in vars_with_na:
    analyse_na_value(data, var)
# Names of the numerical columns, in their original column order.
# Testing `dtype != 'O'` would count every non-object dtype as numerical
# (e.g. pandas' dedicated string dtype, categoricals, datetimes), so the
# columns are checked with pandas' own numeric-dtype predicate instead.
nums_vars = [
    var for var in data.columns
    if pd.api.types.is_numeric_dtype(data[var])
]

print("number of numerical variable: ", len(nums_vars))

# Preview only the numerical columns; kept last so a notebook cell renders it.
data[nums_vars].head()
number of numerical variable: 38