Learn practical skills, build real-world projects, and advance your career
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the training set; the path is relative to the current working directory.
data = pd.read_csv("train.csv")

# Print the (rows, columns) tuple as a quick sanity check on the load.
print(data.shape)

# Preview the first five rows (rendered as the cell result in a notebook).
data.head(n=5)
(1460, 81)
# Fraction (0..1) of missing entries per column, computed once for the whole
# frame instead of re-scanning each column for both the filter and the report.
na_fraction = data.isnull().mean()

# Keep every column with at least one missing value. The threshold is "> 0":
# a "> 1" test would silently drop columns that have exactly one missing entry.
vars_with_na = [var for var in data.columns if na_fraction[var] > 0]

# Report the share of missing values per column as a real percentage, so the
# number matches the '%' label (the raw mean is a 0..1 fraction).
for var in vars_with_na:
    print(var, np.round(na_fraction[var] * 100, 1), '% missing values')
LotFrontage 0.177 % missing values Alley 0.938 % missing values MasVnrType 0.005 % missing values MasVnrArea 0.005 % missing values BsmtQual 0.025 % missing values BsmtCond 0.025 % missing values BsmtExposure 0.026 % missing values BsmtFinType1 0.025 % missing values BsmtFinType2 0.026 % missing values FireplaceQu 0.473 % missing values GarageType 0.055 % missing values GarageYrBlt 0.055 % missing values GarageFinish 0.055 % missing values GarageQual 0.055 % missing values GarageCond 0.055 % missing values PoolQC 0.995 % missing values Fence 0.808 % missing values MiscFeature 0.963 % missing values
def analyse_na_value(df, var):
    """Plot the median SalePrice for rows where ``var`` is missing vs present.

    The column ``var`` is replaced, on a private copy of ``df``, by an
    indicator that is 1 where the value is missing and 0 otherwise. The
    median of ``SalePrice`` is then computed per indicator value and drawn
    as a bar chart titled with the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``var`` and a ``SalePrice`` column. It is not
        modified.
    var : str
        Name of the column whose missingness is analysed.

    Returns
    -------
    None
        The chart is shown via ``plt.show()``.
    """
    # Work on a copy of the frame that was passed in, not the global `data`,
    # so the function honours its argument and leaves the caller's frame intact.
    df = df.copy()

    # 1 = value missing, 0 = value present.
    df[var] = np.where(df[var].isnull(), 1, 0)

    df.groupby(var)['SalePrice'].median().plot.bar()
    plt.title(var)
    plt.show()

# Draw one missing-vs-present median SalePrice chart per column that has gaps.
for var in vars_with_na:
    analyse_na_value(df=data, var=var)
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
Notebook Image
# Collect the names of all columns whose dtype is not object ('O').
nums_vars = [
    name
    for name, dtype in data.dtypes.items()
    if dtype != 'O'
]

print("number of numerical variable: " ,len(nums_vars))

# Preview the first five rows restricted to those columns.
data[nums_vars].head(n=5)
number of numerical variable: 38