Sign In
In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

%matplotlib inline
In [2]:
bank_df = pd.read_csv('C:/Users/msafar/Desktop/Datasets/Bank/bank-full.csv')
In [3]:
bank_df2 = pd.read_csv('C:/Users/msafar/Desktop/Datasets/Bank/bank.csv')
In [137]:
def strip_quotes(x):
    if x[0] == '"':
        return x[1:-1]
        return x
In [279]:
output = pd.DataFrame(list(map(strip_quotes, bank_df.loc[0][0].split(';')))).T
output.columns = list(map(strip_quotes, bank_df.columns.values[0].split(';')))

for i in range(1,len(bank_df)):
    temp = pd.DataFrame(list(map(strip_quotes, bank_df.loc[i][0].split(';')))).T
    temp.columns = list(map(strip_quotes, bank_df.columns.values[0].split(';')))
    output = output.append(temp)
In [166]:
out = pd.read_csv(r'C:/Users/msafar/Desktop/Datasets/Bank/bank_full_fixed.csv')
In [167]:
out.drop(['Unnamed: 0', 'day', 'month'], axis=1, inplace=True)
In [168]:
# Turn all 'yes' to 1 and 'no' to 0 for binary columns 
out = out.mask(out=="no").fillna(0)
out = out.mask(out=="yes").fillna(1)

# Change all columns with categorical data to Pandas Categorical type
out['job'] = pd.Categorical(out['job'])
out['marital'] = pd.Categorical(out['marital'])
out['education'] = pd.Categorical(out['education'])
out['contact'] = pd.Categorical(out['contact'])
out['poutcome'] = pd.Categorical(out['poutcome'])
In [169]:
# Create one hot encoded data for each categorical data
job = pd.get_dummies(out['job'], prefix='job', drop_first=True)
marital = pd.get_dummies(out['marital'], prefix = 'marital', drop_first=True)
education = pd.get_dummies(out['education'], prefix='education', drop_first=True)
contact = pd.get_dummies(out['contact'], prefix='contact', drop_first=True)
poutcome = pd.get_dummies(out['poutcome'], prefix='poutcome', drop_first=True)
In [170]:
# Concatenate the main dataframe with each dummy frame and drop the category columns
out = pd.concat([out, job], axis=1)
out = pd.concat([out, marital], axis=1)
out = pd.concat([out, education], axis=1)
out = pd.concat([out, contact], axis=1)
out = pd.concat([out, poutcome], axis=1)
out.drop(['job', 'marital', 'education', 'contact', 'poutcome'], axis=1, inplace=True)
In [141]:
# Map months in strings to numerical values
# months = {'jan':1,'feb':2, 'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
# out['month'] = out['month'].map(months)
In [111]:
sns.distplot(out['age'], kde=False, color='darkred', bins=40)
<matplotlib.axes._subplots.AxesSubplot at 0x2a1932cb518>
Notebook Image
In [113]:
sns.distplot(out['balance'], kde=False, color='darkred', bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x2a196953ba8>
Notebook Image
In [114]:
sns.distplot(out['duration'], kde=False, color='darkred', bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x2a195698a58>
Notebook Image
In [125]:
out.drop('pdays', axis=1, inplace=True)
In [115]:
from sklearn.model_selection import train_test_split
In [126]:
X_train, X_test, y_train, y_test = train_test_split(out.drop('y',axis=1), 
                                                    out['y'], test_size=0.30, 
In [117]:
from sklearn.linear_model import LogisticRegression
In [134]:
age                    int64
default                int64
balance                int64
housing                int64
loan                   int64
education_secondary    uint8
education_tertiary     uint8
education_unknown      uint8
contact_telephone      uint8
contact_unknown        uint8
Length: 29, dtype: object
In [127]:
logmodel = LogisticRegression(),y_train)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-127-63530b1fb37f> in <module> 1 logmodel = LogisticRegression() ----> 2,y_train) c:\users\msafar\appdata\local\programs\python\python37\lib\site-packages\sklearn\linear_model\ in fit(self, X, y, sample_weight) 1530 1531 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C", -> 1532 accept_large_sparse=solver != 'liblinear') 1533 check_classification_targets(y) 1534 self.classes_ = np.unique(y) c:\users\msafar\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\ in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator) 717 ensure_min_features=ensure_min_features, 718 warn_on_dtype=warn_on_dtype, --> 719 estimator=estimator) 720 if multi_output: 721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, c:\users\msafar\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\ in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 494 try: 495 warnings.simplefilter('error', ComplexWarning) --> 496 array = np.asarray(array, dtype=dtype, order=order) 497 except ComplexWarning: 498 raise ValueError("Complex data not supported\n" c:\users\msafar\appdata\local\programs\python\python37\lib\site-packages\numpy\core\ in asarray(a, dtype, order) 83 84 """ ---> 85 return array(a, dtype, copy=False, order=order) 86 87 ValueError: could not convert string to float: 'unknown'
In [122]:
In [124]:
In [ ]: