Jovian
⭐️
Sign In

Debt Prediction using Linear Regression

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split as t_t_s
In [2]:
df = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Credit.csv")
In [3]:
df.head()
Out[3]:
In [4]:
# Predictor columns used to model the credit-card balance.
# NOTE(review): 'Married' appears to be categorical — presumably already
# numeric in this CSV, since preprocessing.scale below would fail on
# strings; verify against the loaded frame.
feature_cols =['Income', 'Limit', 'Rating', 'Cards','Age', 'Education','Married']
# Target variable: the card balance we want to predict.
bal = df['Balance']
In [5]:
# Standardize every feature to zero mean and unit variance so that the
# regression coefficients are on comparable scales.
# NOTE(review): scaling is fit on the FULL dataset before the
# train/test split below, so test-set statistics leak into the
# features — fitting a StandardScaler on the training split only
# would be the cleaner protocol.
feature_frame = df[feature_cols]
X = preprocessing.scale(feature_frame)

In [6]:
# Splitting the dataset into testing and training:
# 24% of the rows are held out for testing; random_state pins the
# shuffle so the split (and all downstream metrics) is reproducible.
X_train, X_test, y_train, y_test = t_t_s(X, bal, test_size=0.24,random_state=9)

Linear Regression

In [7]:
# Fit an ordinary least-squares model on the training split, then
# score it on the held-out test split.  `linear_reg` is reused by the
# cross-validation cell below, so the name must not change.
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_predict = linear_reg.predict(X_test)

print(y_predict)

# Root Mean Square Error: the square root of the mean squared error,
# which puts the metric back in the same units as the balance itself.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_predict))

print(rmse)
[ 316.89813114 677.5093907 489.03669272 -49.82989971 706.42913216 864.96954115 897.65752581 108.20297732 -213.73474279 834.94198677 11.05109139 -227.34447869 472.01450822 4.54554458 -228.17349827 956.21537845 896.91570524 821.97824267 18.37144232 857.31517013 1028.30326279 699.22168497 1176.9426756 666.28578465 643.94064891 690.63628696 589.35273903 -29.37237394 575.15444917 406.78468032 845.81453327 828.30862904 82.2376206 953.40592514 -184.74828227 525.84335889 1045.19205022 535.04104472 107.26753118 -172.8968111 497.84037664 1155.35533939 429.08674601 403.49872552 143.72087313 764.78737012 431.24114308 1304.79656588 475.00132057 95.75404622 -215.69000296 5.62900657 313.88797405 859.9328932 792.27147016 1029.90210771 1500.40180297 1078.48617624 713.37981473 1072.97948262 -64.12187718 488.60063387 92.3601712 440.414465 277.13284667 1075.41508762 258.10181743 773.89657462 278.12447916 409.51029492 335.56674013 989.87781589 978.31940615 1042.6230052 915.82319179 150.95645254 516.7089988 733.47654088 74.84330655 556.47429962 1096.60831626 359.12494251 204.54562103 407.76115868 113.50656023 958.07237887 934.59051997 2.98452112 1627.80317193 636.31395153 688.17824441 711.80910955 293.66354104 -219.28997401 -91.76701391 -149.31642723] 143.24649739642265

Applying 10-fold cross-validation with linear regression:

In [8]:
# 10-fold cross-validation over the whole dataset.  sklearn scorers are
# "greater is better", so MSE is reported negated — hence the sign flip
# into mse_list_positive before taking square roots downstream.
mse_list = cross_val_score(linear_reg, X, bal, cv=10, scoring='neg_mean_squared_error')
mse_list_positive = - mse_list

print(mse_list)
print("------------------------------------------------------------------------------------------------------------")
print(mse_list_positive)
[-23646.90415343 -32003.04401232 -35462.64435619 -37327.60719635 -14341.32205939 -33628.37104224 -31631.99317834 -12491.00334951 -20749.61212175 -23204.94743459] ------------------------------------------------------------------------------------------------------------ [23646.90415343 32003.04401232 35462.64435619 37327.60719635 14341.32205939 33628.37104224 31631.99317834 12491.00334951 20749.61212175 23204.94743459]
In [9]:
# calculate the average RMSE as final result of cross validation:
# element-wise square root converts each fold's MSE back into the
# original units of Balance before averaging.
rmse_list = np.sqrt(mse_list_positive)
print(rmse_list)
print("------------------------------------------------------------------------------------------------------------")
print(rmse_list.mean())
[153.7754992 178.89394627 188.31527914 193.20353826 119.75525901 183.38039983 177.85385343 111.76315739 144.04725656 152.331702 ] ------------------------------------------------------------------------------------------------------------ 160.33198910744136

RMSE:

In [10]:
# Final reported metric: the mean cross-validated RMSE (same value as
# printed in the previous cell).
print(rmse_list.mean())
160.33198910744136