Jovian
⭐️
Sign In

Linear Regression Project

-Shreejaya Bharathan and Aakanksha Nallabothula Surya

In [4]:
import numpy as np
import matplotlib.pyplot as plt 

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
import matplotlib.cm as cm
from IPython.display import display
from mpl_toolkits.mplot3d import Axes3D
from sklearn.feature_selection import f_regression
from statsmodels.stats.anova import anova_lm
In [5]:
student_data = pd.read_csv('./student-mat.csv')
In [6]:
student_data.head()
Out[6]:

Test train split

In [7]:
### creating a subset of data for prediction 
test_data = student_data.sample(n=10)
print(list(test_data.index))
### updating the training data
student_data = student_data.drop(list(test_data.index))
student_data.shape
[0, 88, 258, 84, 117, 158, 324, 42, 26, 335]
Out[7]:
(385, 33)
Selecting variables of intereset for our research
In [8]:
student_data = student_data[['sex','famsize','Pstatus','famsup','studytime','failures','romantic','goout','absences','Dalc','traveltime','paid','schoolsup', 'G3']]
In [9]:
student_data.shape
Out[9]:
(385, 14)
In [10]:
features = student_data.drop('G3',axis=1)
In [11]:
features.columns
Out[11]:
Index(['sex', 'famsize', 'Pstatus', 'famsup', 'studytime', 'failures',
       'romantic', 'goout', 'absences', 'Dalc', 'traveltime', 'paid',
       'schoolsup'],
      dtype='object')

Explanatory Analysis

Scatter plots

In [12]:
## pairplot

import seaborn as sn
sn.pairplot(student_data)
Out[12]:
<seaborn.axisgrid.PairGrid at 0x12c2059d0>
Notebook Image
In [13]:
#scatter plot
import matplotlib.pyplot as plt
plt.figure(figsize=(35, 4))


# i: index
for i, col in enumerate(features.columns):
    plt.subplot(1, 13, i+1)
    x = features[col]
    y = student_data['G3']
    plt.plot(x, y, 'o')
    # Create regression line
    plt.plot(np.unique(x))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('grade')
Notebook Image

Correlation matrix

In [14]:
#correlation
import seaborn as sns
correlation_matrix = student_data.corr().round(2)
# annot = True to print the values inside the square
plt.figure(figsize = (16,5))
sns.heatmap(data=correlation_matrix, annot=True)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x130ca4390>
Notebook Image

Regression Analysis

FULL MODEL

In [15]:
reg = smf.ols('G3~  C(sex) + C(famsize) + C(Pstatus) + C(traveltime) + C(studytime)+failures +C(schoolsup)+ C(famsup)+C(paid)+ C(romantic) + C(goout) + C(Dalc) +absences', data = student_data).fit()
reg.summary()
Out[15]:

INDIVIDUAL T tests

sex

p-value < 0.05 => significant

In [16]:
reg_sex = smf.ols('G3~ C(sex)', data=student_data).fit()
reg_sex.summary()
Out[16]:

famsize

p-value > 0.05 => not significant

In [17]:
reg_famsize = smf.ols('G3~ C(famsize)', data=student_data).fit()
reg_famsize.summary()
Out[17]:

Pstatus

p-value > 0.05 => not-significant

In [18]:
reg_pstatus = smf.ols('G3~ C(Pstatus)', data=student_data).fit()
reg_pstatus.summary()
Out[18]:

traveltime

p-value > 0.05 so not significant

In [19]:
reg_travel = smf.ols('G3~ C(traveltime)', data=student_data).fit()
reg_travel.summary()
Out[19]:

studytime

p-value > 0.05 so not significant

In [20]:
reg_studytime = smf.ols('G3~ C(studytime)', data=student_data).fit()
reg_studytime.summary()
Out[20]:

failures

p-value <0.05 => significant

In [21]:
reg_failures = smf.ols('G3~ failures', data=student_data).fit()
reg_failures.summary()
Out[21]:

schoolsup

p-value > 0.05 so not significant

In [22]:
reg_schoolsup = smf.ols('G3~ C(schoolsup)', data=student_data).fit()
reg_schoolsup.summary()
Out[22]:

famsup

p-value > 0.05 so not significant

In [23]:
reg_famsup = smf.ols('G3~ C(famsup)', data=student_data).fit()
reg_famsup.summary()
Out[23]:

paid

p-value < 0.05 => significant

In [24]:
reg_paid = smf.ols('G3~ C(paid)', data=student_data).fit()
reg_paid.summary()
Out[24]:

romantic

p-value < 0.05 => significant

In [25]:
reg_romantic = smf.ols('G3~ C(romantic)', data=student_data).fit()
reg_romantic.summary()
Out[25]:

goout

p-value > 0.05 => not significant

In [26]:
reg_goout = smf.ols('G3~ C(goout)', data=student_data).fit()
reg_goout.summary()
Out[26]:

Dalc

p-value < 0.05 => significant

In [27]:
reg_dalc = smf.ols('G3~ C(Dalc)', data=student_data).fit()
reg_dalc.summary()
Out[27]:

absences

p-value > 0.05 => not significant

In [28]:
reg_absences = smf.ols('G3~ absences', data=student_data).fit()
reg_absences.summary()
Out[28]:

Significant variables : sex, failures, paid, romantic, Dalc, studytime

In [29]:
reg_chosen = smf.ols('G3~ C(sex) + failures + C(paid) + C(romantic) + C(Dalc) + C(studytime)', data=student_data).fit()
reg_chosen.summary()
Out[29]:

Significant variables : sex, failures, romantic, studytime

In [30]:
reg_full_model = smf.ols('G3~ C(sex) + failures + C(romantic) + C(studytime)', data=student_data).fit()
reg_full_model.summary()
Out[30]:
In [31]:
anova_model_full = sm.stats.anova_lm(reg_full_model, typ=1)
anova_model_full
Out[31]:

Model Selection - Best Subset Selection

Mallow's Cp Calculation

In [32]:
MSE_p = reg_full_model.ssr/(395-7)
In [33]:
## Mallow's Cp
def Cp_calc(SSE,k):
    n = 395    
    Cp = (SSE/MSE_p)+ 2*(k+1) -n 
    return Cp

Model 1 - no predictors

G3 ~ constant

In [34]:
student_data['null'] = 1
reg_model_1 = smf.ols('G3~ null', data = student_data).fit()

Cp_model_1 = Cp_calc(reg_model_1.ssr,0)
model_1 = [0,reg_model_1.rsquared, reg_model_1.rsquared_adj, Cp_model_1, 'none', reg_model_1.aic, reg_model_1.bic]
cp_table = pd.DataFrame(model_1).T

Model 2 - sex only

G3 ~ sex

In [35]:
reg_model_2 = smf.ols('G3~ C(sex)', data = student_data).fit()
Cp_model_2 = Cp_calc(reg_model_2.ssr,1)
model_2 = [1,reg_model_2.rsquared, reg_model_2.rsquared_adj, Cp_model_2, 'sex', reg_model_2.aic, reg_model_2.bic]
cp_table = cp_table.append(pd.DataFrame(model_2).T)

Model 3 - failures only

G3 ~ failures

In [36]:
reg_model_3 = smf.ols('G3~ failures', data = student_data).fit()
Cp_model_3 = Cp_calc(reg_model_3.ssr,1)
model_3 = [1,reg_model_3.rsquared, reg_model_3.rsquared_adj, Cp_model_3, 'failures', reg_model_3.aic, reg_model_3.bic]
cp_table = cp_table.append(pd.DataFrame(model_3).T)

Model 4 - romantic only

G3 ~ romatic

In [37]:
reg_model_4 = smf.ols('G3~C(romantic)', data = student_data).fit()
Cp_model_4 = Cp_calc(reg_model_4.ssr,1)
model_4 = [1,reg_model_4.rsquared, reg_model_4.rsquared_adj, Cp_model_4, 'romantic', reg_model_4.aic, reg_model_4.bic]
cp_table = cp_table.append(pd.DataFrame(model_4).T)

Model 5 - studytime only

G3 ~ studytime

In [38]:
reg_model_5 = smf.ols('G3~C(studytime)', data = student_data).fit()
Cp_model_5 = Cp_calc(reg_model_5.ssr,3)
model_5 = [1,reg_model_5.rsquared, reg_model_5.rsquared_adj, Cp_model_5, 'studytime', reg_model_5.aic, reg_model_5.bic]
cp_table = cp_table.append(pd.DataFrame(model_5).T)

Model 6 - sex and failures

G3 ~ sex + failures

In [39]:
reg_model_6 = smf.ols('G3~C(sex) + failures', data = student_data).fit()
Cp_model_6 = Cp_calc(reg_model_6.ssr,2)
model_6 = [2,reg_model_6.rsquared, reg_model_6.rsquared_adj, Cp_model_6, 'sex+failures', reg_model_6.aic, reg_model_6.bic]
cp_table = cp_table.append(pd.DataFrame(model_6).T)

Model 7 - sex and romantic

G3 ~ sex + romantic

In [40]:
reg_model_7 = smf.ols('G3~C(sex) + C(romantic)', data = student_data).fit()
Cp_model_7 = Cp_calc(reg_model_7.ssr,2)
model_7 = [2,reg_model_7.rsquared, reg_model_7.rsquared_adj, Cp_model_7, 'sex+romantic', reg_model_7.aic, reg_model_7.bic]
cp_table = cp_table.append(pd.DataFrame(model_7).T)

Model 8 - sex and studytime

G3 ~ sex + studytime

In [41]:
reg_model_8 = smf.ols('G3~C(sex) + C(studytime)', data = student_data).fit()
Cp_model_8 = Cp_calc(reg_model_8.ssr,4)
model_8 = [2,reg_model_8.rsquared, reg_model_8.rsquared_adj, Cp_model_8, 'sex+studytime', reg_model_8.aic, reg_model_8.bic]
cp_table = cp_table.append(pd.DataFrame(model_8).T)

Model 9 - failures and romantic

G3 ~ failures + romantic

In [42]:
reg_model_9 = smf.ols('G3~failures + C(romantic)', data = student_data).fit()
Cp_model_9 = Cp_calc(reg_model_9.ssr,2)
model_9 = [2,reg_model_9.rsquared, reg_model_9.rsquared_adj, Cp_model_9, 'failures+romantic', reg_model_9.aic, reg_model_9.bic]
cp_table = cp_table.append(pd.DataFrame(model_9).T)

Model 10 - failures and studytime

G3 ~ failures + studytime

In [43]:
reg_model_10 = smf.ols('G3~failures + C(studytime)', data = student_data).fit()
Cp_model_10 = Cp_calc(reg_model_10.ssr,4)
model_10 = [2,reg_model_10.rsquared, reg_model_10.rsquared_adj, Cp_model_10, 'failures+studytime', reg_model_10.aic, reg_model_10.bic]
cp_table = cp_table.append(pd.DataFrame(model_10).T)

Model 11 - romantic and studytime

G3 ~ romantic + studytime

In [44]:
reg_model_11 = smf.ols('G3~C(romantic) + C(studytime)', data = student_data).fit()
Cp_model_11 = Cp_calc(reg_model_11.ssr,4)
model_11 = [2,reg_model_11.rsquared, reg_model_11.rsquared_adj, Cp_model_11, 'romantic+studytime', reg_model_11.aic, reg_model_11.bic]
cp_table = cp_table.append(pd.DataFrame(model_11).T)

Model 12 - sex, failures and romantic

G3 ~ sex +failures + romantic

In [45]:
reg_model_12 = smf.ols('G3~C(sex) + failures + C(romantic)', data = student_data).fit()
Cp_model_12 = Cp_calc(reg_model_12.ssr,3)
model_12 = [3,reg_model_12.rsquared, reg_model_12.rsquared_adj, Cp_model_12, 'sex+failures+romantic', reg_model_12.aic, reg_model_12.bic]
cp_table = cp_table.append(pd.DataFrame(model_12).T)

Model 13 - sex, failures and studytime

G3 ~ sex +failures + studytime

In [46]:
reg_model_13 = smf.ols('G3~C(sex) + failures + C(studytime)', data = student_data).fit()
Cp_model_13 = Cp_calc(reg_model_13.ssr,5)
model_13 = [3,reg_model_13.rsquared, reg_model_13.rsquared_adj, Cp_model_13, 'sex+failues+studytime', reg_model_13.aic, reg_model_13.bic]
cp_table = cp_table.append(pd.DataFrame(model_13).T)

Model 14 - sex, romantic, studytime

G3 ~ sex+romantic+studytime

In [47]:
reg_model_14 = smf.ols('G3~C(sex) + C(romantic) + C(studytime)', data = student_data).fit()
Cp_model_14 = Cp_calc(reg_model_14.ssr,5)
model_14 = [3,reg_model_14.rsquared, reg_model_14.rsquared_adj, Cp_model_14, 'sex+romantic+studytime', reg_model_14.aic, reg_model_14.bic]
cp_table = cp_table.append(pd.DataFrame(model_14).T)

Model 15 - failures, romantic,studytime

G3 ~ failures+romantic+studytime

In [48]:
reg_model_15 = smf.ols('G3~failures+ C(romantic) + C(studytime)', data = student_data).fit()
Cp_model_15 = Cp_calc(reg_model_15.ssr,5)
model_15 = [3,reg_model_15.rsquared, reg_model_15.rsquared_adj, Cp_model_15, 'failures+romantic+studytime', reg_model_15.aic, reg_model_15.bic]
cp_table = cp_table.append(pd.DataFrame(model_15).T)

Model 16 - sex,failures,romantic,studytime

G3~ sex+failures+romantic+studytime

In [49]:
Cp_model_16 = Cp_calc(reg_full_model.ssr,6)
model_16 = [4,reg_full_model.rsquared, reg_full_model.rsquared_adj, Cp_model_16, 'sex+failures+romantic+studytime', reg_full_model.aic, reg_full_model.bic]
cp_table = cp_table.append(pd.DataFrame(model_16).T)

mallows Cp, AIC, BIC table

In [50]:
cp_table.columns = ["Vars","Rsquared", "Adj_Rsquared", "Mallows_Cp","Predictors","AIC","BIC"]
cp_table
Out[50]:
In [51]:
cp_table.sort_values('Mallows_Cp')[:4]
Out[51]:

We choose full model because of high Adj_Rsquared value and sex+failures+romantic due to lowest Cp value

Out of these two, full model has lower AIC and higher Adj_Rsquared value so we choose this model although the other model has lower BIC value

Model Diagnosis

Selected model = G3 ~(sex+failures+romantic+studytime)

Fitted values vs residuals

In [52]:
student_data_infl = reg_full_model.get_influence()
#residuals v.s. fitted values
residuals = student_data_infl.resid
grades_fitted = reg_full_model.fittedvalues
plt.scatter(residuals,grades_fitted)
plt.xlabel("residuals")
plt.ylabel("Fitted Values")
plt.title("residuals vs. Fitted Values")
Out[52]:
Text(0.5, 1.0, 'residuals vs. Fitted Values')
Notebook Image
Points bounce around the zero line as expected

Heteroscedasticity check - Breusch Pagan test

In [53]:
from statsmodels.stats.diagnostic import het_breuschpagan
labels = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
In [54]:
bp_test_student_data = het_breuschpagan(reg_full_model.resid, reg_full_model.model.exog)
print(dict(zip(labels, bp_test_student_data)))
{'LM Statistic': 7.611099138440543, 'LM-Test p-value': 0.2680015058307023, 'F-Statistic': 1.2705706093292115, 'F-Test p-value': 0.26991852582302445}

p-value > alpha =0.05 => we do not have enough evidence to reject the null hypothesis. Hence the model does not have heteroscedasticity

Autocorrelation check - Breusch-Godfrey test.

In [55]:
bg_test_students =sm.stats.diagnostic.acorr_breusch_godfrey(reg_full_model)
labels_bg = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
print(dict(zip(labels_bg, bg_test_students)))
{'LM Statistic': 27.92953790865811, 'LM-Test p-value': 0.0322378087359484, 'F-Statistic': 1.769694954554213, 'F-Test p-value': 0.0335493557107013}

p-value > alpha =0.05 => we do not have enough evidence to reject the null hypothesis. Hence the model does not have an autocorrelation problem

Influential points

In [56]:
student_data_infl = reg_full_model.get_influence()
using internally studentized residuals
In [57]:
#threshold internally studentized residuals
import scipy
n_students = student_data.shape[0]
p_students = 7
seuil_stud = scipy.stats.t.ppf(0.975,df=n_students-p_students)

#detection - absolute value > threshold
atyp_stud = np.abs(student_data_infl.resid_studentized_internal) > seuil_stud
#which ones?
print(student_data.index[atyp_stud],student_data[atyp_stud])
Int64Index([131, 134, 135, 136, 140, 148, 162, 168, 198, 221, 239, 242, 244, 259, 264, 269, 296, 316, 332, 333, 334, 337, 341, 383, 387, 389], dtype='int64') sex famsize Pstatus famsup studytime failures romantic goout absences \ 131 F GT3 T yes 1 0 yes 3 0 134 M GT3 T yes 2 0 yes 3 0 135 F GT3 T yes 3 0 yes 3 0 136 M GT3 T no 2 0 no 5 0 140 M GT3 T yes 4 0 no 2 0 148 M GT3 T yes 1 0 yes 2 0 162 M LE3 T no 1 1 no 4 0 168 F GT3 T yes 2 0 no 5 0 198 F GT3 T yes 1 1 no 4 24 221 F GT3 T yes 3 1 yes 4 0 239 M GT3 T no 2 1 no 4 0 242 M LE3 T no 1 0 no 5 0 244 F GT3 T yes 3 0 yes 4 0 259 F LE3 T no 4 0 yes 1 0 264 F GT3 T yes 3 0 yes 3 0 269 F GT3 T yes 2 0 yes 5 0 296 F GT3 T yes 2 0 no 4 0 316 F GT3 T yes 2 0 no 3 0 332 F GT3 T no 2 0 no 4 0 333 F LE3 T no 2 0 yes 3 0 334 F GT3 T no 4 0 no 4 0 337 F GT3 T yes 2 0 yes 2 0 341 M GT3 T yes 2 1 no 3 0 383 M GT3 T no 1 1 no 2 0 387 F GT3 T no 3 1 no 2 0 389 F GT3 T no 2 1 no 1 0 Dalc traveltime paid schoolsup G3 null 131 1 3 no no 0 1 134 1 4 no no 0 1 135 1 1 no no 0 1 136 2 3 no no 0 1 140 1 2 no yes 0 1 148 2 1 no no 0 1 162 2 2 no no 0 1 168 1 1 yes no 0 1 198 2 2 no no 18 1 221 1 1 no no 0 1 239 3 1 no no 0 1 242 1 1 no no 0 1 244 1 2 yes no 0 1 259 1 1 yes no 0 1 264 1 1 yes no 0 1 269 1 2 no no 0 1 296 2 2 yes no 0 1 316 1 2 yes no 0 1 332 1 1 no no 0 1 333 1 1 no no 0 1 334 1 2 no no 0 1 337 2 1 yes no 0 1 341 2 1 no no 0 1 383 1 2 no no 0 1 387 1 1 no no 0 1 389 1 2 no no 0 1
using externally studentized residuals
In [58]:
t_val_students = scipy.stats.t.ppf(0.975,df=n_students-p_students-1)

#detection - absolute value > threshold
suspicious_external_studs = np.abs(student_data_infl.resid_studentized_external) > t_val_students
#which ones?
print(student_data.index[suspicious_external_studs],student_data[suspicious_external_studs])
Int64Index([131, 134, 135, 136, 140, 148, 162, 168, 198, 221, 239, 242, 244, 259, 260, 264, 269, 296, 316, 332, 333, 334, 337, 341, 376, 383, 387, 389], dtype='int64') sex famsize Pstatus famsup studytime failures romantic goout absences \ 131 F GT3 T yes 1 0 yes 3 0 134 M GT3 T yes 2 0 yes 3 0 135 F GT3 T yes 3 0 yes 3 0 136 M GT3 T no 2 0 no 5 0 140 M GT3 T yes 4 0 no 2 0 148 M GT3 T yes 1 0 yes 2 0 162 M LE3 T no 1 1 no 4 0 168 F GT3 T yes 2 0 no 5 0 198 F GT3 T yes 1 1 no 4 24 221 F GT3 T yes 3 1 yes 4 0 239 M GT3 T no 2 1 no 4 0 242 M LE3 T no 1 0 no 5 0 244 F GT3 T yes 3 0 yes 4 0 259 F LE3 T no 4 0 yes 1 0 260 F GT3 T yes 2 0 yes 2 21 264 F GT3 T yes 3 0 yes 3 0 269 F GT3 T yes 2 0 yes 5 0 296 F GT3 T yes 2 0 no 4 0 316 F GT3 T yes 2 0 no 3 0 332 F GT3 T no 2 0 no 4 0 333 F LE3 T no 2 0 yes 3 0 334 F GT3 T no 4 0 no 4 0 337 F GT3 T yes 2 0 yes 2 0 341 M GT3 T yes 2 1 no 3 0 376 F GT3 T yes 3 2 yes 3 4 383 M GT3 T no 1 1 no 2 0 387 F GT3 T no 3 1 no 2 0 389 F GT3 T no 2 1 no 1 0 Dalc traveltime paid schoolsup G3 null 131 1 3 no no 0 1 134 1 4 no no 0 1 135 1 1 no no 0 1 136 2 3 no no 0 1 140 1 2 no yes 0 1 148 2 1 no no 0 1 162 2 2 no no 0 1 168 1 1 yes no 0 1 198 2 2 no no 18 1 221 1 1 no no 0 1 239 3 1 no no 0 1 242 1 1 no no 0 1 244 1 2 yes no 0 1 259 1 1 yes no 0 1 260 1 1 yes no 18 1 264 1 1 yes no 0 1 269 1 2 no no 0 1 296 2 2 yes no 0 1 316 1 2 yes no 0 1 332 1 1 no no 0 1 333 1 1 no no 0 1 334 1 2 no no 0 1 337 2 1 yes no 0 1 341 2 1 no no 0 1 376 1 2 yes no 15 1 383 1 2 no no 0 1 387 1 1 no no 0 1 389 1 2 no no 0 1
using leverage
In [59]:
students_leverage = student_data_infl.hat_matrix_diag
suspicious_students_leverage = np.abs(students_leverage) > (3*p_students)/n_students
#which ones?
print(student_data.index[suspicious_students_leverage],student_data[suspicious_students_leverage])
Int64Index([], dtype='int64') Empty DataFrame Columns: [sex, famsize, Pstatus, famsup, studytime, failures, romantic, goout, absences, Dalc, traveltime, paid, schoolsup, G3, null] Index: []
using DFFITS
In [60]:
students_dffits = student_data_infl.dffits
suspicious_students_dffits = np.abs(students_dffits[0]) > 2*(((p_students+1)/(n_students-p_students-1))**(1/2))
#which ones?
print(student_data.index[suspicious_students_dffits],student_data[suspicious_students_dffits])
Int64Index([ 2, 47, 130, 131, 134, 135, 136, 140, 148, 157, 198, 221, 242, 244, 259, 264, 293, 303, 334, 376, 387], dtype='int64') sex famsize Pstatus famsup studytime failures romantic goout absences \ 2 F LE3 T no 2 3 no 2 10 47 M GT3 T no 4 0 no 2 4 130 F GT3 T yes 3 2 yes 2 0 131 F GT3 T yes 1 0 yes 3 0 134 M GT3 T yes 2 0 yes 3 0 135 F GT3 T yes 3 0 yes 3 0 136 M GT3 T no 2 0 no 5 0 140 M GT3 T yes 4 0 no 2 0 148 M GT3 T yes 1 0 yes 2 0 157 F GT3 T yes 1 3 no 5 6 198 F GT3 T yes 1 1 no 4 24 221 F GT3 T yes 3 1 yes 4 0 242 M LE3 T no 1 0 no 5 0 244 F GT3 T yes 3 0 yes 4 0 259 F LE3 T no 4 0 yes 1 0 264 F GT3 T yes 3 0 yes 3 0 293 F LE3 T yes 4 0 no 2 6 303 F GT3 T yes 4 0 no 2 0 334 F GT3 T no 4 0 no 4 0 376 F GT3 T yes 3 2 yes 3 4 387 F GT3 T no 3 1 no 2 0 Dalc traveltime paid schoolsup G3 null 2 2 1 yes yes 10 1 47 1 1 no no 20 1 130 2 2 no no 0 1 131 1 3 no no 0 1 134 1 4 no no 0 1 135 1 1 no no 0 1 136 2 3 no no 0 1 140 1 2 no yes 0 1 148 2 1 no no 0 1 157 1 3 no no 10 1 198 2 2 no no 18 1 221 1 1 no no 0 1 242 1 1 no no 0 1 244 1 2 yes no 0 1 259 1 1 yes no 0 1 264 1 1 yes no 0 1 293 1 2 yes no 18 1 303 1 1 yes no 18 1 334 1 2 no no 0 1 376 1 2 yes no 15 1 387 1 1 no no 0 1
using Cook's distance
In [61]:
student_cooks = student_data_infl.cooks_distance
suspicious_student_cooks = np.abs(student_cooks[0]) > 4/(n_students-p_students)
#which ones?
print(student_data.index[suspicious_student_cooks],student_data[suspicious_student_cooks])
Int64Index([ 2, 47, 130, 131, 134, 135, 136, 140, 148, 157, 198, 221, 242, 244, 259, 264, 293, 303, 314, 334, 338, 376, 387], dtype='int64') sex famsize Pstatus famsup studytime failures romantic goout absences \ 2 F LE3 T no 2 3 no 2 10 47 M GT3 T no 4 0 no 2 4 130 F GT3 T yes 3 2 yes 2 0 131 F GT3 T yes 1 0 yes 3 0 134 M GT3 T yes 2 0 yes 3 0 135 F GT3 T yes 3 0 yes 3 0 136 M GT3 T no 2 0 no 5 0 140 M GT3 T yes 4 0 no 2 0 148 M GT3 T yes 1 0 yes 2 0 157 F GT3 T yes 1 3 no 5 6 198 F GT3 T yes 1 1 no 4 24 221 F GT3 T yes 3 1 yes 4 0 242 M LE3 T no 1 0 no 5 0 244 F GT3 T yes 3 0 yes 4 0 259 F LE3 T no 4 0 yes 1 0 264 F GT3 T yes 3 0 yes 3 0 293 F LE3 T yes 4 0 no 2 6 303 F GT3 T yes 4 0 no 2 0 314 F GT3 T no 3 2 yes 2 14 334 F GT3 T no 4 0 no 4 0 338 F LE3 T yes 4 0 no 3 7 376 F GT3 T yes 3 2 yes 3 4 387 F GT3 T no 3 1 no 2 0 Dalc traveltime paid schoolsup G3 null 2 2 1 yes yes 10 1 47 1 1 no no 20 1 130 2 2 no no 0 1 131 1 3 no no 0 1 134 1 4 no no 0 1 135 1 1 no no 0 1 136 2 3 no no 0 1 140 1 2 no yes 0 1 148 2 1 no no 0 1 157 1 3 no no 10 1 198 2 2 no no 18 1 221 1 1 no no 0 1 242 1 1 no no 0 1 244 1 2 yes no 0 1 259 1 1 yes no 0 1 264 1 1 yes no 0 1 293 1 2 yes no 18 1 303 1 1 yes no 18 1 314 1 1 no no 13 1 334 1 2 no no 0 1 338 1 1 no no 17 1 376 1 2 yes no 15 1 387 1 1 no no 0 1

Influential points :

In [62]:
merged_student_susp = set(student_data.index[atyp_stud])
merged_student_susp = merged_student_susp.union(set(student_data.index[suspicious_external_studs]))
merged_student_susp = merged_student_susp.union(set(student_data.index[suspicious_students_dffits]))
merged_student_susp = merged_student_susp.union(set(student_data.index[suspicious_student_cooks]))
list(merged_student_susp)
Out[62]:
[2,
 131,
 259,
 387,
 134,
 135,
 136,
 264,
 389,
 260,
 140,
 269,
 130,
 148,
 157,
 162,
 293,
 168,
 296,
 47,
 303,
 314,
 316,
 198,
 332,
 333,
 334,
 337,
 338,
 341,
 221,
 239,
 242,
 244,
 376,
 383]

Influence Plot

In [63]:
#graphical representation of the influences()
sm.graphics.influence_plot(reg_full_model)
Out[63]:
Notebook Image
Notebook Image

Fitted model after removing influential points

In [64]:
students_not_influential = student_data.drop(list(merged_student_susp))
In [65]:
reg_not_influential =  smf.ols('G3~ C(sex) + failures + C(romantic) + C(studytime)', data=students_not_influential).fit()
reg_not_influential.summary()
Out[65]:
In [66]:
reg_full_model.summary()
Out[66]:
After removing influential points, romantic has become insignificant and the p-values for other predictors have also changed indicating the points were highly influential

VIF test

failures
In [67]:
failures_reg_vif = smf.ols('failures ~ C(sex) + C(romantic) + C(studytime)', data=student_data).fit()
failures_reg_vif.summary()
Out[67]:
In [68]:
failures_vif = 1/(1 - 0.043)
failures_vif ## no multicollinearity problem
Out[68]:
1.044932079414838
sex
In [69]:
##dummy coding
vif_sex_data = student_data.copy()
vif_sex_data['sex_female'] = 1
vif_sex_data.loc[(vif_sex_data.sex == 'M'),'sex_female'] = 0
In [70]:
vif_sex_data.head()
Out[70]:
In [71]:
sex_reg_vif = smf.ols('sex_female ~ failures + C(romantic) + C(studytime)', data=vif_sex_data).fit()
sex_reg_vif.summary()
Out[71]:
In [72]:
sex_vif = 1/(1 - 0.139)
sex_vif ## no multicollinearity problem
Out[72]:
1.1614401858304297
romantic
In [73]:
##dummy coding
vif_romantic_data = student_data.copy()
vif_romantic_data['is_romantic'] = 1
vif_romantic_data.loc[(vif_sex_data.romantic == 'no'),'is_romantic'] = 0
In [74]:
vif_romantic_data.head()
Out[74]:
In [75]:
romantic_reg_vif = smf.ols('is_romantic ~ failures + C(sex) + C(studytime)', data=vif_romantic_data).fit()
romantic_reg_vif.summary()
Out[75]:
In [76]:
romantic_vif = 1/(1 - 0.044)
romantic_vif ## no multicollinearity problem
Out[76]:
1.0460251046025104

Predictions

In [77]:
reg_full_model.predict(test_data[['sex','failures', 'romantic', 'studytime']])
Out[77]:
0      10.595147
88      9.711669
258    11.824053
84     10.595147
117    11.647765
158    11.647765
324    11.917651
42     11.824053
26     11.647765
335    11.917651
dtype: float64

RMSE VALUE

In [78]:
# Full model
((sum(test_data['G3']- reg_full_model.predict(test_data[['sex','failures', 'romantic', 'studytime']]))**2)/10)**0.5
Out[78]:
4.323255462645139
In [79]:
# Without the influential
((sum(test_data['G3']- reg_not_influential.predict(test_data[['sex','failures', 'romantic', 'studytime']]))**2)/10)**0.5
Out[79]:
2.7312997881559005
In [80]:
#qq plot
sm.qqplot(reg_full_model.resid)
Out[80]:
Notebook Image
Notebook Image
In [81]:
pip install jovian
Requirement already satisfied: jovian in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (0.1.88) Requirement already satisfied: pyyaml in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from jovian) (5.1.2) Requirement already satisfied: uuid in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from jovian) (1.30) Requirement already satisfied: requests in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from jovian) (2.22.0) Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from requests->jovian) (3.0.4) Requirement already satisfied: certifi>=2017.4.17 in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from requests->jovian) (2019.6.16) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from requests->jovian) (1.25.6) Requirement already satisfied: idna<2.9,>=2.5 in /Users/aakanksha/miniconda3/envs/LR/lib/python3.7/site-packages (from requests->jovian) (2.8) Note: you may need to restart the kernel to use updated packages.
In [82]:
import jovian as jvn
In [ ]:
jvn.commit()
[jovian] Saving notebook..
In [ ]: