Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
####################
## load libraries ##
####################
import numpy as np
import pandas as pd
np.random.seed(123456789)
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm
##############################################
## Function for simulating Regression Model ##
##############################################
def simulate_df(n=1000, glm_type='logistic', seed=123456):
np.random.seed(seed)
df = pd.DataFrame()
## specify variables L1 through L6
L1_split = 0.52
L2_split = 0.23
L3_split = 0.38
df['L0'] = n*[1]
df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
df['L4'] = np.random.normal(0, 1, df.shape[0])
df['L5'] = np.random.normal(0, 0.75, df.shape[0])
df['L6'] = np.random.normal(0, 2, df.shape[0])
df['L4'] = df['L4'] / max(abs(df['L4']))
df['L5'] = df['L5'] / max(abs(df['L5']))
df['L6'] = df['L6'] / max(abs(df['L6']))
## define beta parameters
beta_0 = -0.3
beta_1 = -1.58
beta_2 = 1.75
beta_3 = 0.42
beta_4 = 1.32
beta_5 = -1.15
beta_6 = 1.12
if(glm_type=='logistic'):
df['Z'] = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
df['p'] = 1 / (1 + np.exp(-df['Z']))
df['Y'] = np.random.binomial(1, df['p'])
model = smf.glm('Y ~ L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
if(glm_type=='poisson'):
df['Z'] = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
df['p'] = np.exp(df['Z'])
df['Y'] = None
for i in range(0, df.shape[0]):
df.loc[i, 'Y'] = np.random.poisson(df.loc[i, 'p'], 1)
df['Y'] = df['Y'].astype(float)
model = smf.glm('Y ~ L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Poisson()).fit()
if(glm_type=='probit'):
df['Z'] = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
df['p'] = norm.cdf(df['Z'])
df['Y'] = np.random.binomial(1, df['p'])
model = statsmodels.discrete.discrete_model.Probit(df['Y'], df[['L0', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']]).fit()
return(df, model)
##########################
## Activation Functions ##
##########################
def sigmoid_activation(Z):
p = 1 / (1 + np.exp(-Z))
return(p)
def exponential_activation(Z):
p = np.exp(Z)
return(p)
def probit_activation(Z):
p = norm.cdf(Z)
return(p)
#######################################################################
## Function for Newton-Raphson and Fisher Scoring for Canonical GLMs ##
#######################################################################
def NewtonRaphson_FisherScoring_Canonical(df, glm_type='logistic', seed=123456):
np.random.seed(seed)
## specify number of observations and number of features (including intercept)
n = df.shape[0]
m = 7
if(glm_type=='logistic'):
k = 10
if(glm_type=='poisson'):
k = 100
## specify the Target Y
Y = np.array(df['Y']).reshape((n,1))
## specify design matrix X
X = np.array(df[['L0', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']])
## initialize Beta vector:
Beta = np.random.normal(0, 0.5, m).reshape((m, 1))
## calculate the initial cost J
if(glm_type=='logistic'):
b_theta = np.log(1+np.exp(np.dot(X, Beta)))
if(glm_type=='poisson'):
b_theta = np.exp(np.dot(X, Beta))
J = np.sum(b_theta - (np.dot(X, Beta) * Y))
## dataframe to hold algo results
df_algo_results = pd.DataFrame()
df_algo_results['k'] = list(range(0, k+1))
df_algo_results['J'] = None
df_algo_results.loc[0, 'J'] = J
del b_theta, J
for i in range(0, k):
## get linear predictor Z
Z = np.dot(X, Beta)
## get prediction p
if(glm_type=='logistic'):
p = sigmoid_activation(Z)
if(glm_type=='poisson'):
p = exponential_activation(Z)
## get variance matrix V
V = np.diag(((p-Y)**2).squeeze(axis=1))
## calculate the Hessian
Hessian = np.linalg.solve(np.dot(X.T, V).dot(X), np.eye(m))
## calculate dJ_Beta
dJ_Beta = np.dot(X.T, (p-Y))
## get the updated Beta
Beta = Beta - np.dot(Hessian, dJ_Beta)
## record cost function
if(glm_type=='logistic'):
b_theta = np.log(1+np.exp(np.dot(X, Beta)))
if(glm_type=='poisson'):
b_theta = np.exp(np.dot(X, Beta))
J = np.sum(b_theta - (np.dot(X, Beta) * Y))
df_algo_results.loc[i+1, 'J'] = J
del J, b_theta, Z, p, V, Hessian, dJ_Beta
return(Beta, df_algo_results)
##########################################
## Function for IRLS for Canonical GLMs ##
##########################################
def IRLS_Canonical(df, glm_type='logistic', seed=123456):
np.random.seed(seed)
## specify number of observations and number of features (including intercept)
n = df.shape[0]
m = 7
if(glm_type=='logistic'):
k = 10
if(glm_type=='poisson'):
k = 100
## specify the Target Y
Y = np.array(df['Y']).reshape((n,1))
## specify design matrix X
X = np.array(df[['L0', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']])
## initialize Beta vector:
Beta = np.random.normal(0, 0.5, m).reshape((m, 1))
## calculate the initial cost J
if(glm_type=='logistic'):
b_theta = np.log(1+np.exp(np.dot(X, Beta)))
if(glm_type=='poisson'):
b_theta = np.exp(np.dot(X, Beta))
J = np.sum(b_theta - (np.dot(X, Beta) * Y))
## dataframe to hold algo results
df_algo_results = pd.DataFrame()
df_algo_results['k'] = list(range(0, k+1))
df_algo_results['J'] = None
df_algo_results.loc[0, 'J'] = J
del b_theta, J
for i in range(0, k):
## get linear predictor Z
Z = np.dot(X, Beta)
## get prediction p
if(glm_type=='logistic'):
p = sigmoid_activation(Z)
if(glm_type=='poisson'):
p = exponential_activation(Z)
## get variance matrix V
V = np.diag(((p-Y)**2).squeeze(axis=1))
## get matrix W_inv and Y_til
W_inv = V
Y_til = 1 / (p-Y)
## calculate WLS estimator
WLS = np.linalg.solve(np.dot(X.T, W_inv).dot(X), np.eye(m)).dot(X.T).dot(W_inv).dot(Y_til)
## get the updated Beta
Beta = Beta - WLS
## record cost function
if(glm_type=='logistic'):
b_theta = np.log(1+np.exp(np.dot(X, Beta)))
if(glm_type=='poisson'):
b_theta = np.exp(np.dot(X, Beta))
J = np.sum(b_theta - (np.dot(X, Beta) * Y))
df_algo_results.loc[i+1, 'J'] = J
del J, b_theta, Z, p, V, W_inv, Y_til, WLS
return(Beta, df_algo_results)