Learn practical skills, build real-world projects, and advance your career
####################
## load libraries ##
####################
import numpy as np
import pandas as pd
np.random.seed(123456789)
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm
##############################################
## Function for simulating Regression Model ##
##############################################
def simulate_df(n=1000, glm_type='logistic', seed=123456):
    np.random.seed(seed)
    df = pd.DataFrame()
    ## specify variables L1 through L6
    L1_split = 0.52
    L2_split = 0.23
    L3_split = 0.38
    df['L0'] = n*[1]
    df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
    df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
    df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
    df['L4'] = np.random.normal(0, 1, df.shape[0])
    df['L5'] = np.random.normal(0, 0.75, df.shape[0])
    df['L6'] = np.random.normal(0, 2, df.shape[0])
    df['L4'] = df['L4'] / max(abs(df['L4']))
    df['L5'] = df['L5'] / max(abs(df['L5']))
    df['L6'] = df['L6'] / max(abs(df['L6']))
    
    ## define beta parameters
    beta_0 = -0.3
    beta_1 = -1.58
    beta_2 = 1.75
    beta_3 = 0.42
    beta_4 = 1.32
    beta_5 = -1.15
    beta_6 = 1.12

    if(glm_type=='logistic'):
        df['Z'] = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
        df['p'] = 1 / (1 + np.exp(-df['Z']))
        df['Y'] = np.random.binomial(1, df['p'])
        model = smf.glm('Y ~ L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
    
    if(glm_type=='poisson'):
        df['Z'] = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
        df['p'] = np.exp(df['Z'])
        df['Y'] = None
        for i in range(0, df.shape[0]):
            df.loc[i, 'Y'] = np.random.poisson(df.loc[i, 'p'], 1)
        df['Y'] = df['Y'].astype(float)
        model = smf.glm('Y ~ L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Poisson()).fit()
    
    if(glm_type=='probit'):
        df['Z'] = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
        df['p'] = norm.cdf(df['Z'])
        df['Y'] = np.random.binomial(1, df['p'])
        model = statsmodels.discrete.discrete_model.Probit(df['Y'], df[['L0', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']]).fit()
    
    return(df, model)
##########################
## Activation Functions ##
##########################
def sigmoid_activation(Z):
    p = 1 / (1 + np.exp(-Z))
    return(p)

def exponential_activation(Z):
    p = np.exp(Z)
    return(p)
    
def probit_activation(Z):
    p = norm.cdf(Z)
    return(p)
#######################################################################
## Function for Newton-Raphson and Fisher Scoring for Canonical GLMs ##
#######################################################################
def NewtonRaphson_FisherScoring_Canonical(df, glm_type='logistic', seed=123456):

    np.random.seed(seed)
    ## specify number of observations and number of features (including intercept)
    n = df.shape[0]
    m = 7
    if(glm_type=='logistic'):
        k = 10
    if(glm_type=='poisson'):
        k = 100
        
    ## specify the Target Y
    Y = np.array(df['Y']).reshape((n,1))
    ## specify design matrix X        
    X = np.array(df[['L0', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']])
    ## initialize Beta vector:
    Beta = np.random.normal(0, 0.5, m).reshape((m, 1))
        
    ## calculate the initial cost J
    if(glm_type=='logistic'):
        b_theta = np.log(1+np.exp(np.dot(X, Beta)))
    if(glm_type=='poisson'):
        b_theta = np.exp(np.dot(X, Beta))
    J = np.sum(b_theta - (np.dot(X, Beta) * Y))
        
    ## dataframe to hold algo results
    df_algo_results = pd.DataFrame()
    df_algo_results['k'] = list(range(0, k+1))
    df_algo_results['J'] = None
    df_algo_results.loc[0, 'J'] = J
    del b_theta, J
        
    for i in range(0, k):    
        ## get linear predictor Z
        Z = np.dot(X, Beta)
        ## get prediction p
        if(glm_type=='logistic'):
            p = sigmoid_activation(Z)
        if(glm_type=='poisson'):
            p = exponential_activation(Z)
        ## get variance matrix V
        V = np.diag(((p-Y)**2).squeeze(axis=1))

        ## calculate the Hessian
        Hessian = np.linalg.solve(np.dot(X.T, V).dot(X), np.eye(m))
        
        ## calculate dJ_Beta
        dJ_Beta = np.dot(X.T, (p-Y))
        
        ## get the updated Beta
        Beta = Beta - np.dot(Hessian, dJ_Beta)
            
        ## record cost function
        if(glm_type=='logistic'):
            b_theta = np.log(1+np.exp(np.dot(X, Beta)))
        if(glm_type=='poisson'):
            b_theta = np.exp(np.dot(X, Beta))
        J = np.sum(b_theta - (np.dot(X, Beta) * Y))
        df_algo_results.loc[i+1, 'J'] = J
            
        del J, b_theta, Z, p, V, Hessian, dJ_Beta
    
    return(Beta, df_algo_results)
##########################################
## Function for IRLS for Canonical GLMs ##
##########################################
def IRLS_Canonical(df, glm_type='logistic', seed=123456):

    np.random.seed(seed)
    ## specify number of observations and number of features (including intercept)
    n = df.shape[0]
    m = 7
    if(glm_type=='logistic'):
        k = 10
    if(glm_type=='poisson'):
        k = 100
        
    ## specify the Target Y
    Y = np.array(df['Y']).reshape((n,1))
    ## specify design matrix X        
    X = np.array(df[['L0', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']])
    ## initialize Beta vector:
    Beta = np.random.normal(0, 0.5, m).reshape((m, 1))
        
    ## calculate the initial cost J
    if(glm_type=='logistic'):
        b_theta = np.log(1+np.exp(np.dot(X, Beta)))
    if(glm_type=='poisson'):
        b_theta = np.exp(np.dot(X, Beta))
    J = np.sum(b_theta - (np.dot(X, Beta) * Y))
        
    ## dataframe to hold algo results
    df_algo_results = pd.DataFrame()
    df_algo_results['k'] = list(range(0, k+1))
    df_algo_results['J'] = None
    df_algo_results.loc[0, 'J'] = J
    del b_theta, J
        
    for i in range(0, k):    
        ## get linear predictor Z
        Z = np.dot(X, Beta)
        ## get prediction p
        if(glm_type=='logistic'):
            p = sigmoid_activation(Z)
        if(glm_type=='poisson'):
            p = exponential_activation(Z)
        ## get variance matrix V
        V = np.diag(((p-Y)**2).squeeze(axis=1))
        
        ## get matrix W_inv and Y_til
        W_inv = V
        Y_til = 1 / (p-Y)
            
        ## calculate WLS estimator
        WLS = np.linalg.solve(np.dot(X.T, W_inv).dot(X), np.eye(m)).dot(X.T).dot(W_inv).dot(Y_til)            
            
        ## get the updated Beta
        Beta = Beta - WLS
            
        ## record cost function
        if(glm_type=='logistic'):
            b_theta = np.log(1+np.exp(np.dot(X, Beta)))
        if(glm_type=='poisson'):
            b_theta = np.exp(np.dot(X, Beta))
        J = np.sum(b_theta - (np.dot(X, Beta) * Y))
        df_algo_results.loc[i+1, 'J'] = J
            
        del J, b_theta, Z, p, V, W_inv, Y_til, WLS
    
    return(Beta, df_algo_results)