######################
## Import libraries ##
######################
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

np.random.seed(123456789)
##################################
## function to simulate dataset ##
##################################
def simulate_df(n=200, seed=123456):
    np.random.seed(seed)
    ## specify dataframe with a balanced treatment indicator A (n/2 zeros, n/2 ones)
    df = pd.DataFrame()
    df['A'] = np.random.choice(int(n/2)*[0] + int(n/2)*[1], size=n, replace=False)

    ## specify covariates L1 through L6 (for the binary ones, each split is P(L = 0))
    L1_split = 0.52
    L2_split = 0.23
    L3_split = 0.38
    df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
    df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
    df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
    df['L4'] = np.random.normal(0, 1, df.shape[0])
    df['L5'] = np.random.normal(0, 0.75, df.shape[0])
    df['L6'] = np.random.normal(0, 2, df.shape[0])
    
    
    ## true coefficients of the outcome model
    theta_0 = -0.5
    theta_1 = 2.1
    theta_2 = 0.28
    theta_3 = 0.42
    theta_4 = 0.32
    theta_5 = -0.15
    theta_6 = 0.12
    theta_7 = -0.29
    
    ## generate the outcome Y from a linear model with standard normal noise
    df['Y'] = (theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2'])
               + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5'])
               + (theta_7*df['L6']) + np.random.normal(0, 1, df.shape[0]))

    ## fit the adjusted OLS model for reference
    model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
    print(model_adjusted.summary())
    
    ## build the response vector and the design matrix (intercept column first)
    Y = df[['Y']]
    X = df.drop(['Y'], axis=1)
    X_labels = list(X.columns)
    X['intercept'] = 1
    X = X[['intercept'] + X_labels]
    
    return df, Y, X
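#####################################################
## quick look at the simulated data (illustrative) ##
#####################################################
## A minimal sketch, assuming simulate_df() as defined above: it prints the
## adjusted OLS summary internally and returns the simulated dataframe together
## with the response vector and the intercept-augmented design matrix.
## The *_sim variable names below are illustrative, not part of the original.
df_sim, Y_sim, X_sim = simulate_df()
print(df_sim.head())
print(df_sim['A'].value_counts())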
#####################################
## function to calculate Wald Test ##
#####################################
def wald_test(Y, X):

    ## results table
    df = pd.DataFrame()
    df['coeff'] = X.columns
    df['chi-square statistic'] = None
    df['p-value'] = None
    
    ## OLS estimate, fitted values, residual variance, and coefficient covariance
    n = X.shape[0]
    g = np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1])).dot(X.T).dot(Y)
    Y_hat = np.dot(X, g)
    sigma_squared = (1/(n-X.shape[1]))*((Y-Y_hat)**2).sum().values[0]
    g_variance = sigma_squared * np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1]))
    
    
    for i in range(0, X.shape[1]):
        ## Jacobian of the restriction (selects the i-th coefficient under the null)
        J_g_null = np.zeros(X.shape[1]).reshape(1, X.shape[1])
        J_g_null[0, i] = 1
        
        denom_term = np.dot(J_g_null, g_variance).dot(J_g_null.T)
        denom_term = np.linalg.solve(denom_term, np.eye(denom_term.shape[1]))
        
        ## Wald statistic: squared estimate divided by its estimated variance
        W = g[i, 0] * denom_term * g[i, 0]
        df.loc[i, 'chi-square statistic'] = W[0, 0]
        df.loc[i, 'p-value'] = stats.chi2.sf(W[0, 0], 1)
        
        del J_g_null, denom_term, W
            
    return df
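################################################################
## sanity check for wald_test (illustrative, not in original) ##
################################################################
## A minimal sketch, assuming the functions above are defined: with the error
## variance estimated from the full model, each Wald chi-square statistic should
## equal the squared t-statistic that statsmodels reports for the same
## coefficient (up to floating-point error). The *_check names are illustrative.
df_check, Y_check, X_check = simulate_df()
fit_check = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df_check).fit()
print(wald_test(Y_check, X_check)['chi-square statistic'].values)
print((fit_check.tvalues**2).values)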
######################################
## function to calculate Score Test ##
######################################
def score_test(Y, X):
    
    ## results table
    df = pd.DataFrame()
    df['coeff'] = X.columns
    df['chi-square statistic'] = None
    df['p-value'] = None
    
    ## full-model OLS estimate and residual variance
    n = X.shape[0]
    g = np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1])).dot(X.T).dot(Y)
    Y_hat = np.dot(X, g)
    sigma_squared = (1/(n-X.shape[1]))*((Y-Y_hat)**2).sum().values[0]
    
    for i in range(0, df.shape[0]):
        ## restricted OLS fit with the i-th column of X dropped (null: that coefficient is 0)
        X_sub = X.drop([X.columns[i]], axis=1)
        OLS_estimator = np.linalg.inv(np.dot(X_sub.T, X_sub)).dot(X_sub.T).dot(Y)
        Beta = np.zeros((X.shape[1], 1))
        Beta[0:i, 0] = OLS_estimator[0:i, 0]
        Beta[i+1:, 0] = OLS_estimator[i:, 0]
        ## score vector and Fisher information evaluated at the restricted estimate
        score = np.dot(X.T, Y) - np.dot(X.T, X).dot(Beta)
        Fisher_Information = (1/sigma_squared) * np.dot(X.T, X)
        test_statistic = np.dot(score.T, np.linalg.inv(Fisher_Information)).dot(score)
        p_value = stats.chi2.sf(test_statistic, 1)
        df.loc[i, 'chi-square statistic'] = test_statistic[0][0]
        df.loc[i, 'p-value'] = p_value[0][0]
        del X_sub, OLS_estimator, Beta, score, Fisher_Information, test_statistic, p_value
    
    return df
#################################################
## function to calculate Likelihood Ratio Test ##
#################################################
def LR_test(Y, X):
    ## results table
    df = pd.DataFrame()
    df['coeff'] = X.columns
    df['chi-square statistic'] = None
    df['p-value'] = None
    
    ## full-model OLS fit and its log-likelihood up to an additive constant,
    ## treating the error variance as known and equal to 1 (as in the simulated data)
    Beta_fm = np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1])).dot(X.T).dot(Y)
    LR_fm = -(1/2)*np.dot((Y - np.dot(X, Beta_fm)).T, (Y - np.dot(X, Beta_fm)))[0][0]
    for i in range(0, X.shape[1]):
        ## restricted model with the i-th column of X dropped
        X_sub = X.drop([X.columns[i]], axis=1)
        Beta_nm = np.linalg.solve(np.dot(X_sub.T, X_sub), np.eye(X_sub.shape[1])).dot(X_sub.T).dot(Y)
        LR_nm = -(1/2)*np.dot((Y - np.dot(X_sub, Beta_nm)).T, (Y - np.dot(X_sub, Beta_nm)))[0][0]
        ## likelihood ratio statistic: twice the drop in log-likelihood, compared to chi-square(1)
        df.loc[i, 'chi-square statistic'] = 2*(LR_fm - LR_nm)
        df.loc[i, 'p-value'] = stats.chi2.sf(df.loc[i, 'chi-square statistic'], 1)
        del X_sub, Beta_nm, LR_nm
    
    return df
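#########################################################
## example usage (illustrative sketch, not in original) ##
#########################################################
## A minimal sketch showing how the three test functions might be run on the same
## simulated data; all three tests should broadly agree on which coefficients
## (most clearly A, with a true effect of 2.1) differ from zero.
df, Y, X = simulate_df(n=200, seed=123456)
print(wald_test(Y, X))
print(score_test(Y, X))
print(LR_test(Y, X))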