Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
######################
## Import libraries ##
######################
import numpy as np
import pandas as pd
np.random.seed(123456789)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
##################################
## function to simulate dataset ##
##################################
def simulate_df(n=200, seed=123456):
np.random.seed(seed)
## specify dataframe
df = pd.DataFrame()
df['A'] = np.random.choice(int(n/2)*[0] + int(n/2)*[1], size=n, replace=False)
## specify variables L1 through L6
L1_split = 0.52
L2_split = 0.23
L3_split = 0.38
df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
df['L4'] = np.random.normal(0, 1, df.shape[0])
df['L5'] = np.random.normal(0, 0.75, df.shape[0])
df['L6'] = np.random.normal(0, 2, df.shape[0])
df_A0 = df.copy()
df_A0['A'] = 0
df_A1 = df.copy()
df_A1['A'] = 1
theta_0 = -0.5
theta_1 = 2.1
theta_2 = 0.28
theta_3 = 0.42
theta_4 = 0.32
theta_5 = -0.15
theta_6 = 0.12
theta_7 = -0.29
df['Y'] = theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2']) + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5']) + (theta_7*df['L6']) + np.random.normal(0, 1, df.shape[0])
model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
print(model_adjusted.summary())
Y = df[['Y']]
X = df.drop(['Y'], axis=1)
X_labels = list(X.columns)
X['intercept'] = 1
X = X[['intercept'] + X_labels]
return(df, Y, X)
#####################################
## function to calculate Wald Test ##
#####################################
def wald_test(Y, X):
df = pd.DataFrame()
df['coeff'] = X.columns
df['chi-square statisitic'] = None
df['p-value'] = None
n = X.shape[0]
g = np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1])).dot(X.T).dot(Y)
Y_hat = np.dot(X, g)
sigma_squared = (1/(n-X.shape[1]))*((Y-Y_hat)**2).sum().values[0]
g_variance = sigma_squared * np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1]))
for i in range(0, X.shape[1]):
J_g_null = np.zeros(X.shape[1]).reshape(1, X.shape[1])
J_g_null[0, i] = 1
denom_term = np.dot(J_g_null, g_variance).dot(J_g_null.T)
denom_term = np.linalg.solve(denom_term, np.eye(denom_term.shape[1]))
W = g[i, 0] * denom_term * g[i, 0]
df.loc[i, 'chi-square statisitic'] = W[0,0]
df.loc[i, 'p-value'] = stats.chi2.sf(W[0,0] , 1)
del J_g_null, denom_term, W
return(df)
######################################
## function to calculate Score Test ##
######################################
def score_test(Y, X):
df = pd.DataFrame()
df['coeff'] = X.columns
df['chi-square statisitic'] = None
df['p-value'] = None
n = X.shape[0]
g = np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1])).dot(X.T).dot(Y)
Y_hat = np.dot(X, g)
sigma_squared = (1/(n-X.shape[1]))*((Y-Y_hat)**2).sum().values[0]
for i in range(0, df.shape[0]):
X_sub = X.drop([X.columns[i]], axis=1)
OLS_estimator = np.linalg.inv(np.dot(X_sub.T, X_sub)).dot(X_sub.T).dot(Y)
Beta = np.zeros((X.shape[1], 1))
Beta[0:i, 0] = OLS_estimator[0:i, 0]
Beta[i+1:, 0] = OLS_estimator[i:, 0]
score = np.dot(X.T, Y) - np.dot(X.T, X).dot(Beta)
Fisher_Information = (1/sigma_squared) * np.dot(X.T, X)
test_statistic = np.dot(score.T, np.linalg.inv(Fisher_Information)).dot(score)
p_value = 1 - stats.chi2.cdf(test_statistic , 1)
df.loc[i, 'chi-square statisitic'] = test_statistic[0][0]
df.loc[i, 'p-value'] = p_value[0][0]
del X_sub, OLS_estimator, Beta, score, Fisher_Information, test_statistic, p_value
return(df)
#################################################
## function to calculate Likelihood Ratio Test ##
#################################################
def LR_test(Y, X):
df = pd.DataFrame()
df['coeff'] = X.columns
df['chi-square statisitic'] = None
df['p-value'] = None
Beta_fm = np.linalg.solve(np.dot(X.T, X), np.eye(X.shape[1])).dot(X.T).dot(Y)
LR_fm = -(1/2)*np.dot((Y - np.dot(X, Beta_fm)).T, (Y - np.dot(X, Beta_fm)))[0][0]
for i in range(0, X.shape[1]):
X_sub = X.drop([X.columns[i]], axis=1)
Beta_nm = np.linalg.solve(np.dot(X_sub.T, X_sub), np.eye(X_sub.shape[1])).dot(X_sub.T).dot(Y)
LR_nm = -(1/2)*np.dot((Y - np.dot(X_sub, Beta_nm)).T, (Y - np.dot(X_sub, Beta_nm)))[0][0]
df.loc[i, 'chi-square statisitic'] = 2*(LR_fm - LR_nm)
df.loc[i, 'p-value'] = 1 - stats.chi2.cdf(df.loc[i, 'chi-square statisitic'] , 1)
del X_sub, Beta_nm, LR_nm
return(df)