Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
####################
## load libraries ##
####################
import numpy as np
import pandas as pd
np.random.seed(12345)  # global seed; NOTE: both functions below call np.random.seed themselves, so this only affects ad-hoc draws made outside them
import statsmodels.api as sm
import statsmodels.formula.api as smf
##########################################################################
## simulate dataset and analysis (with and without covariate adjustment) ##
##########################################################################
def simulate_df(n=200, outcome_type='normal', A_on_Y=2, data_only=False, seed=123456):
    """Simulate a balanced randomized experiment and (optionally) analyze it.

    Draws ``n`` observations with a balanced binary treatment ``A`` (exactly
    n/2 treated), three binary covariates ``L1``-``L3``, three normal
    covariates ``L4``-``L6``, and an outcome ``Y`` generated from a linear
    (or logit-linear) structural model in which the true effect of ``A`` is
    ``A_on_Y``. Unless ``data_only`` is True, it also fits unadjusted and
    covariate-adjusted models and prints the standardization (g-formula)
    estimate of the mean causal effect difference.

    Parameters
    ----------
    n : int
        Sample size; n/2 units are treated and n/2 are control.
    outcome_type : {'normal', 'binomial'}
        Continuous outcome with N(0, 1) noise, or binary outcome drawn
        from a logistic model.
    A_on_Y : float
        True coefficient of ``A`` (identity scale for 'normal', logit
        scale for 'binomial').
    data_only : bool
        If True, return the simulated DataFrame without fitting or
        printing any models.
    seed : int
        NumPy RNG seed for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The simulated data. For 'binomial' it also carries the
        intermediate linear predictor ``Z`` and probability ``p`` columns.

    Raises
    ------
    ValueError
        If ``outcome_type`` is not one of the supported values.
    """
    # Fail fast on an unsupported outcome type (previously this fell
    # through and raised a confusing NameError further down).
    if outcome_type not in ('normal', 'binomial'):
        raise ValueError("outcome_type must be 'normal' or 'binomial'")
    np.random.seed(seed)
    ## specify dataframe
    df = pd.DataFrame()
    # balanced treatment assignment: sample n/2 zeros and n/2 ones without replacement
    df['A'] = np.random.choice(int(n/2)*[0] + int(n/2)*[1], size=n, replace=False)
    ## specify variables L1 through L6; each *_split is P(L == 0)
    L1_split = 0.52
    L2_split = 0.23
    L3_split = 0.38
    df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
    df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
    df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
    df['L4'] = np.random.normal(0, 1, df.shape[0])
    df['L5'] = np.random.normal(0, 0.75, df.shape[0])
    df['L6'] = np.random.normal(0, 2, df.shape[0])
    # true structural coefficients; theta_1 is the causal effect of interest
    theta_0 = -0.5
    theta_1 = A_on_Y
    theta_2 = 0.28
    theta_3 = 0.42
    theta_4 = 0.32
    theta_5 = -0.15
    theta_6 = 0.12
    theta_7 = -0.29
    # shared linear predictor (identity scale for 'normal', logit scale for 'binomial')
    linpred = theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2']) + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5']) + (theta_7*df['L6'])
    if outcome_type == 'normal':
        df['Y'] = linpred + np.random.normal(0, 1, df.shape[0])
    else:  # 'binomial'
        df['Z'] = linpred
        df['p'] = 1 / (1 + np.exp(-df['Z']))
        df['Y'] = np.random.binomial(1, df['p'])
    if data_only:
        return df
    # fit unadjusted and covariate-adjusted models and print their summaries
    if outcome_type == 'normal':
        model_unadjusted = smf.ols('Y ~ A', data=df).fit()
        print(model_unadjusted.summary())
        model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
        print(model_adjusted.summary())
        print('The sample variance is: ' + str(df['Y'].var()))
    else:  # 'binomial'
        model_unadjusted = smf.glm('Y ~ A', data=df, family=sm.families.Binomial()).fit()
        print(model_unadjusted.summary())
        model_adjusted = smf.glm('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
        print(model_adjusted.summary())
    # counterfactual copies: the whole sample set to untreated / treated
    # (built only when needed, i.e. not on the data_only early-return path)
    df_A0 = df.copy()
    df_A0['A'] = 0
    df_A1 = df.copy()
    df_A1['A'] = 1
    df_A0['Y_pred_unadjusted'] = model_unadjusted.predict(df_A0)
    df_A1['Y_pred_unadjusted'] = model_unadjusted.predict(df_A1)
    df_A0['Y_pred_adjusted'] = model_adjusted.predict(df_A0)
    df_A1['Y_pred_adjusted'] = model_adjusted.predict(df_A1)
    # standardization estimate: mean predicted outcome under A=1 minus under A=0
    print('The unadjusted Causal Effect Difference is: ' + str(df_A1['Y_pred_unadjusted'].mean() - df_A0['Y_pred_unadjusted'].mean()))
    print('The adjusted Causal Effect Difference is: ' + str(df_A1['Y_pred_adjusted'].mean() - df_A0['Y_pred_adjusted'].mean()))
    return df
######################
## Power Simulation ##
######################
def power_analysis(df_original, outcome_type='normal', m=10, alpha=0.05, A_on_Y=0.5, seed=123456):
    """Monte Carlo power analysis, with and without covariate adjustment.

    Simulates ``m`` fresh datasets of the same size as ``df_original``
    (only its row count is used), fits the unadjusted and adjusted models
    on each, and estimates power as the proportion of replicates whose
    p-value for ``A`` is <= ``alpha``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Reference dataset; only ``df_original.shape[0]`` (the sample
        size) is used.
    outcome_type : {'normal', 'binomial'}
        Passed through to ``simulate_df``; selects OLS or logistic GLM.
    m : int
        Number of Monte Carlo replicates.
    alpha : float
        Significance threshold for rejecting H0: no effect of A.
    A_on_Y : float
        True effect of A used to generate each replicate.
    seed : int
        NOTE(review): replicate i reseeds with seed=i inside the loop,
        so this argument has no effect on the results; kept (and seeded)
        only to preserve the original interface and recorded output.

    Returns
    -------
    tuple of (float, float)
        (unadjusted_power, adjusted_power). The original printed these
        without returning; the return value is a backward-compatible
        addition.

    Raises
    ------
    ValueError
        If ``outcome_type`` is not one of the supported values.
    """
    if outcome_type not in ('normal', 'binomial'):
        raise ValueError("outcome_type must be 'normal' or 'binomial'")
    np.random.seed(seed)
    p_unadjusted = np.empty(m, dtype=float)
    p_adjusted = np.empty(m, dtype=float)
    for i in range(m):
        # each replicate is an independent simulation at the same sample size
        df = simulate_df(n=df_original.shape[0], outcome_type=outcome_type, A_on_Y=A_on_Y, data_only=True, seed=i)
        if outcome_type == 'normal':
            model_unadjusted = smf.ols('Y ~ A', data=df).fit()
            model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
        else:  # 'binomial'
            model_unadjusted = smf.glm('Y ~ A', data=df, family=sm.families.Binomial()).fit()
            model_adjusted = smf.glm('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
        p_unadjusted[i] = model_unadjusted.pvalues['A']
        p_adjusted[i] = model_adjusted.pvalues['A']
    # power = share of replicates rejecting at level alpha (p <= alpha),
    # written as 1 - share of failures to match the original arithmetic exactly
    unadjusted_power = 1 - int((p_unadjusted > alpha).sum()) / m
    adjusted_power = 1 - int((p_adjusted > alpha).sum()) / m
    print('The unadjusted power is: ' + str(unadjusted_power))
    print('The adjusted power is: ' + str(adjusted_power))
    return unadjusted_power, adjusted_power
#############################
## Continuous Outcome Case ##
#############################
## Recover unbiased estimate of Mean Causal Effect Difference, with and without covariate adjustment:
df_normal = simulate_df(n=134, outcome_type='normal', A_on_Y=0.5, data_only=False, seed=1081565)
## conduct power analysis, both with and without covariate adjustment:
## NOTE: m=10000 replicates, each fitting two OLS models — expect this to take a while
power_analysis(df_normal, outcome_type='normal', m=10000, alpha=0.05, A_on_Y=0.5, seed=123)
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.057
Model: OLS Adj. R-squared: 0.049
Method: Least Squares F-statistic: 7.911
Date: Mon, 12 Apr 2021 Prob (F-statistic): 0.00566
Time: 00:17:52 Log-Likelihood: -207.28
No. Observations: 134 AIC: 418.6
Df Residuals: 132 BIC: 424.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.2847 0.140 2.035 0.044 0.008 0.561
A 0.5565 0.198 2.813 0.006 0.165 0.948
==============================================================================
Omnibus: 5.256 Durbin-Watson: 1.872
Prob(Omnibus): 0.072 Jarque-Bera (JB): 5.295
Skew: -0.310 Prob(JB): 0.0708
Kurtosis: 3.751 Cond. No. 2.62
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.412
Model: OLS Adj. R-squared: 0.379
Method: Least Squares F-statistic: 12.60
Date: Mon, 12 Apr 2021 Prob (F-statistic): 3.55e-12
Time: 00:17:52 Log-Likelihood: -175.63
No. Observations: 134 AIC: 367.3
Df Residuals: 126 BIC: 390.5
Df Model: 7
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.6208 0.212 -2.924 0.004 -1.041 -0.201
A 0.5069 0.161 3.140 0.002 0.187 0.826
L1 0.0920 0.161 0.572 0.569 -0.227 0.411
L2 0.7655 0.202 3.783 0.000 0.365 1.166
L3 0.3778 0.165 2.286 0.024 0.051 0.705
L4 -0.0274 0.081 -0.340 0.735 -0.187 0.132
L5 0.0959 0.108 0.885 0.378 -0.119 0.310
L6 -0.2899 0.043 -6.716 0.000 -0.375 -0.204
==============================================================================
Omnibus: 0.277 Durbin-Watson: 1.833
Prob(Omnibus): 0.871 Jarque-Bera (JB): 0.205
Skew: 0.095 Prob(JB): 0.903
Kurtosis: 2.979 Cond. No. 6.48
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The sample variance is: 1.3793646703039655
The unadjusted Causal Effect Difference is: 0.5564637241021395
The adjusted Causal Effect Difference is: 0.5069038658712943
The unadjusted power is: 0.6735
The adjusted power is: 0.8023
#########################
## Binary Outcome Case ##
#########################
## Recover unbiased estimate of Mean Causal Effect Difference, with and without covariate adjustment:
df_binomial = simulate_df(n=652, outcome_type='binomial', A_on_Y=0.5, data_only=False, seed=12345678)
## conduct power analysis, both with and without covariate adjustment:
## NOTE: m=10000 replicates, each fitting two logistic GLMs — expect this to take a while
power_analysis(df_binomial, outcome_type='binomial', m=10000, alpha=0.05, A_on_Y=0.5, seed=123456)
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Y No. Observations: 652
Model: GLM Df Residuals: 650
Model Family: Binomial Df Model: 1
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -437.20
Date: Mon, 12 Apr 2021 Deviance: 874.40
Time: 00:26:45 Pearson chi2: 652.
No. Iterations: 4
Covariance Type: nonrobust
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.1970 0.111 1.769 0.077 -0.021 0.415
A 0.3832 0.160 2.389 0.017 0.069 0.698
==============================================================================
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Y No. Observations: 652
Model: GLM Df Residuals: 644
Model Family: Binomial Df Model: 7
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -392.91
Date: Mon, 12 Apr 2021 Deviance: 785.82
Time: 00:26:45 Pearson chi2: 651.
No. Iterations: 4
Covariance Type: nonrobust
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.5910 0.246 -2.401 0.016 -1.074 -0.109
A 0.4749 0.173 2.739 0.006 0.135 0.815
L1 0.4837 0.174 2.774 0.006 0.142 0.825
L2 0.4122 0.207 1.994 0.046 0.007 0.817
L3 0.3906 0.176 2.220 0.026 0.046 0.735
L4 -0.0240 0.083 -0.291 0.771 -0.186 0.138
L5 0.1518 0.118 1.282 0.200 -0.080 0.384
L6 -0.3961 0.051 -7.746 0.000 -0.496 -0.296
==============================================================================
The unadjusted Causal Effect Difference is: 0.0920245398772942
The adjusted Causal Effect Difference is: 0.09912605022292376
The unadjusted power is: 0.8187
The adjusted power is: 0.851