Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
######################
## import libraries ##
######################
import pandas as pd
import numpy as np
np.random.seed(108156571)
import statsmodels.api as sm
import statsmodels.formula.api as smf
def simulate_dataset(n=100000, C1=1):
## A: dichotomous
## C1: continuous (with squared term) -> only effect modification
## C2: discrete -> only confouding
A_split = 0.4
C2_split = 0.35
df_A1 = pd.DataFrame()
df_A1['A'] = np.ones(int(n*A_split))
df_A1['C2'] = np.random.uniform(0, 1, df_A1.shape[0])
df_A1.loc[df_A1['C2']<C2_split, 'C2'] = 0
df_A1.loc[df_A1['C2']>=C2_split, 'C2'] = 1
df_A0 = pd.DataFrame()
df_A0['A'] = np.zeros(int(n*(1-A_split)))
df_A0['C2'] = np.random.uniform(0, 1, df_A0.shape[0])
df_A0.loc[df_A0['C2']<(1-C2_split), 'C2'] = 0
df_A0.loc[df_A0['C2']>=(1-C2_split), 'C2'] = 1
df = pd.concat([df_A0, df_A1], axis=0).reset_index(drop=True)
df['C1'] = np.random.normal(2.5, 1, df.shape[0])
df['C1_squared'] = df['C1']*df['C1']
df['C1_sin'] = 3*np.sin(df['C1'])
df['C1_squared_sin_interaction'] = df['C1_squared']*df['C1_sin']
df['error'] = np.random.normal(0, 0.25, df.shape[0])
B0 = -0.23
B1 = 1.23
B2 = 1.56
B3 = 1.42
B4 = 2.31
B5 = -0.43
B6 = 0.52
B7 = 1.09
df['Y'] = B0 + (B1*df['A']) + (B2*df['C1']) + (B3*df['C1_squared']) + (B4*df['C1_sin']) + (B5*df['C1_squared_sin_interaction']) + (B6*df['C2']) + (B7*df['A']*df['C1']) + df['error']
print("In the marginal Causal DAG, there is confounding by C2 and effect modification by C1 \n")
print("The true outcome model is:")
print('E[Y|A,C1,C2] = ' + str(B0) + ' + ' + str(B1) + '*A + ' + str(B2) + '*C1 + ' + str(B3) + '*C1^2 + ' + str(B4) + '*sin(C1) + ' + str(B5) + '*C1^2*sin(C1) + ' + str(B6) + '*C2 + ' + str(B7) + '*A*C1')
print('\n')
print('The true Mean Causal Effect Difference under intervention a=1 and a=0 within levels of C1=' + str(C1) + ' is:')
print('E[Y(a=1)-Y(a=0)|C1=' + str(C1) + '] = ' + str(B1 + (B7*C1)))
return(df)
## get simulated dataset
df_original = simulate_dataset(n=200000, C1=0.5)
In the marginal Causal DAG, there is confounding by C2 and effect modification by C1
The true outcome model is:
E[Y|A,C1,C2] = -0.23 + 1.23*A + 1.56*C1 + 1.42*C1^2 + 2.31*sin(C1) + -0.43*C1^2*sin(C1) + 0.52*C2 + 1.09*A*C1
The true Mean Causal Effect Difference under intervention a=1 and a=0 within levels of C1=0.5 is:
E[Y(a=1)-Y(a=0)|C1=0.5] = 1.775
###############################################
## fit and return standardized outcome model ##
###############################################
def fit_standardized_outcome_model(df, model_string, C1, fit_bounds=True, print_result=True):
df_original = df.copy()
df_A0 = df_original.copy()
df_A1 = df_original.copy()
df_A0['A'] = 0
df_A0['C1'] = C1
df_A0['C1_squared'] = df_A0['C1']*df_A0['C1']
df_A0['C1_sin'] = 3*np.sin(df_A0['C1'])
df_A0['C1_squared_sin_interaction'] = df_A0['C1_squared']*df_A0['C1_sin']
df_A1['A'] = 1
df_A1['C1'] = C1
df_A1['C1_squared'] = df_A1['C1']*df_A1['C1']
df_A1['C1_sin'] = 3*np.sin(df_A1['C1'])
df_A1['C1_squared_sin_interaction'] = df_A1['C1_squared']*df_A1['C1_sin']
outcome_model = smf.ols(formula=model_string, data=df).fit()
df_A0['Y_fit'] = outcome_model.predict(df_A0)
df_A1['Y_fit'] = outcome_model.predict(df_A1)
mean_unexposed = df_A0['Y_fit'].mean()
mean_exposed = df_A1['Y_fit'].mean()
effect_difference = mean_exposed - mean_unexposed
if(fit_bounds):
effect_difference_list = []
for i in range(0, 100):
df_sub = df_original.sample(n=df_original.shape[0], replace=True)
df_A0_sub = df_sub.copy()
df_A1_sub = df_sub.copy()
df_A0_sub['A'] = 0
df_A0_sub['C1'] = C1
df_A0_sub['C1_squared'] = df_A0_sub['C1']*df_A0_sub['C1']
df_A0_sub['C1_sin'] = 3*np.sin(df_A0_sub['C1'])
df_A0_sub['C1_squared_sin_interaction'] = df_A0_sub['C1_squared']*df_A0_sub['C1_sin']
df_A1_sub['A'] = 1
df_A1_sub['C1'] = C1
df_A1_sub['C1_squared'] = df_A1_sub['C1']*df_A1_sub['C1']
df_A1_sub['C1_sin'] = 3*np.sin(df_A1_sub['C1'])
df_A1_sub['C1_squared_sin_interaction'] = df_A1_sub['C1_squared']*df_A1_sub['C1_sin']
OM_sub = smf.ols(formula=model_string, data=df_sub).fit()
df_A0_sub['Y_fit'] = OM_sub.predict(df_A0_sub)
df_A1_sub['Y_fit'] = OM_sub.predict(df_A1_sub)
mean_unexposed_sub = df_A0_sub['Y_fit'].mean()
mean_exposed_sub = df_A1_sub['Y_fit'].mean()
effect_difference_sub = mean_exposed_sub - mean_unexposed_sub
effect_difference_list.append(effect_difference_sub)
del df_sub, df_A0_sub, df_A1_sub, OM_sub, mean_unexposed_sub, mean_exposed_sub, effect_difference_sub
variance = np.var(effect_difference_list)
standard_error = np.sqrt(variance)
bound_size = 1.96*standard_error
lower_bound = effect_difference - bound_size
upper_bound = effect_difference + bound_size
if(print_result):
print(outcome_model.summary())
print("The standardized mean in those with A=0 and C1=" + str(C1) +" is: " + str(mean_unexposed))
print("The standardized mean in those with A=1 and C1=" + str(C1) +" is: " + str(mean_exposed))
if(fit_bounds):
print("The standardized effect difference with C1=" + str(C1) +" is " + str(effect_difference) + " with 95% CI (" + str(lower_bound) + ' , ' + str(upper_bound) + ')')
else:
print("The standardized effect difference with C1=" + str(C1) + " is " + str(effect_difference))
return(df, outcome_model)
## Standardization via the g-formula with correctly specified Outcome model
_, OM_CS = fit_standardized_outcome_model(df_original.copy(), model_string='Y ~ A + C1 + C1_squared + C1_sin + C1_squared_sin_interaction + C2 + A*C1', C1=0.5)
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 1.000
Model: OLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 7.241e+07
Date: Tue, 15 Dec 2020 Prob (F-statistic): 0.00
Time: 12:30:44 Log-Likelihood: -6440.0
No. Observations: 200000 AIC: 1.290e+04
Df Residuals: 199992 BIC: 1.298e+04
Df Model: 7
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept -0.2341 0.004 -65.471 0.000 -0.241 -0.227
A 1.2305 0.003 397.969 0.000 1.224 1.237
C1 1.5610 0.004 434.572 0.000 1.554 1.568
C1_squared 1.4201 0.001 1462.192 0.000 1.418 1.422
C1_sin 2.3105 0.001 2671.685 0.000 2.309 2.312
C1_squared_sin_interaction -0.4299 9.75e-05 -4409.689 0.000 -0.430 -0.430
C2 0.5202 0.001 444.135 0.000 0.518 0.523
A:C1 1.0897 0.001 956.363 0.000 1.087 1.092
==============================================================================
Omnibus: 0.658 Durbin-Watson: 1.993
Prob(Omnibus): 0.720 Jarque-Bera (JB): 0.655
Skew: 0.004 Prob(JB): 0.721
Kurtosis: 3.002 Cond. No. 155.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The standardized mean in those with A=0 and C1=0.5 is: 4.314202877691639
The standardized mean in those with A=1 and C1=0.5 is: 6.089578604476575
The standardized effect difference with C1=0.5 is 1.7753757267849357 with 95% CI (1.770226691240206 , 1.7805247623296654)