Learn practical skills, build real-world projects, and advance your career
Updated 3 years ago
######################
## import libraries ##
######################
import pandas as pd
import numpy as np
np.random.seed(10815657)
import statsmodels.api as sm
import statsmodels.formula.api as smf
#########################################
## generate simulated dataset function ##
#########################################
def simulate_dataset(n=100000):
A_split = 0.4
C1_split = 0.2
C2_split = 0.65
df_A1 = pd.DataFrame()
df_A1['A'] = np.ones(int(n*A_split))
df_A1['C1'] = np.random.uniform(0, 1, int(n*A_split))
df_A1['C2'] = np.random.uniform(0, 1, int(n*A_split))
df_A1.loc[df_A1['C1']<C1_split, 'C1'] = 0
df_A1.loc[df_A1['C1']>=C1_split, 'C1'] = 1
df_A1.loc[df_A1['C2']<C2_split, 'C2'] = 0
df_A1.loc[df_A1['C2']>=C2_split, 'C2'] = 1
df_A0 = pd.DataFrame()
df_A0['A'] = np.zeros(int(n*(1-A_split)))
df_A0['C1'] = np.random.uniform(0, 1, int(n*(1-A_split)))
df_A0['C2'] = np.random.uniform(0, 1, int(n*(1-A_split)))
df_A0.loc[df_A0['C1']<(1-C1_split), 'C1'] = 0
df_A0.loc[df_A0['C1']>=(1-C1_split), 'C1'] = 1
df_A0.loc[df_A0['C2']<(1-C2_split), 'C2'] = 0
df_A0.loc[df_A0['C2']>=(1-C2_split), 'C2'] = 1
df = pd.concat([df_A0, df_A1], axis=0).reset_index(drop=True)
df['error'] = np.random.normal(0, 0.25, df.shape[0])
del df_A1, df_A0
B0 = -0.23
B1 = 1.23
B2 = 1.56
B3 = 0.87
df['Y'] = B0 + (B1*df['A']) + (B2*df['C1']) + (B3*df['C2']) + df['error']
print("The true outcome model is:")
print('E[Y|A,C1,C2] = ' + str(B0) + ' + ' + str(B1) + '*A + ' + str(B2) + '*C1 + ' + str(B3) + '*C2')
return(df)
## get simulated dataset
df_original = simulate_dataset(n=100000)
The true outcome model is:
E[Y|A,C1,C2] = -0.23 + 1.23*A + 1.56*C1 + 0.87*C2
###############################################
## fit and return standardized outcome model ##
###############################################
def fit_standardized_outcome_model(df, model_string, fit_bounds=True, print_result=True):
df_original = df.copy()
df_A0 = df_original.copy()
df_A1 = df_original.copy()
df_A0['A'] = 0
df_A1['A'] = 1
outcome_model = smf.ols(formula=model_string, data=df).fit()
df_A0['Y_fit'] = outcome_model.predict(df_A0)
df_A1['Y_fit'] = outcome_model.predict(df_A1)
mean_unexposed = df_A0['Y_fit'].mean()
mean_exposed = df_A1['Y_fit'].mean()
effect_difference = mean_exposed - mean_unexposed
if(fit_bounds):
effect_difference_list = []
for i in range(0, 100):
df_sub = df_original.sample(n=df_original.shape[0], replace=True)
df_A0_sub = df_sub.copy()
df_A1_sub = df_sub.copy()
df_A0_sub['A'] = 0
df_A1_sub['A'] = 1
OM_sub = smf.ols(formula=model_string, data=df_sub).fit()
df_A0_sub['Y_fit'] = OM_sub.predict(df_A0_sub)
df_A1_sub['Y_fit'] = OM_sub.predict(df_A1_sub)
mean_unexposed_sub = df_A0_sub['Y_fit'].mean()
mean_exposed_sub = df_A1_sub['Y_fit'].mean()
effect_difference_sub = mean_exposed_sub - mean_unexposed_sub
effect_difference_list.append(effect_difference_sub)
del df_sub, df_A0_sub, df_A1_sub, OM_sub, mean_unexposed_sub, mean_exposed_sub, effect_difference_sub
variance = np.var(effect_difference_list)
standard_error = np.sqrt(variance)
bound_size = 1.96*standard_error
lower_bound = effect_difference - bound_size
upper_bound = effect_difference + bound_size
if(print_result):
print(outcome_model.summary())
print("The standardized mean in those with A=0 is: " + str(mean_unexposed))
print("The standardized mean in those with A=1 is: " + str(mean_exposed))
if(fit_bounds):
print("The standardized effect difference is " + str(effect_difference) + " with 95% CI (" + str(lower_bound) + ' , ' + str(upper_bound) + ')')
else:
print("The standardized effect difference is " + str(effect_difference))
return(df, outcome_model)
## fit and return correctly specified outcome model
df_OM_CS, OM_CS = fit_standardized_outcome_model(df_original.copy(), 'Y ~ A + C1 + C2')
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.958
Model: OLS Adj. R-squared: 0.958
Method: Least Squares F-statistic: 7.620e+05
Date: Thu, 03 Dec 2020 Prob (F-statistic): 0.00
Time: 11:33:19 Log-Likelihood: -3353.7
No. Observations: 100000 AIC: 6715.
Df Residuals: 99996 BIC: 6753.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.2272 0.002 -147.829 0.000 -0.230 -0.224
A 1.2280 0.002 593.422 0.000 1.224 1.232
C1 1.5588 0.002 786.222 0.000 1.555 1.563
C2 0.8679 0.002 523.136 0.000 0.865 0.871
==============================================================================
Omnibus: 0.738 Durbin-Watson: 1.992
Prob(Omnibus): 0.691 Jarque-Bera (JB): 0.748
Skew: 0.000 Prob(JB): 0.688
Kurtosis: 2.987 Cond. No. 4.31
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The standardized mean in those with A=0 is: 0.9177235440004516
The standardized mean in those with A=1 is: 2.1456780075228004
The standardized effect difference is 1.2279544635223487 with 95% CI (1.2241617612334224 , 1.231747165811275)