Learn practical skills, build real-world projects, and advance your career
######################
## import libraries ##
######################
import pandas as pd
import numpy as np    
np.random.seed(108156571)
import statsmodels.api as sm
import statsmodels.formula.api as smf
def simulate_dataset(n=100000, C1=1):
    
    ## A: dichotomous
    ## C1: continuous (with squared term) -> only effect modification
    ## C2: discrete -> only confouding
        
    A_split = 0.4
    C2_split = 0.35

    df_A1 = pd.DataFrame()
    df_A1['A'] = np.ones(int(n*A_split))
    df_A1['C2'] = np.random.uniform(0, 1, df_A1.shape[0])
    df_A1.loc[df_A1['C2']<C2_split, 'C2'] = 0
    df_A1.loc[df_A1['C2']>=C2_split, 'C2'] = 1
        
        
    df_A0 = pd.DataFrame()
    df_A0['A'] = np.zeros(int(n*(1-A_split)))
    df_A0['C2'] = np.random.uniform(0, 1, df_A0.shape[0])
    df_A0.loc[df_A0['C2']<(1-C2_split), 'C2'] = 0
    df_A0.loc[df_A0['C2']>=(1-C2_split), 'C2'] = 1
    
    df = pd.concat([df_A0, df_A1], axis=0).reset_index(drop=True) 
    df['C1'] = np.random.normal(2.5, 1, df.shape[0])
    df['C1_squared'] = df['C1']*df['C1']
    df['C1_sin'] = 3*np.sin(df['C1'])
    df['C1_squared_sin_interaction'] = df['C1_squared']*df['C1_sin']
    df['error'] = np.random.normal(0, 0.25, df.shape[0])     
    
    B0 = -0.23
    B1 = 1.23
    B2 = 1.56
    B3 = 1.42
    B4 = 2.31
    B5 = -0.43
    B6 = 0.52
    B7 = 1.09
    
    df['Y'] = B0 + (B1*df['A']) + (B2*df['C1']) + (B3*df['C1_squared']) + (B4*df['C1_sin']) + (B5*df['C1_squared_sin_interaction']) + (B6*df['C2']) + (B7*df['A']*df['C1']) + df['error']
    
    print("In the marginal Causal DAG, there is confounding by C2 and effect modification by C1 \n")
    print("The true outcome model is:")
    print('E[Y|A,C1,C2] = ' + str(B0) + ' + ' + str(B1) + '*A + ' + str(B2) + '*C1 + ' + str(B3) + '*C1^2 + ' + str(B4) + '*sin(C1) + ' + str(B5) + '*C1^2*sin(C1) + ' + str(B6) + '*C2 + ' + str(B7) + '*A*C1')
    print('\n')
    print('The true Mean Causal Effect Difference under intervention a=1 and a=0 within levels of C1=' + str(C1) + ' is:')
    print('E[Y(a=1)-Y(a=0)|C1=' + str(C1) + '] = ' + str(B1 + (B7*C1)))
   
    return(df)
## get simulated dataset
df_original = simulate_dataset(n=200000, C1=0.5)
In the marginal Causal DAG, there is confounding by C2 and effect modification by C1 The true outcome model is: E[Y|A,C1,C2] = -0.23 + 1.23*A + 1.56*C1 + 1.42*C1^2 + 2.31*sin(C1) + -0.43*C1^2*sin(C1) + 0.52*C2 + 1.09*A*C1 The true Mean Causal Effect Difference under intervention a=1 and a=0 within levels of C1=0.5 is: E[Y(a=1)-Y(a=0)|C1=0.5] = 1.775
###############################################
## fit and return standardized outcome model ##
###############################################
def fit_standardized_outcome_model(df, model_string, C1, fit_bounds=True, print_result=True):
    df_original = df.copy()
    df_A0 = df_original.copy()
    df_A1 = df_original.copy()
    
    df_A0['A'] = 0
    df_A0['C1'] = C1
    df_A0['C1_squared'] = df_A0['C1']*df_A0['C1']
    df_A0['C1_sin'] = 3*np.sin(df_A0['C1'])
    df_A0['C1_squared_sin_interaction'] = df_A0['C1_squared']*df_A0['C1_sin']
    
    df_A1['A'] = 1
    df_A1['C1'] = C1
    df_A1['C1_squared'] = df_A1['C1']*df_A1['C1']
    df_A1['C1_sin'] = 3*np.sin(df_A1['C1'])
    df_A1['C1_squared_sin_interaction'] = df_A1['C1_squared']*df_A1['C1_sin']

    outcome_model = smf.ols(formula=model_string, data=df).fit()
    df_A0['Y_fit'] = outcome_model.predict(df_A0)
    df_A1['Y_fit'] = outcome_model.predict(df_A1)
    mean_unexposed = df_A0['Y_fit'].mean()
    mean_exposed = df_A1['Y_fit'].mean()
    effect_difference = mean_exposed - mean_unexposed
        
    if(fit_bounds):
        effect_difference_list = []
        for i in range(0, 100):
            df_sub = df_original.sample(n=df_original.shape[0], replace=True)
            df_A0_sub = df_sub.copy()
            df_A1_sub = df_sub.copy()
            
            df_A0_sub['A'] = 0
            df_A0_sub['C1'] = C1
            df_A0_sub['C1_squared'] = df_A0_sub['C1']*df_A0_sub['C1']
            df_A0_sub['C1_sin'] = 3*np.sin(df_A0_sub['C1'])
            df_A0_sub['C1_squared_sin_interaction'] = df_A0_sub['C1_squared']*df_A0_sub['C1_sin']
            
            df_A1_sub['A'] = 1
            df_A1_sub['C1'] = C1
            df_A1_sub['C1_squared'] = df_A1_sub['C1']*df_A1_sub['C1']
            df_A1_sub['C1_sin'] = 3*np.sin(df_A1_sub['C1'])
            df_A1_sub['C1_squared_sin_interaction'] = df_A1_sub['C1_squared']*df_A1_sub['C1_sin']
            
            OM_sub = smf.ols(formula=model_string, data=df_sub).fit()
            df_A0_sub['Y_fit'] = OM_sub.predict(df_A0_sub)
            df_A1_sub['Y_fit'] = OM_sub.predict(df_A1_sub)
            mean_unexposed_sub = df_A0_sub['Y_fit'].mean()
            mean_exposed_sub = df_A1_sub['Y_fit'].mean()
            effect_difference_sub = mean_exposed_sub - mean_unexposed_sub
            effect_difference_list.append(effect_difference_sub)
            del df_sub, df_A0_sub, df_A1_sub, OM_sub, mean_unexposed_sub, mean_exposed_sub, effect_difference_sub
   
        variance = np.var(effect_difference_list)
        standard_error = np.sqrt(variance)
        bound_size = 1.96*standard_error
        lower_bound = effect_difference - bound_size
        upper_bound = effect_difference + bound_size
    
    if(print_result):
        print(outcome_model.summary())
        print("The standardized mean in those with A=0 and C1=" + str(C1) +" is: " + str(mean_unexposed))
        print("The standardized mean in those with A=1 and C1=" + str(C1) +" is: " + str(mean_exposed))
        if(fit_bounds):
            print("The standardized effect difference with C1=" + str(C1) +" is " + str(effect_difference) + " with 95% CI (" + str(lower_bound) + ' , ' + str(upper_bound) + ')')
        else:
            print("The standardized effect difference with C1=" + str(C1) + " is " + str(effect_difference))
    
    return(df, outcome_model)
## Standardization via the g-formula with correctly specified Outcome model
_, OM_CS = fit_standardized_outcome_model(df_original.copy(), model_string='Y ~ A + C1 + C1_squared + C1_sin + C1_squared_sin_interaction + C2 + A*C1', C1=0.5)
OLS Regression Results ============================================================================== Dep. Variable: Y R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 7.241e+07 Date: Tue, 15 Dec 2020 Prob (F-statistic): 0.00 Time: 12:30:44 Log-Likelihood: -6440.0 No. Observations: 200000 AIC: 1.290e+04 Df Residuals: 199992 BIC: 1.298e+04 Df Model: 7 Covariance Type: nonrobust ============================================================================================== coef std err t P>|t| [0.025 0.975] ---------------------------------------------------------------------------------------------- Intercept -0.2341 0.004 -65.471 0.000 -0.241 -0.227 A 1.2305 0.003 397.969 0.000 1.224 1.237 C1 1.5610 0.004 434.572 0.000 1.554 1.568 C1_squared 1.4201 0.001 1462.192 0.000 1.418 1.422 C1_sin 2.3105 0.001 2671.685 0.000 2.309 2.312 C1_squared_sin_interaction -0.4299 9.75e-05 -4409.689 0.000 -0.430 -0.430 C2 0.5202 0.001 444.135 0.000 0.518 0.523 A:C1 1.0897 0.001 956.363 0.000 1.087 1.092 ============================================================================== Omnibus: 0.658 Durbin-Watson: 1.993 Prob(Omnibus): 0.720 Jarque-Bera (JB): 0.655 Skew: 0.004 Prob(JB): 0.721 Kurtosis: 3.002 Cond. No. 155. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. The standardized mean in those with A=0 and C1=0.5 is: 4.314202877691639 The standardized mean in those with A=1 and C1=0.5 is: 6.089578604476575 The standardized effect difference with C1=0.5 is 1.7753757267849357 with 95% CI (1.770226691240206 , 1.7805247623296654)