Learn practical skills, build real-world projects, and advance your career
######################
## import libraries ##
######################
import pandas as pd
import numpy as np    
np.random.seed(10815657)
import statsmodels.api as sm
import statsmodels.formula.api as smf
#########################################
## generate simulated dataset function ##
#########################################
def simulate_dataset(n=100000):
    A_split = 0.4
    C1_split = 0.2
    C2_split = 0.65

    df_A1 = pd.DataFrame()
    df_A1['A'] = np.ones(int(n*A_split))
    df_A1['C1'] = np.random.uniform(0, 1, int(n*A_split))
    df_A1['C2'] = np.random.uniform(0, 1, int(n*A_split))
    df_A1.loc[df_A1['C1']<C1_split, 'C1'] = 0
    df_A1.loc[df_A1['C1']>=C1_split, 'C1'] = 1
    df_A1.loc[df_A1['C2']<C2_split, 'C2'] = 0
    df_A1.loc[df_A1['C2']>=C2_split, 'C2'] = 1

    df_A0 = pd.DataFrame()
    df_A0['A'] = np.zeros(int(n*(1-A_split)))
    df_A0['C1'] = np.random.uniform(0, 1, int(n*(1-A_split)))
    df_A0['C2'] = np.random.uniform(0, 1, int(n*(1-A_split)))
    df_A0.loc[df_A0['C1']<(1-C1_split), 'C1'] = 0
    df_A0.loc[df_A0['C1']>=(1-C1_split), 'C1'] = 1
    df_A0.loc[df_A0['C2']<(1-C2_split), 'C2'] = 0
    df_A0.loc[df_A0['C2']>=(1-C2_split), 'C2'] = 1

    df = pd.concat([df_A0, df_A1], axis=0).reset_index(drop=True)
    df['error'] = np.random.normal(0, 0.25, df.shape[0]) 
    del df_A1, df_A0

    B0 = -0.23
    B1 = 1.23
    B2 = 1.56
    B3 = 0.87
    df['Y'] = B0 + (B1*df['A']) + (B2*df['C1']) + (B3*df['C2']) + df['error']
    
    print("The true outcome model is:")
    print('E[Y|A,C1,C2] = ' + str(B0) + ' + ' + str(B1) + '*A + ' + str(B2) + '*C1 + ' + str(B3) + '*C2')
    
    return(df)
## get simulated dataset
df_original = simulate_dataset(n=100000)
The true outcome model is: E[Y|A,C1,C2] = -0.23 + 1.23*A + 1.56*C1 + 0.87*C2
###############################################
## fit and return standardized outcome model ##
###############################################
def fit_standardized_outcome_model(df, model_string, fit_bounds=True, print_result=True):
    df_original = df.copy()
    df_A0 = df_original.copy()
    df_A1 = df_original.copy()
    df_A0['A'] = 0
    df_A1['A'] = 1
    outcome_model = smf.ols(formula=model_string, data=df).fit()
    df_A0['Y_fit'] = outcome_model.predict(df_A0)
    df_A1['Y_fit'] = outcome_model.predict(df_A1)
    mean_unexposed = df_A0['Y_fit'].mean()
    mean_exposed = df_A1['Y_fit'].mean()
    effect_difference = mean_exposed - mean_unexposed
        
    if(fit_bounds):
        effect_difference_list = []
        for i in range(0, 100):
            df_sub = df_original.sample(n=df_original.shape[0], replace=True)
            df_A0_sub = df_sub.copy()
            df_A1_sub = df_sub.copy()
            df_A0_sub['A'] = 0
            df_A1_sub['A'] = 1
            OM_sub = smf.ols(formula=model_string, data=df_sub).fit()
            df_A0_sub['Y_fit'] = OM_sub.predict(df_A0_sub)
            df_A1_sub['Y_fit'] = OM_sub.predict(df_A1_sub)
            mean_unexposed_sub = df_A0_sub['Y_fit'].mean()
            mean_exposed_sub = df_A1_sub['Y_fit'].mean()
            effect_difference_sub = mean_exposed_sub - mean_unexposed_sub
            effect_difference_list.append(effect_difference_sub)
            del df_sub, df_A0_sub, df_A1_sub, OM_sub, mean_unexposed_sub, mean_exposed_sub, effect_difference_sub
   
        variance = np.var(effect_difference_list)
        standard_error = np.sqrt(variance)
        bound_size = 1.96*standard_error
        lower_bound = effect_difference - bound_size
        upper_bound = effect_difference + bound_size
    
    if(print_result):
        print(outcome_model.summary())
        print("The standardized mean in those with A=0 is: " + str(mean_unexposed))
        print("The standardized mean in those with A=1 is: " + str(mean_exposed))
        if(fit_bounds):
            print("The standardized effect difference is " + str(effect_difference) + " with 95% CI (" + str(lower_bound) + ' , ' + str(upper_bound) + ')')
        else:
            print("The standardized effect difference is " + str(effect_difference))
    
    return(df, outcome_model)
## fit and return correctly specified outcome model
df_OM_CS, OM_CS = fit_standardized_outcome_model(df_original.copy(), 'Y ~ A + C1 + C2')
OLS Regression Results ============================================================================== Dep. Variable: Y R-squared: 0.958 Model: OLS Adj. R-squared: 0.958 Method: Least Squares F-statistic: 7.620e+05 Date: Thu, 03 Dec 2020 Prob (F-statistic): 0.00 Time: 11:33:19 Log-Likelihood: -3353.7 No. Observations: 100000 AIC: 6715. Df Residuals: 99996 BIC: 6753. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -0.2272 0.002 -147.829 0.000 -0.230 -0.224 A 1.2280 0.002 593.422 0.000 1.224 1.232 C1 1.5588 0.002 786.222 0.000 1.555 1.563 C2 0.8679 0.002 523.136 0.000 0.865 0.871 ============================================================================== Omnibus: 0.738 Durbin-Watson: 1.992 Prob(Omnibus): 0.691 Jarque-Bera (JB): 0.748 Skew: 0.000 Prob(JB): 0.688 Kurtosis: 2.987 Cond. No. 4.31 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. The standardized mean in those with A=0 is: 0.9177235440004516 The standardized mean in those with A=1 is: 2.1456780075228004 The standardized effect difference is 1.2279544635223487 with 95% CI (1.2241617612334224 , 1.231747165811275)