Double Robust Estimation - Notebook by atrothman (atrothman)

Learn practical skills, build real-world projects, and advance your career

Updated 3 years ago

######################
## import libraries ##
######################
import pandas as pd
import numpy as np    
np.random.seed(10815657)
import statsmodels.api as sm
import statsmodels.formula.api as smf

#########################################
## generate simulated dataset function ##
#########################################
def simulate_dataset(n=100000):
    A_split = 0.4
    C1_split = 0.2
    C2_split = 0.65

    df_A1 = pd.DataFrame()
    df_A1['A'] = np.ones(int(n*A_split))
    df_A1['C1'] = np.random.uniform(0, 1, int(n*A_split))
    df_A1['C2'] = np.random.uniform(0, 1, int(n*A_split))
    df_A1.loc[df_A1['C1']<C1_split, 'C1'] = 0
    df_A1.loc[df_A1['C1']>=C1_split, 'C1'] = 1
    df_A1.loc[df_A1['C2']<C2_split, 'C2'] = 0
    df_A1.loc[df_A1['C2']>=C2_split, 'C2'] = 1

    df_A0 = pd.DataFrame()
    df_A0['A'] = np.zeros(int(n*(1-A_split)))
    df_A0['C1'] = np.random.uniform(0, 1, int(n*(1-A_split)))
    df_A0['C2'] = np.random.uniform(0, 1, int(n*(1-A_split)))
    df_A0.loc[df_A0['C1']<(1-C1_split), 'C1'] = 0
    df_A0.loc[df_A0['C1']>=(1-C1_split), 'C1'] = 1
    df_A0.loc[df_A0['C2']<(1-C2_split), 'C2'] = 0
    df_A0.loc[df_A0['C2']>=(1-C2_split), 'C2'] = 1

    df = pd.concat([df_A0, df_A1], axis=0).reset_index(drop=True)
    df['error'] = np.random.normal(0, 0.25, df.shape[0]) 
    del df_A1, df_A0

    B0 = -0.23
    B1 = 1.23
    B2 = 1.56
    B3 = 0.87
    df['Y'] = B0 + (B1*df['A']) + (B2*df['C1']) + (B3*df['C2']) + df['error']
    
    print("The true outcome model is:")
    print('E[Y|A,C1,C2] = ' + str(B0) + ' + ' + str(B1) + '*A + ' + str(B2) + '*C1 + ' + str(B3) + '*C2')
    
    return(df)

## get simulated dataset
df_original = simulate_dataset(n=100000)

The true outcome model is:
E[Y|A,C1,C2] = -0.23 + 1.23*A + 1.56*C1 + 0.87*C2

###############################################
## fit and return standardized outcome model ##
###############################################
def fit_standardized_outcome_model(df, model_string, fit_bounds=True, print_result=True):
    df_original = df.copy()
    df_A0 = df_original.copy()
    df_A1 = df_original.copy()
    df_A0['A'] = 0
    df_A1['A'] = 1
    outcome_model = smf.ols(formula=model_string, data=df).fit()
    df_A0['Y_fit'] = outcome_model.predict(df_A0)
    df_A1['Y_fit'] = outcome_model.predict(df_A1)
    mean_unexposed = df_A0['Y_fit'].mean()
    mean_exposed = df_A1['Y_fit'].mean()
    effect_difference = mean_exposed - mean_unexposed
        
    if(fit_bounds):
        effect_difference_list = []
        for i in range(0, 100):
            df_sub = df_original.sample(n=df_original.shape[0], replace=True)
            df_A0_sub = df_sub.copy()
            df_A1_sub = df_sub.copy()
            df_A0_sub['A'] = 0
            df_A1_sub['A'] = 1
            OM_sub = smf.ols(formula=model_string, data=df_sub).fit()
            df_A0_sub['Y_fit'] = OM_sub.predict(df_A0_sub)
            df_A1_sub['Y_fit'] = OM_sub.predict(df_A1_sub)
            mean_unexposed_sub = df_A0_sub['Y_fit'].mean()
            mean_exposed_sub = df_A1_sub['Y_fit'].mean()
            effect_difference_sub = mean_exposed_sub - mean_unexposed_sub
            effect_difference_list.append(effect_difference_sub)
            del df_sub, df_A0_sub, df_A1_sub, OM_sub, mean_unexposed_sub, mean_exposed_sub, effect_difference_sub
   
        variance = np.var(effect_difference_list)
        standard_error = np.sqrt(variance)
        bound_size = 1.96*standard_error
        lower_bound = effect_difference - bound_size
        upper_bound = effect_difference + bound_size
    
    if(print_result):
        print(outcome_model.summary())
        print("The standardized mean in those with A=0 is: " + str(mean_unexposed))
        print("The standardized mean in those with A=1 is: " + str(mean_exposed))
        if(fit_bounds):
            print("The standardized effect difference is " + str(effect_difference) + " with 95% CI (" + str(lower_bound) + ' , ' + str(upper_bound) + ')')
        else:
            print("The standardized effect difference is " + str(effect_difference))
    
    return(df, outcome_model)

## fit and return correctly specified outcome model
df_OM_CS, OM_CS = fit_standardized_outcome_model(df_original.copy(), 'Y ~ A + C1 + C2')

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                 7.620e+05
Date:                Thu, 03 Dec 2020   Prob (F-statistic):               0.00
Time:                        11:33:19   Log-Likelihood:                -3353.7
No. Observations:              100000   AIC:                             6715.
Df Residuals:                   99996   BIC:                             6753.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.2272      0.002   -147.829      0.000      -0.230      -0.224
A              1.2280      0.002    593.422      0.000       1.224       1.232
C1             1.5588      0.002    786.222      0.000       1.555       1.563
C2             0.8679      0.002    523.136      0.000       0.865       0.871
==============================================================================
Omnibus:                        0.738   Durbin-Watson:                   1.992
Prob(Omnibus):                  0.691   Jarque-Bera (JB):                0.748
Skew:                           0.000   Prob(JB):                        0.688
Kurtosis:                       2.987   Cond. No.                         4.31
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The standardized mean in those with A=0 is: 0.9177235440004516
The standardized mean in those with A=1 is: 2.1456780075228004
The standardized effect difference is 1.2279544635223487 with 95% CI (1.2241617612334224 , 1.231747165811275)