Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
####################
## load libraries ##
####################
import numpy as np
import pandas as pd
np.random.seed(12345)  # global seed; NOTE: both functions below call np.random.seed themselves, so this only affects ad-hoc draws made outside them
import statsmodels.api as sm
import statsmodels.formula.api as smf
##########################################################################
## simulate dataset and analysis (with and without covariate adjustment) ##
##########################################################################
def simulate_df(n=200, outcome_type='normal', A_on_Y=2, data_only=False, seed=123456):
    """Simulate a balanced randomized experiment and (optionally) analyze it.

    Draws ``n`` observations with a balanced binary treatment ``A`` (exactly
    n/2 treated), three binary covariates ``L1``-``L3``, three normal
    covariates ``L4``-``L6``, and an outcome ``Y`` generated from a linear
    (or logit-linear) structural model in which the true effect of ``A`` is
    ``A_on_Y``. Unless ``data_only`` is True, it also fits unadjusted and
    covariate-adjusted models and prints the standardization (g-formula)
    estimate of the mean causal effect difference.

    Parameters
    ----------
    n : int
        Sample size; n/2 units are treated and n/2 are control.
    outcome_type : {'normal', 'binomial'}
        Continuous outcome with N(0, 1) noise, or binary outcome drawn
        from a logistic model.
    A_on_Y : float
        True coefficient of ``A`` (identity scale for 'normal', logit
        scale for 'binomial').
    data_only : bool
        If True, return the simulated DataFrame without fitting or
        printing any models.
    seed : int
        NumPy RNG seed for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The simulated data. For 'binomial' it also carries the
        intermediate linear predictor ``Z`` and probability ``p`` columns.

    Raises
    ------
    ValueError
        If ``outcome_type`` is not one of the supported values.
    """
    # Fail fast on an unsupported outcome type (previously this fell
    # through and raised a confusing NameError further down).
    if outcome_type not in ('normal', 'binomial'):
        raise ValueError("outcome_type must be 'normal' or 'binomial'")
    np.random.seed(seed)
    ## specify dataframe
    df = pd.DataFrame()
    # balanced treatment assignment: sample n/2 zeros and n/2 ones without replacement
    df['A'] = np.random.choice(int(n/2)*[0] + int(n/2)*[1], size=n, replace=False)
    ## specify variables L1 through L6; each *_split is P(L == 0)
    L1_split = 0.52
    L2_split = 0.23
    L3_split = 0.38
    df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
    df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
    df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
    df['L4'] = np.random.normal(0, 1, df.shape[0])
    df['L5'] = np.random.normal(0, 0.75, df.shape[0])
    df['L6'] = np.random.normal(0, 2, df.shape[0])
    # true structural coefficients; theta_1 is the causal effect of interest
    theta_0 = -0.5
    theta_1 = A_on_Y
    theta_2 = 0.28
    theta_3 = 0.42
    theta_4 = 0.32
    theta_5 = -0.15
    theta_6 = 0.12
    theta_7 = -0.29
    # shared linear predictor (identity scale for 'normal', logit scale for 'binomial')
    linpred = theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2']) + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5']) + (theta_7*df['L6'])
    if outcome_type == 'normal':
        df['Y'] = linpred + np.random.normal(0, 1, df.shape[0])
    else:  # 'binomial'
        df['Z'] = linpred
        df['p'] = 1 / (1 + np.exp(-df['Z']))
        df['Y'] = np.random.binomial(1, df['p'])
    if data_only:
        return df
    # fit unadjusted and covariate-adjusted models and print their summaries
    if outcome_type == 'normal':
        model_unadjusted = smf.ols('Y ~ A', data=df).fit()
        print(model_unadjusted.summary())
        model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
        print(model_adjusted.summary())
        print('The sample variance is: ' + str(df['Y'].var()))
    else:  # 'binomial'
        model_unadjusted = smf.glm('Y ~ A', data=df, family=sm.families.Binomial()).fit()
        print(model_unadjusted.summary())
        model_adjusted = smf.glm('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
        print(model_adjusted.summary())
    # counterfactual copies: the whole sample set to untreated / treated
    # (built only when needed, i.e. not on the data_only early-return path)
    df_A0 = df.copy()
    df_A0['A'] = 0
    df_A1 = df.copy()
    df_A1['A'] = 1
    df_A0['Y_pred_unadjusted'] = model_unadjusted.predict(df_A0)
    df_A1['Y_pred_unadjusted'] = model_unadjusted.predict(df_A1)
    df_A0['Y_pred_adjusted'] = model_adjusted.predict(df_A0)
    df_A1['Y_pred_adjusted'] = model_adjusted.predict(df_A1)
    # standardization estimate: mean predicted outcome under A=1 minus under A=0
    print('The unadjusted Causal Effect Difference is: ' + str(df_A1['Y_pred_unadjusted'].mean() - df_A0['Y_pred_unadjusted'].mean()))
    print('The adjusted Causal Effect Difference is: ' + str(df_A1['Y_pred_adjusted'].mean() - df_A0['Y_pred_adjusted'].mean()))
    return df
######################
## Power Simulation ##
######################
def power_analysis(df_original, outcome_type='normal', m=10, alpha=0.05, A_on_Y=0.5, seed=123456):
    """Monte Carlo power analysis, with and without covariate adjustment.

    Simulates ``m`` fresh datasets of the same size as ``df_original``
    (only its row count is used), fits the unadjusted and adjusted models
    on each, and estimates power as the proportion of replicates whose
    p-value for ``A`` is <= ``alpha``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Reference dataset; only ``df_original.shape[0]`` (the sample
        size) is used.
    outcome_type : {'normal', 'binomial'}
        Passed through to ``simulate_df``; selects OLS or logistic GLM.
    m : int
        Number of Monte Carlo replicates.
    alpha : float
        Significance threshold for rejecting H0: no effect of A.
    A_on_Y : float
        True effect of A used to generate each replicate.
    seed : int
        NOTE(review): replicate i reseeds with seed=i inside the loop,
        so this argument has no effect on the results; kept (and seeded)
        only to preserve the original interface and recorded output.

    Returns
    -------
    tuple of (float, float)
        (unadjusted_power, adjusted_power). The original printed these
        without returning; the return value is a backward-compatible
        addition.

    Raises
    ------
    ValueError
        If ``outcome_type`` is not one of the supported values.
    """
    if outcome_type not in ('normal', 'binomial'):
        raise ValueError("outcome_type must be 'normal' or 'binomial'")
    np.random.seed(seed)
    p_unadjusted = np.empty(m, dtype=float)
    p_adjusted = np.empty(m, dtype=float)
    for i in range(m):
        # each replicate is an independent simulation at the same sample size
        df = simulate_df(n=df_original.shape[0], outcome_type=outcome_type, A_on_Y=A_on_Y, data_only=True, seed=i)
        if outcome_type == 'normal':
            model_unadjusted = smf.ols('Y ~ A', data=df).fit()
            model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
        else:  # 'binomial'
            model_unadjusted = smf.glm('Y ~ A', data=df, family=sm.families.Binomial()).fit()
            model_adjusted = smf.glm('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
        p_unadjusted[i] = model_unadjusted.pvalues['A']
        p_adjusted[i] = model_adjusted.pvalues['A']
    # power = share of replicates rejecting at level alpha (p <= alpha),
    # written as 1 - share of failures to match the original arithmetic exactly
    unadjusted_power = 1 - int((p_unadjusted > alpha).sum()) / m
    adjusted_power = 1 - int((p_adjusted > alpha).sum()) / m
    print('The unadjusted power is: ' + str(unadjusted_power))
    print('The adjusted power is: ' + str(adjusted_power))
    return unadjusted_power, adjusted_power
#############################
## Continuous Outcome Case ##
#############################
## Recover unbiased estimate of Mean Causal Effect Difference, with and without covariate adjustment:
df_normal = simulate_df(n=134, outcome_type='normal', A_on_Y=0.5, data_only=False, seed=1081565)
## conduct power analysis, both with and without covariate adjustment:
## NOTE: m=10000 replicates, each fitting two OLS models — expect this to take a while
power_analysis(df_normal, outcome_type='normal', m=10000, alpha=0.05, A_on_Y=0.5, seed=123)
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.057
Model: OLS Adj. R-squared: 0.049
Method: Least Squares F-statistic: 7.911
Date: Mon, 12 Apr 2021 Prob (F-statistic): 0.00566
Time: 00:17:52 Log-Likelihood: -207.28
No. Observations: 134 AIC: 418.6
Df Residuals: 132 BIC: 424.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.2847 0.140 2.035 0.044 0.008 0.561
A 0.5565 0.198 2.813 0.006 0.165 0.948
==============================================================================
Omnibus: 5.256 Durbin-Watson: 1.872
Prob(Omnibus): 0.072 Jarque-Bera (JB): 5.295
Skew: -0.310 Prob(JB): 0.0708
Kurtosis: 3.751 Cond. No. 2.62
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.412
Model: OLS Adj. R-squared: 0.379
Method: Least Squares F-statistic: 12.60
Date: Mon, 12 Apr 2021 Prob (F-statistic): 3.55e-12
Time: 00:17:52 Log-Likelihood: -175.63
No. Observations: 134 AIC: 367.3
Df Residuals: 126 BIC: 390.5
Df Model: 7
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.6208 0.212 -2.924 0.004 -1.041 -0.201
A 0.5069 0.161 3.140 0.002 0.187 0.826
L1 0.0920 0.161 0.572 0.569 -0.227 0.411
L2 0.7655 0.202 3.783 0.000 0.365 1.166
L3 0.3778 0.165 2.286 0.024 0.051 0.705
L4 -0.0274 0.081 -0.340 0.735 -0.187 0.132
L5 0.0959 0.108 0.885 0.378 -0.119 0.310
L6 -0.2899 0.043 -6.716 0.000 -0.375 -0.204
==============================================================================
Omnibus: 0.277 Durbin-Watson: 1.833
Prob(Omnibus): 0.871 Jarque-Bera (JB): 0.205
Skew: 0.095 Prob(JB): 0.903
Kurtosis: 2.979 Cond. No. 6.48
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The sample variance is: 1.3793646703039655
The unadjusted Causal Effect Difference is: 0.5564637241021395
The adjusted Causal Effect Difference is: 0.5069038658712943
The unadjusted power is: 0.6735
The adjusted power is: 0.8023
#########################
## Binary Outcome Case ##
#########################
## Recover unbiased estimate of Mean Causal Effect Difference, with and without covariate adjustment:
df_binomial = simulate_df(n=652, outcome_type='binomial', A_on_Y=0.5, data_only=False, seed=12345678)
## conduct power analysis, both with and without covariate adjustment:
## NOTE: m=10000 replicates, each fitting two logistic GLMs — expect this to take a while
power_analysis(df_binomial, outcome_type='binomial', m=10000, alpha=0.05, A_on_Y=0.5, seed=123456)
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Y No. Observations: 652
Model: GLM Df Residuals: 650
Model Family: Binomial Df Model: 1
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -437.20
Date: Mon, 12 Apr 2021 Deviance: 874.40
Time: 00:26:45 Pearson chi2: 652.
No. Iterations: 4
Covariance Type: nonrobust
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.1970 0.111 1.769 0.077 -0.021 0.415
A 0.3832 0.160 2.389 0.017 0.069 0.698
==============================================================================
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Y No. Observations: 652
Model: GLM Df Residuals: 644
Model Family: Binomial Df Model: 7
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -392.91
Date: Mon, 12 Apr 2021 Deviance: 785.82
Time: 00:26:45 Pearson chi2: 651.
No. Iterations: 4
Covariance Type: nonrobust
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.5910 0.246 -2.401 0.016 -1.074 -0.109
A 0.4749 0.173 2.739 0.006 0.135 0.815
L1 0.4837 0.174 2.774 0.006 0.142 0.825
L2 0.4122 0.207 1.994 0.046 0.007 0.817
L3 0.3906 0.176 2.220 0.026 0.046 0.735
L4 -0.0240 0.083 -0.291 0.771 -0.186 0.138
L5 0.1518 0.118 1.282 0.200 -0.080 0.384
L6 -0.3961 0.051 -7.746 0.000 -0.496 -0.296
==============================================================================
The unadjusted Causal Effect Difference is: 0.0920245398772942
The adjusted Causal Effect Difference is: 0.09912605022292376
The unadjusted power is: 0.8187
The adjusted power is: 0.851