####################
## load libraries ##
####################
import numpy as np
import pandas as pd
np.random.seed(12345)
import statsmodels.api as sm
import statsmodels.formula.api as smf
###########################################################################
## simulate dataset and analysis (with and without covariate adjustment) ##
###########################################################################
def simulate_df(n=200, outcome_type='normal', A_on_Y=2, data_only=False, seed=123456):
    """Simulate a randomized trial with binary treatment A, covariates L1-L6, and outcome Y.
    Fits and prints unadjusted and covariate-adjusted models unless data_only=True."""
    np.random.seed(seed)
    ## specify dataframe with a balanced 1:1 treatment assignment (exactly n/2 treated)
    df = pd.DataFrame()
    df['A'] = np.random.choice(int(n/2)*[0] + int(n/2)*[1], size=n, replace=False)

    ## specify covariates L1 through L6: L1-L3 are binary (each *_split is the probability of 0),
    ## L4-L6 are continuous normal
    L1_split = 0.52
    L2_split = 0.23
    L3_split = 0.38
    df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
    df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
    df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
    df['L4'] = np.random.normal(0, 1, df.shape[0])
    df['L5'] = np.random.normal(0, 0.75, df.shape[0])
    df['L6'] = np.random.normal(0, 2, df.shape[0])
    
    ## counterfactual copies of the data with everyone set to A=0 and to A=1,
    ## used below to standardize (average) model predictions over the covariates
    df_A0 = df.copy()
    df_A0['A'] = 0
    df_A1 = df.copy()
    df_A1['A'] = 1
    
    ## true outcome-model coefficients; theta_1 (= A_on_Y) is the causal effect of A on Y
    theta_0 = -0.5
    theta_1 = A_on_Y
    theta_2 = 0.28
    theta_3 = 0.42
    theta_4 = 0.32
    theta_5 = -0.15
    theta_6 = 0.12
    theta_7 = -0.29
    
    if outcome_type == 'normal':
        df['Y'] = theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2']) + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5']) + (theta_7*df['L6']) + np.random.normal(0, 1, df.shape[0])
        if data_only:
            return df
        model_unadjusted = smf.ols('Y ~ A', data=df).fit()
        print(model_unadjusted.summary())
        model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
        print(model_adjusted.summary())
        print('The sample variance is: ' + str(df['Y'].var()))
        
    elif outcome_type == 'binomial':
        df['Z'] = theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2']) + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5']) + (theta_7*df['L6'])
        df['p'] = 1 / (1 + np.exp(-df['Z']))  ## inverse-logit link
        df['Y'] = np.random.binomial(1, df['p'])
        if data_only:
            return df
        model_unadjusted = smf.glm('Y ~ A', data=df, family=sm.families.Binomial()).fit()
        print(model_unadjusted.summary())
        model_adjusted = smf.glm('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
        print(model_adjusted.summary())
    
    
    ## standardization: predict for every subject with A set to 0 and to 1, then average
    df_A0['Y_pred_unadjusted'] = model_unadjusted.predict(df_A0)
    df_A1['Y_pred_unadjusted'] = model_unadjusted.predict(df_A1)
    df_A0['Y_pred_adjusted'] = model_adjusted.predict(df_A0)
    df_A1['Y_pred_adjusted'] = model_adjusted.predict(df_A1)

    print('The unadjusted Causal Effect Difference is: ' + str(df_A1['Y_pred_unadjusted'].mean() - df_A0['Y_pred_unadjusted'].mean()))
    print('The adjusted Causal Effect Difference is: ' + str(df_A1['Y_pred_adjusted'].mean() - df_A0['Y_pred_adjusted'].mean()))

    return df
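
## Optional illustration (my addition, not part of the original analysis): with data_only=True the
## function above just returns the simulated dataframe, and because A is randomized the crude
## difference in group means of Y should land near the chosen A_on_Y.
df_check = simulate_df(n=200, outcome_type='normal', A_on_Y=0.5, data_only=True, seed=42)
group_means = df_check.groupby('A')['Y'].mean()
print('Crude difference in means: ' + str(group_means.loc[1] - group_means.loc[0]))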
######################
## Power Simulation ##
######################
def power_analysis(df_original, outcome_type='normal', m=10, alpha=0.05, A_on_Y=0.5, seed=123456):
    """Estimate power for the unadjusted and covariate-adjusted tests of A by simulating m datasets
    of the same size as df_original and recording the p-value for A from each fitted model."""
    np.random.seed(seed)

    df_p_values = pd.DataFrame()
    df_p_values['m'] = list(range(0, m))
    df_p_values['p_unadjusted'] = None
    df_p_values['p_adjusted'] = None
    
    for i in range(0, m):
        #df = df_original.sample(n=df_original.shape[0], replace=True, axis=0).reset_index(drop=True)

        if outcome_type == 'normal':
            df = simulate_df(n=df_original.shape[0], outcome_type='normal', A_on_Y=A_on_Y, data_only=True, seed=i)
            model_unadjusted = smf.ols('Y ~ A', data=df).fit()
            df_p_values.loc[i, 'p_unadjusted'] = model_unadjusted.pvalues['A']
            model_adjusted = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
            df_p_values.loc[i, 'p_adjusted'] = model_adjusted.pvalues['A']
            del df, model_unadjusted, model_adjusted

        elif outcome_type == 'binomial':
            df = simulate_df(n=df_original.shape[0], outcome_type='binomial', A_on_Y=A_on_Y, data_only=True, seed=i)
            model_unadjusted = smf.glm('Y ~ A', data=df, family=sm.families.Binomial()).fit()
            df_p_values.loc[i, 'p_unadjusted'] = model_unadjusted.pvalues['A']
            model_adjusted = smf.glm('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
            df_p_values.loc[i, 'p_adjusted'] = model_adjusted.pvalues['A']
            del df, model_unadjusted, model_adjusted
    
    ## power = proportion of the m simulated datasets in which the test of A rejects at level alpha
    unadjusted_power = 1 - (df_p_values.loc[df_p_values['p_unadjusted'] > alpha, :].shape[0] / m)
    adjusted_power = 1 - (df_p_values.loc[df_p_values['p_adjusted'] > alpha, :].shape[0] / m)

    print('The unadjusted power is: ' + str(unadjusted_power))
    print('The adjusted power is: ' + str(adjusted_power))
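
## Optional sketch (my assumption about usage, not shown in the original run): power_analysis only
## uses df_original for its row count, so candidate sample sizes can be screened by handing it
## simulated dataframes of different sizes; m is kept small here only to keep the illustration fast.
for n_candidate in [100, 150, 200]:
    df_tmp = simulate_df(n=n_candidate, outcome_type='normal', A_on_Y=0.5, data_only=True, seed=1)
    print('n = ' + str(n_candidate))
    power_analysis(df_tmp, outcome_type='normal', m=500, alpha=0.05, A_on_Y=0.5, seed=1)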
#############################
## Continuous Outcome Case ##
#############################
## Recover unbiased estimate of Mean Causal Effect Difference, with and without covariate adjustment:
df_normal = simulate_df(n=134, outcome_type='normal', A_on_Y=0.5, data_only=False, seed=1081565)
## conduct power analysis, both with and without covariate adjustment:
power_analysis(df_normal, outcome_type='normal', m=10000, alpha=0.05, A_on_Y=0.5, seed=123)
                            OLS Regression Results
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     7.911
Date:                Mon, 12 Apr 2021   Prob (F-statistic):            0.00566
Time:                        00:17:52   Log-Likelihood:                -207.28
No. Observations:                 134   AIC:                             418.6
Df Residuals:                     132   BIC:                             424.4
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2847      0.140      2.035      0.044       0.008       0.561
A              0.5565      0.198      2.813      0.006       0.165       0.948
==============================================================================
Omnibus:                        5.256   Durbin-Watson:                   1.872
Prob(Omnibus):                  0.072   Jarque-Bera (JB):                5.295
Skew:                          -0.310   Prob(JB):                       0.0708
Kurtosis:                       3.751   Cond. No.                         2.62
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

                            OLS Regression Results
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.412
Model:                            OLS   Adj. R-squared:                  0.379
Method:                 Least Squares   F-statistic:                     12.60
Date:                Mon, 12 Apr 2021   Prob (F-statistic):           3.55e-12
Time:                        00:17:52   Log-Likelihood:                -175.63
No. Observations:                 134   AIC:                             367.3
Df Residuals:                     126   BIC:                             390.5
Df Model:                           7
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.6208      0.212     -2.924      0.004      -1.041      -0.201
A              0.5069      0.161      3.140      0.002       0.187       0.826
L1             0.0920      0.161      0.572      0.569      -0.227       0.411
L2             0.7655      0.202      3.783      0.000       0.365       1.166
L3             0.3778      0.165      2.286      0.024       0.051       0.705
L4            -0.0274      0.081     -0.340      0.735      -0.187       0.132
L5             0.0959      0.108      0.885      0.378      -0.119       0.310
L6            -0.2899      0.043     -6.716      0.000      -0.375      -0.204
==============================================================================
Omnibus:                        0.277   Durbin-Watson:                   1.833
Prob(Omnibus):                  0.871   Jarque-Bera (JB):                0.205
Skew:                           0.095   Prob(JB):                        0.903
Kurtosis:                       2.979   Cond. No.                         6.48
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

The sample variance is: 1.3793646703039655
The unadjusted Causal Effect Difference is: 0.5564637241021395
The adjusted Causal Effect Difference is: 0.5069038658712943
The unadjusted power is: 0.6735
The adjusted power is: 0.8023
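
## Illustrative check (refit on df_normal returned above; not part of the original output): covariate
## adjustment removes the outcome variance explained by L1-L6, shrinking the residual variance and the
## standard error of the A coefficient, which is what drives the power gain seen in the simulation.
fit_unadj = smf.ols('Y ~ A', data=df_normal).fit()
fit_adj = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df_normal).fit()
print('Residual variance, unadjusted vs adjusted: ' + str(fit_unadj.mse_resid) + ' vs ' + str(fit_adj.mse_resid))
print('Std. error of A, unadjusted vs adjusted: ' + str(fit_unadj.bse['A']) + ' vs ' + str(fit_adj.bse['A']))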
#########################
## Binary Outcome Case ##
#########################
## Recover unbiased estimate of Mean Causal Effect Difference, with and without covariate adjustment:
df_binomial = simulate_df(n=652, outcome_type='binomial', A_on_Y=0.5, data_only=False, seed=12345678)
## conduct power analysis, both with and without covariate adjustment:
power_analysis(df_binomial, outcome_type='binomial', m=10000, alpha=0.05, A_on_Y=0.5, seed=123456)
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                      Y   No. Observations:                  652
Model:                            GLM   Df Residuals:                      650
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -437.20
Date:                Mon, 12 Apr 2021   Deviance:                       874.40
Time:                        00:26:45   Pearson chi2:                     652.
No. Iterations:                     4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1970      0.111      1.769      0.077      -0.021       0.415
A              0.3832      0.160      2.389      0.017       0.069       0.698
==============================================================================

                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                      Y   No. Observations:                  652
Model:                            GLM   Df Residuals:                      644
Model Family:                Binomial   Df Model:                            7
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -392.91
Date:                Mon, 12 Apr 2021   Deviance:                       785.82
Time:                        00:26:45   Pearson chi2:                     651.
No. Iterations:                     4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.5910      0.246     -2.401      0.016      -1.074      -0.109
A              0.4749      0.173      2.739      0.006       0.135       0.815
L1             0.4837      0.174      2.774      0.006       0.142       0.825
L2             0.4122      0.207      1.994      0.046       0.007       0.817
L3             0.3906      0.176      2.220      0.026       0.046       0.735
L4            -0.0240      0.083     -0.291      0.771      -0.186       0.138
L5             0.1518      0.118      1.282      0.200      -0.080       0.384
L6            -0.3961      0.051     -7.746      0.000      -0.496      -0.296
==============================================================================

The unadjusted Causal Effect Difference is: 0.0920245398772942
The adjusted Causal Effect Difference is: 0.09912605022292376
The unadjusted power is: 0.8187
The adjusted power is: 0.851
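
## Illustrative check (my addition, not part of the original output): under the unadjusted logistic
## model the standardized difference printed above equals the crude difference in observed event
## proportions, while the adjusted figure averages each subject's predicted risk under A=1 versus A=0
## (standardization), so both are reported on the risk-difference scale despite the log-odds
## coefficients in the summaries above.
p1 = df_binomial.loc[df_binomial['A'] == 1, 'Y'].mean()
p0 = df_binomial.loc[df_binomial['A'] == 0, 'Y'].mean()
print('Crude risk difference: ' + str(p1 - p0))  ## matches the unadjusted Causal Effect Difference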