Learn practical skills, build real-world projects, and advance your career
####################
## load libraries ##
####################
import numpy as np
import pandas as pd
np.random.seed(123456789)
import statsmodels.api as sm
import statsmodels.formula.api as smf
######################################
## function to simulate toy dataset ##
######################################
def simulate_df(n, A_on_Y, seed=123456):
    np.random.seed(seed)
    ## specify dataframe
    df = pd.DataFrame()

    ## specify variables L1 through L6
    L1_split = 0.52
    L2_split = 0.23
    L3_split = 0.38
    df['L1'] = np.random.choice([0, 1], size=n, replace=True, p=[L1_split, (1-L1_split)])
    df['L2'] = np.random.choice([0, 1], size=n, replace=True, p=[L2_split, (1-L2_split)])
    df['L3'] = np.random.choice([0, 1], size=n, replace=True, p=[L3_split, (1-L3_split)])
    df['L4'] = np.random.normal(0, 1, df.shape[0])
    df['L5'] = np.random.normal(0, 0.75, df.shape[0])
    df['L6'] = np.random.normal(0, 2, df.shape[0])

    ## specify intervention A
    beta_0 = 1.2
    beta_1 = -2.56
    beta_2 = -1.78
    beta_3 = 3.52
    beta_4 = 0.98
    beta_5 = -2.17
    beta_6 = 1.52
    Z = beta_0 + (beta_1*df['L1']) + (beta_2*df['L2']) + (beta_3*df['L3']) + (beta_4*df['L4']) + (beta_5*df['L5']) + (beta_6*df['L6'])
    p = 1 / (1 + np.exp(-Z))
    df['A'] = np.random.binomial(1, p)
    
    ## specify outcome Y
    theta_0 = -0.5
    theta_1 = A_on_Y
    theta_2 = 2.38
    theta_3 = 1.42
    theta_4 = 4.32
    theta_5 = -2.15
    theta_6 = 1.12
    theta_7 = -2.29
    df['Y'] = theta_0 + (theta_1*df['A']) + (theta_2*df['L1']) + (theta_3*df['L2']) + (theta_4*df['L3']) + (theta_5*df['L4']) + (theta_6*df['L5']) + (theta_7*df['L6']) + np.random.normal(0, 1, df.shape[0])
    
    return(df)
#########################################################
## function for univariate regression of outcome model ##
#########################################################
def unadjusted_regression(df):
    model_unadjusted = smf.ols('Y ~ A', data=df).fit()
    print(model_unadjusted.summary())
##############################################################################
## function for multivariate regression of outcome model, conditioning on L ##
##############################################################################
def adjusted_regression_L(df):
    model_adjusted_L = smf.ols('Y ~ A + L1 + L2 + L3 + L4 + L5 + L6', data=df).fit()
    print(model_adjusted_L.summary())
#############################################################################################
## function for multivariate regression of outcome model, conditioning on Propensity Score ##
#############################################################################################
def adjusted_regression_PS(df):
    model_PS = smf.glm('A ~ L1 + L2 + L3 + L4 + L5 + L6', data=df, family=sm.families.Binomial()).fit()
    df['PS'] = model_PS.predict(df)
    model_adjusted_PS = smf.ols('Y ~ A + PS', data=df).fit()
    print(model_adjusted_PS.summary())