######################
## import libraries ##
######################
import numpy as np
np.random.seed(10815652)
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
def simulate_dataset(n=100000, A_on_Y=0):
 
    ## A: observed binary intervention (0, 1)
    ## L: observed binary covariate, a common effect of U1 and U2 (0, 1)
    ## U1: binary unmeasured common cause of A and L
    ## U2: binary unmeasured common cause of L and Y
    ## Y: continuous outcome
    
    ## specify dataframe
    df = pd.DataFrame()
    
    ## specify variables U1 and U2
    U1_split = 0.52
    U2_split = 0.23
    df['U1'] = np.random.choice([0, 1], size=n, replace=True, p=[U1_split, (1-U1_split)])
    df['U2'] = np.random.choice([0, 1], size=n, replace=True, p=[U2_split, (1-U2_split)])
    
    ## specify variable A
    lambda_0 = -2.32
    lambda_1 = 5.18
    df['A'] = 0
    df.loc[df['U1']==0, 'A'] = np.random.binomial(1, (1/(1+np.exp(-lambda_0))), size=df.loc[df['U1']==0, 'A'].shape[0])
    df.loc[df['U1']==1, 'A'] = np.random.binomial(1, (1/(1+np.exp(-(lambda_0+lambda_1)))), size=df.loc[df['U1']==1, 'A'].shape[0])

    ## specify variable L
    Beta_0 = -0.52
    Beta_1 = 2.32
    Beta_2 = 1.98
    df['L'] = 0
    df.loc[(df['U1']==0) & (df['U2']==0), 'L'] = np.random.binomial(1, (1/(1+np.exp(-Beta_0))), size=df.loc[(df['U1']==0) & (df['U2']==0), 'L'].shape[0])
    df.loc[(df['U1']==1) & (df['U2']==0), 'L'] = np.random.binomial(1, (1/(1+np.exp(-(Beta_0+Beta_1)))), size=df.loc[(df['U1']==1) & (df['U2']==0), 'L'].shape[0])
    df.loc[(df['U1']==0) & (df['U2']==1), 'L'] = np.random.binomial(1, (1/(1+np.exp(-(Beta_0+Beta_2)))), size=df.loc[(df['U1']==0) & (df['U2']==1), 'L'].shape[0])
    df.loc[(df['U1']==1) & (df['U2']==1), 'L'] = np.random.binomial(1, (1/(1+np.exp(-(Beta_0+Beta_1+Beta_2)))), size=df.loc[(df['U1']==1) & (df['U2']==1), 'L'].shape[0])
    
    ## specify variable Y
    theta_0 = -0.5
    theta_1 = 5.78
    theta_2 = A_on_Y
    df['Y'] = theta_0 + (theta_1*df['U2']) + (theta_2*df['A']) + np.random.normal(0, 0.5, df.shape[0])
    
    df = df[['A', 'L', 'Y', 'U1', 'U2']]
    df_observed = df[['A', 'L', 'Y']].copy()
    
    return df, df_observed
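For reference, the logistic link used for A implies two intervention probabilities, one per level of U1. The short check below is an added sketch (not part of the simulation function); it simply re-evaluates 1/(1+exp(-x)) at the coefficients defined above, reusing the numpy import from the top of the notebook.
## implied P(A=1 | U1) under the logistic specification (added sketch)
lambda_0, lambda_1 = -2.32, 5.18
print(1 / (1 + np.exp(-lambda_0)))               ## P(A=1 | U1=0), roughly 0.089
print(1 / (1 + np.exp(-(lambda_0 + lambda_1))))  ## P(A=1 | U1=1), roughly 0.946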
#################################################################
## simulate dataset with no causal effect difference of A on Y ##
#################################################################
df, df_observed = simulate_dataset(n=1000000, A_on_Y=0)
#####################
## the "true" data ##
#####################
print(df.head(n=10))
   A  L         Y  U1  U2
0  0  0  5.247234   0   1
1  0  1  4.307400   0   1
2  0  1  4.864241   0   1
3  1  1  5.213704   1   1
4  1  1  4.334214   1   1
5  0  1  4.410975   0   1
6  1  1  5.623942   1   1
7  0  1 -0.863473   0   0
8  1  1 -0.259578   1   0
9  0  1  5.305411   1   1
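Because A_on_Y was set to 0 for this run, the structural equation for Y reduces to theta_0 + theta_1*U2 + noise, so mean Y should sit near -0.5 when U2=0 and near 5.28 when U2=1, with essentially no difference across A within either stratum. The group-by below is an added sanity check on the full (normally unobservable) dataframe, not part of the original analysis.
## sanity check (added sketch): mean Y by U2 and A in the full dataframe
print(df.groupby(['U2', 'A'])['Y'].mean())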
##############################################
## data we get to "observe" in our analysis ##
##############################################
print(df_observed.head(n=10))
   A  L         Y
0  0  0  5.247234
1  0  1  4.307400
2  0  1  4.864241
3  1  1  5.213704
4  1  1  4.334214
5  0  1  4.410975
6  1  1  5.623942
7  0  1 -0.863473
8  1  1 -0.259578
9  0  1  5.305411
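As a minimal illustration of what the observed columns alone support (an added sketch using the statsmodels formula API imported at the top; it is not necessarily the analysis that follows), the crude regression of Y on A estimates the unadjusted A-Y association, which should land near zero here because A_on_Y=0 and U1 was drawn independently of U2.
## crude (unadjusted) association of A with Y in the observed data (added sketch)
crude_model = smf.ols('Y ~ A', data=df_observed).fit()
print(crude_model.params)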