Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
######################
## import libraries ##
######################
import numpy as np
np.random.seed(1081565)
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
def simulate_dataset(n=2000000, A_on_Y=0):
    """Simulate a causal dataset with confounding and informative censoring.

    Structural model (each binary variable is Bernoulli with a logistic
    probability; Y is continuous):
      Z:  "randomized" binary intervention assignment (0, 1)
      A:  observed binary intervention, caused by Z and U1
      L:  binary determinant of censoring, caused by A and U2
      U1: binary unmeasured common cause of A and Y
      U2: binary unmeasured common cause of L and Y
      C:  binary censoring indicator, caused by L
      Y:  continuous outcome, caused by U1, U2, and A

    Parameters
    ----------
    n : int
        Number of simulated units.
    A_on_Y : float
        True causal effect of A on Y (coefficient theta_2); 0 simulates
        the sharp null.

    Returns
    -------
    (df, df_observed) : tuple of pandas.DataFrame
        ``df`` contains every variable including the unmeasured U1/U2;
        ``df_observed`` keeps only (Z, A, L, C, Y) with Y set to NaN
        wherever the unit is censored (C == 1).
    """
    def _expit(x):
        ## inverse-logit: maps a linear predictor to a probability
        return 1.0 / (1.0 + np.exp(-x))

    df = pd.DataFrame()

    ## exogenous binaries: P(var = 1) = 1 - split
    Z_split, U1_split, U2_split = 0.4, 0.52, 0.23
    df['Z'] = np.random.choice([0, 1], size=n, p=[Z_split, 1 - Z_split])
    df['U1'] = np.random.choice([0, 1], size=n, p=[U1_split, 1 - U1_split])
    df['U2'] = np.random.choice([0, 1], size=n, p=[U2_split, 1 - U2_split])

    ## A ~ Bernoulli(expit(lambda_0 + lambda_1*Z + lambda_2*U1));
    ## vectorized form of the four Z/U1 subgroup draws
    lambda_0, lambda_1, lambda_2 = -5.2, 9.3, 3.45
    df['A'] = np.random.binomial(
        1, _expit(lambda_0 + lambda_1 * df['Z'] + lambda_2 * df['U1']))

    ## L ~ Bernoulli(expit(Beta_0 + Beta_1*A + Beta_2*U2))
    Beta_0, Beta_1, Beta_2 = -0.52, 2.32, 1.98
    df['L'] = np.random.binomial(
        1, _expit(Beta_0 + Beta_1 * df['A'] + Beta_2 * df['U2']))

    ## C ~ Bernoulli(expit(psi_0 + psi_1*L)); psi_1 < 0 makes L == 1
    ## strongly protective against censoring
    psi_0, psi_1 = 0.15, -5.8
    df['C'] = np.random.binomial(1, _expit(psi_0 + psi_1 * df['L']))

    ## Y: continuous outcome; theta_2 (= A_on_Y) is the causal effect of A
    theta_0, theta_1, theta_3 = -0.5, 5.78, 1.42
    df['Y'] = (theta_0 + theta_1 * df['U2'] + A_on_Y * df['A']
               + theta_3 * df['U1'] + np.random.normal(0, 0.5, n))

    ## observed data: drop unmeasured U1/U2 and censor Y where C == 1
    df_observed = df[['Z', 'A', 'L', 'C', 'Y']].copy()
    df_observed.loc[df['C'] == 1, 'Y'] = np.nan
    return df, df_observed
## ----------------------------------------------------------------
## Simulate a dataset in which A has no causal effect on Y
## (sharp null: A_on_Y = 0).
## ----------------------------------------------------------------
df, df_observed = simulate_dataset(n=2000000, A_on_Y=0)

## The "true" simulated data, including the unmeasured U1 and U2:
print(df.head(10))
Z U1 U2 A L C Y
0 0 0 1 0 1 0 4.957074
1 1 0 1 1 1 0 5.717914
2 1 1 0 1 1 0 0.795860
3 0 1 1 0 0 1 6.636585
4 0 0 1 0 1 0 5.645637
5 0 1 1 0 1 0 7.528026
6 0 0 0 0 1 0 -0.839650
7 1 1 1 1 1 0 6.888819
8 1 0 0 1 1 0 -0.576333
9 1 0 0 1 0 1 -0.747018
## The data we actually get to "observe" in the analysis
## (U1/U2 hidden; Y censored to NaN wherever C == 1):
print(df_observed.head(10))
Z A L C Y
0 0 0 1 0 4.957074
1 1 1 1 0 5.717914
2 1 1 1 0 0.795860
3 0 0 0 1 NaN
4 0 0 1 0 5.645637
5 0 0 1 0 7.528026
6 0 0 1 0 -0.839650
7 1 1 1 0 6.888819
8 1 1 1 0 -0.576333
9 1 1 0 1 NaN