-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_simulation_parameters_binary_X.R
223 lines (191 loc) · 12.3 KB
/
create_simulation_parameters_binary_X.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
source('./src/dependencies.R')
source('./src/estimation_functions.R')
# parameters for sequence of IF22
n_train <- 5000
n_oracle <- 1000000
degree_of_interactions <- c(1,2,3,4,5,6,7,8)
params_save_name <- '1,2,3,4,5,6,7,8'
set.seed(202848)
# parameters for true function
key_trt <- expand.grid(Z1=c(0,1), Z2=c(0,1), Z3=c(0,1), Z4=c(0,1)) %>%
{bind_cols(., val_trt=sample(0.05*1:nrow(.), replace=F))}
key_outcome <- expand.grid(Z1=c(0,1), Z2=c(0,1), Z3=c(0,1), Z4=c(0,1)) %>%
{bind_cols(., val_outcome=sample(0.05*1:nrow(.), replace=F))}
##########################################################
## simulate oracle data
##########################################################
sim_data_oracle <- data.frame(
Z1 = as.numeric(rbernoulli(n_oracle,p=0.5)), Z2 = as.numeric(rbernoulli(n_oracle,p=0.5)),
Z3 = as.numeric(rbernoulli(n_oracle,p=0.5)), Z4 = as.numeric(rbernoulli(n_oracle,p=0.5)),
Z5 = as.numeric(rbernoulli(n_oracle,p=0.5)), Z6 = as.numeric(rbernoulli(n_oracle,p=0.5)),
Z7 = as.numeric(rbernoulli(n_oracle,p=0.5)), Z8 = as.numeric(rbernoulli(n_oracle,p=0.5)),
Z9 = as.numeric(rbernoulli(n_oracle,p=0.5)), Z10 = as.numeric(rbernoulli(n_oracle,p=0.5))
) %>%
merge(., key_trt) %>%
merge(., key_outcome) %>%
mutate(
X = rbinom(n(), 1, val_trt),
Y = rbinom(n(), 1, val_outcome)
) %>% dplyr::select(-c(val_trt, val_outcome))
basis_oracle <- lapply(1:length(degree_of_interactions), function(k) {
create_binary_var_basis(
data = sim_data_oracle %>% dplyr::select(-c(X,Y)),
binary_vars = names(sim_data_oracle %>% dplyr::select(-c(X,Y))),
degree_of_interactions = degree_of_interactions[k]
)
})
sigma_oracle_eff1 <- c('oracle' = compute_sigma(basis = basis_oracle,
trt = sim_data_oracle$X))
sigma_oracle_eff0 <- c('oracle' = compute_sigma(basis = basis_oracle,
trt = 1-sim_data_oracle$X))
##########################################################
## simulate training data
##########################################################
sim_data_tr <- data.frame(
Z1 = as.numeric(rbernoulli(n_train,p=0.5)), Z2 = as.numeric(rbernoulli(n_train,p=0.5)),
Z3 = as.numeric(rbernoulli(n_train,p=0.5)), Z4 = as.numeric(rbernoulli(n_train,p=0.5)),
Z5 = as.numeric(rbernoulli(n_train,p=0.5)), Z6 = as.numeric(rbernoulli(n_train,p=0.5)),
Z7 = as.numeric(rbernoulli(n_train,p=0.5)), Z8 = as.numeric(rbernoulli(n_train,p=0.5)),
Z9 = as.numeric(rbernoulli(n_train,p=0.5)), Z10 = as.numeric(rbernoulli(n_train,p=0.5))
) %>%
merge(., key_trt) %>%
merge(., key_outcome) %>%
mutate(
X = rbinom(n(), 1, val_trt),
Y = rbinom(n(), 1, val_outcome)
) %>% dplyr::select(-c(val_trt, val_outcome))
basis_tr <- lapply(1:length(degree_of_interactions), function(k) {
create_binary_var_basis(
data = sim_data_tr %>% dplyr::select(-c(X,Y)),
binary_vars = names(sim_data_tr %>% dplyr::select(-c(X,Y))),
degree_of_interactions = degree_of_interactions[k]
)
})
sigma_tr_eff1 <- c('tr' = compute_sigma(basis = basis_tr,
trt = sim_data_tr$X),
'nlshrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
nlshrink_cov(sim_data_tr$X*basis_tr[[i]], k=1)
}),
'unequal_shrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
shrinkcovmat.unequal(t(sim_data_tr$X*basis_tr[[i]]), centered=T)$Sigmahat
}),
'equal_shrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
shrinkcovmat.equal(t(sim_data_tr$X*basis_tr[[i]]), centered=T)$Sigmahat
}),
'identity_shrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
shrinkcovmat.identity(t(sim_data_tr$X*basis_tr[[i]]), centered=T)$Sigmahat
}))
sigma_tr_eff0 <- c('tr' = compute_sigma(basis = basis_tr,
trt = 1-sim_data_tr$X),
'nlshrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
nlshrink_cov((1-sim_data_tr$X)*basis_tr[[i]], k=1)
}),
'unequal_shrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
shrinkcovmat.unequal(t((1-sim_data_tr$X)*basis_tr[[i]]), centered=T)$Sigmahat
}),
'equal_shrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
shrinkcovmat.equal(t((1-sim_data_tr$X)*basis_tr[[i]]), centered=T)$Sigmahat
}),
'identity_shrink' = lapply(1:length(c(degree_of_interactions)), function(i) {
shrinkcovmat.identity(t((1-sim_data_tr$X)*basis_tr[[i]]), centered=T)$Sigmahat
}))
##########################################################
## estimate nuisance parameter models using training data
##########################################################
params_boosted_tree_trt <- find_params_boosted_tree_model(covariates_df = sim_data_tr %>% dplyr::select(-c(Y, X)),
label_vector = sim_data_tr %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
tree_depth = c(1:5),
shrinkage_factor = seq(0.01,0.05,0.01),
num_trees = seq(250,500,50),
num_cores = 10)
params_random_forest_trt <- find_params_random_forest_model(covariates_df = sim_data_tr %>% dplyr::select(-c(Y, X)),
label_vector = sim_data_tr %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
num_trees = seq(500,1000,50),
num_vars = seq(1,5,1),
num_cores = 10)
params_knn_trt <- find_params_knn(covariates_df = sim_data_tr %>% dplyr::select(-c(Y)),
label_vector = sim_data_tr %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
k = seq(11,101,2),
num_cores = 10)
params_lasso_trt <- find_params_lasso(covariates_df = sim_data_tr %>% dplyr::select(-c(Y, X)),
label_vector = sim_data_tr %>% dplyr::select(X) %>% {.[[1]]},
binary_vars = names(sim_data_tr %>% dplyr::select(-c(X,Y))),
degree_of_interactions = 3,
nfold = 5)
params_glm_trt <- find_params_glm(binary_vars = names(sim_data_tr %>% dplyr::select(-c(X,Y))))
meta_model_trt <- fit_stacked_classifer_model(covariates_df = sim_data_tr %>% dplyr::select(-c(Y, X)),
label_vector = sim_data_tr %>% dplyr::select(X) %>% {.[[1]]},
params_boosted_tree = params_boosted_tree_trt,
params_random_forest = params_random_forest_trt,
params_knn = params_knn_trt,
params_lasso = params_lasso_trt,
params_glm = params_glm_trt,
num_spline_knots = 4,
alpha = 0, lambda=0)
params_boosted_tree_outcome <- find_params_boosted_tree_model(covariates_df = sim_data_tr %>% dplyr::select(-c(Y)),
label_vector = sim_data_tr %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
tree_depth = c(1:5),
shrinkage_factor = seq(0.01,0.05,0.01),
num_trees = seq(250,500,50),
num_cores = 10)
params_random_forest_outcome <- find_params_random_forest_model(covariates_df = sim_data_tr %>% dplyr::select(-c(Y)),
label_vector = sim_data_tr %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
num_trees = seq(500,1000,50),
num_vars = seq(1,5,1),
num_cores = 10)
params_knn_outcome <- find_params_knn(covariates_df = sim_data_tr %>% dplyr::select(-c(Y)),
label_vector = sim_data_tr %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
k = seq(11,101,2),
num_cores = 10)
params_lasso_outcome <- find_params_lasso(covariates_df = sim_data_tr %>% dplyr::select(-c(Y)),
label_vector = sim_data_tr %>% dplyr::select(Y) %>% {.[[1]]},
binary_vars = names(sim_data_tr %>% dplyr::select(-c(Y))),
degree_of_interactions = 3,
nfold = 5)
params_glm_outcome <- find_params_glm(binary_vars = names(sim_data_tr %>% dplyr::select(-c(Y))))
meta_model_outcome <- fit_stacked_classifer_model(covariates_df = sim_data_tr %>% dplyr::select(-c(Y)),
label_vector = sim_data_tr %>% dplyr::select(Y) %>% {.[[1]]},
params_boosted_tree = params_boosted_tree_outcome,
params_random_forest = params_random_forest_outcome,
params_knn = params_knn_outcome,
params_lasso = params_lasso_outcome,
params_glm = params_glm_outcome,
num_spline_knots = 4,
alpha = 0, lambda=0)
simulation_parameters_binary <- list(
n_oracle = n_oracle,
key_trt = key_trt,
key_outcome = key_outcome,
degree_of_interactions = degree_of_interactions,
sigma_oracle_eff1 = sigma_oracle_eff1,
sigma_oracle_eff0 = sigma_oracle_eff0,
sim_data_tr = sim_data_tr,
sigma_tr_eff1 = sigma_tr_eff1,
sigma_tr_eff0 = sigma_tr_eff0,
params_boosted_tree_trt = params_boosted_tree_trt,
params_random_forest_trt = params_random_forest_trt,
params_knn_trt = params_knn_trt,
params_glm_trt = params_glm_trt,
params_lasso_trt = params_lasso_trt,
meta_model_trt = meta_model_trt,
params_boosted_tree_outcome = params_boosted_tree_outcome,
params_random_forest_outcome = params_random_forest_outcome,
params_knn_outcome = params_knn_outcome,
params_lasso_outcome = params_lasso_outcome,
params_glm_outcome = params_glm_outcome,
meta_model_outcome = meta_model_outcome
)
save(simulation_parameters_binary, file=paste0("./params/simulation_parameters_binary_", params_save_name, ".RData"))
rm(n_train, n_oracle, degree_of_interactions, key_trt, key_outcome, params_save_name,
sim_data_oracle, sim_data_tr,
sigma_oracle_eff1, sigma_oracle_eff0,
sigma_tr_eff1, sigma_tr_eff0,
basis_oracle, basis_tr,
params_boosted_tree_trt, params_random_forest_trt, params_knn_trt, params_lasso_trt, params_glm_trt, meta_model_trt,
params_boosted_tree_outcome, params_random_forest_outcome, params_knn_outcome, params_lasso_outcome, params_glm_outcome, meta_model_outcome)