-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_parameters_CRC_real_data.R
344 lines (295 loc) · 21.2 KB
/
create_parameters_CRC_real_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
source('./src/dependencies.R')
source('./src/estimation_functions.R')
# parameters for sequence of IF22
K <- c(2,8,32,128,256,512,1024,2,6,10)
degree_of_interactions <- c(1,1,1,1,1,1,1,2,2,2)
polynomial_degree <- c(1,1,1,1,1,1,1,1,1,1)
params_save_name <- '2,8,32,128,256,512,1024,2,6,10_1,1,1,1,1,1,1,2,2,2'
##########################################################
## load data and create basis function
##########################################################
colon_ncdb <- read.csv('./colon_ncdb.csv') %>%
mutate(X = surg_approach_open,
Y = death_90) %>% dplyr::select(-c(surg_approach_open, death_90))
set.seed(202848)
colon_ncdb_split <- split_data (colon_ncdb, 2)
colon_ncdb_1 <- colon_ncdb_split[[1]]
colon_ncdb_2 <- colon_ncdb_split[[2]]
##########################################################
## Evaluate b-spline basis functions
##########################################################
knots <- lapply(1:length(K), function(k) {
list(
attr(bs(colon_ncdb$distance, degree=polynomial_degree[k], df=K[k]), "knots"),
attr(bs(colon_ncdb$age, degree=polynomial_degree[k], df=K[k]), "knots"),
attr(bs(colon_ncdb$tumor_size, degree=polynomial_degree[k], df=K[k]), "knots"),
attr(bs(colon_ncdb$days_from_dx_to_def_surg, degree=polynomial_degree[k], df=K[k]), "knots")
)
})
boundary_knots <- lapply(1:length(K), function(k) {
list(
attr(bs(colon_ncdb$distance, degree=polynomial_degree[k], df=K[k]), "Boundary.knots"),
attr(bs(colon_ncdb$age, degree=polynomial_degree[k], df=K[k]), "Boundary.knots"),
attr(bs(colon_ncdb$tumor_size, degree=polynomial_degree[k], df=K[k]), "Boundary.knots"),
attr(bs(colon_ncdb$days_from_dx_to_def_surg, degree=polynomial_degree[k], df=K[k]), "Boundary.knots")
)
})
###########
## split 1
###########
basis_1 <- lapply(1:length(K), function(k) {
create_b_spline_basis(
data = colon_ncdb_1,
continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_1)[!(names(colon_ncdb_1) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'X', 'Y'))],
knots = knots[[k]], boundary_knots = boundary_knots[[k]],
degree_of_interactions = degree_of_interactions[k],
polynomial_degree = polynomial_degree[k]
)
})
sigma_1_eff1 <- c(compute_sigma(basis = basis_1,
trt = colon_ncdb_1$X),
lapply(1:length(K), function(i) {
nlshrink_cov(colon_ncdb_1$X*basis_1[[i]], k=1)
}))
sigma_1_eff0 <- c(compute_sigma(basis = basis_1,
trt = 1-colon_ncdb_1$X),
lapply(1:length(K), function(i) {
nlshrink_cov((1-colon_ncdb_1$X)*basis_1[[i]], k=1)
}))
###########
## split 2
###########
basis_2 <- lapply(1:length(K), function(k) {
create_b_spline_basis(
data = colon_ncdb_2,
continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_2)[!(names(colon_ncdb_2) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'X', 'Y'))],
knots = knots[[k]], boundary_knots = boundary_knots[[k]],
degree_of_interactions = degree_of_interactions[k],
polynomial_degree = polynomial_degree[k]
)
})
sigma_2_eff1 <- c(compute_sigma(basis = basis_2,
trt = colon_ncdb_2$X),
lapply(1:length(K), function(i) {
nlshrink_cov(colon_ncdb_2$X*basis_2[[i]], k=1)
}))
sigma_2_eff0 <- c(compute_sigma(basis = basis_2,
trt = 1-colon_ncdb_2$X),
lapply(1:length(K), function(i) {
nlshrink_cov((1-colon_ncdb_2$X)*basis_2[[i]], k=1)
}))
##########################################################
## estimate nuisance parameter models using training data
##########################################################
###########
## split 1
###########
params_boosted_tree_trt_1 <- find_params_boosted_tree_model(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_1 %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
tree_depth = c(1:5),
shrinkage_factor = seq(0.01,0.05,0.01),
num_trees = seq(250,500,50),
num_cores = 10)
params_random_forest_trt_1 <- find_params_random_forest_model(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_1 %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
num_trees = seq(500,1000,50),
num_vars = seq(1,5,1),
num_cores = 10)
params_knn_trt_1 <- find_params_knn(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_1 %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
k = seq(11,101,2),
num_cores = 10)
params_lasso_trt_1 <- find_params_lasso(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_1 %>% dplyr::select(X) %>% {.[[1]]},
continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_1)[!(names(colon_ncdb_1) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'X', 'Y'))],
continuous_var_spline_knots = 10,
degree_of_interactions = 1,
nfold = 5)
params_glm_trt_1 <- find_params_glm(continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_1)[!(names(colon_ncdb_1) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'X', 'Y'))],
continuous_var_spline_knots = 2)
meta_model_trt_1 <- fit_stacked_classifer_model(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_1 %>% dplyr::select(X) %>% {.[[1]]},
params_boosted_tree = params_boosted_tree_trt_1,
params_random_forest = params_random_forest_trt_1,
params_knn = params_knn_trt_1,
params_lasso = params_lasso_trt_1,
params_glm = params_glm_trt_1,
num_spline_knots = 4,
alpha = 0, lambda=0)
params_boosted_tree_outcome_1 <- find_params_boosted_tree_model(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_1 %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
tree_depth = c(1:5),
shrinkage_factor = seq(0.01,0.05,0.01),
num_trees = seq(250,500,50),
num_cores = 10)
params_random_forest_outcome_1 <- find_params_random_forest_model(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_1 %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
num_trees = seq(500,1000,50),
num_vars = seq(1,5,1),
num_cores = 10)
params_knn_outcome_1 <- find_params_knn(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_1 %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
k = seq(11,101,2),
num_cores = 10)
params_lasso_outcome_1 <- find_params_lasso(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_1 %>% dplyr::select(Y) %>% {.[[1]]},
continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_1)[!(names(colon_ncdb_1) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'Y'))],
continuous_var_spline_knots = 10,
degree_of_interactions = 1,
nfold = 5)
params_glm_outcome_1 <- find_params_glm(continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_1)[!(names(colon_ncdb_1) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'Y'))],
continuous_var_spline_knots = 2)
meta_model_outcome_1 <- fit_stacked_classifer_model(covariates_df = colon_ncdb_1 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_1 %>% dplyr::select(Y) %>% {.[[1]]},
params_boosted_tree = params_boosted_tree_outcome_1,
params_random_forest = params_random_forest_outcome_1,
params_knn = params_knn_outcome_1,
params_lasso = params_lasso_outcome_1,
params_glm = params_glm_outcome_1,
num_spline_knots = 4,
alpha = 0, lambda=0)
###########
## split 2
###########
params_boosted_tree_trt_2 <- find_params_boosted_tree_model(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_2 %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
tree_depth = c(1:5),
shrinkage_factor = seq(0.01,0.05,0.01),
num_trees = seq(250,500,50),
num_cores = 10)
params_random_forest_trt_2 <- find_params_random_forest_model(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_2 %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
num_trees = seq(500,1000,50),
num_vars = seq(1,5,1),
num_cores = 10)
params_knn_trt_2 <- find_params_knn(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_2 %>% dplyr::select(X) %>% {.[[1]]},
nfold = 5,
k = seq(11,101,2),
num_cores = 10)
params_lasso_trt_2 <- find_params_lasso(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_2 %>% dplyr::select(X) %>% {.[[1]]},
continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_2)[!(names(colon_ncdb_2) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'X', 'Y'))],
continuous_var_spline_knots = 10,
degree_of_interactions = 1,
nfold = 5)
params_glm_trt_2 <- find_params_glm(continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_2)[!(names(colon_ncdb_2) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'X', 'Y'))],
continuous_var_spline_knots = 2)
meta_model_trt_2 <- fit_stacked_classifer_model(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y, X)),
label_vector = colon_ncdb_2 %>% dplyr::select(X) %>% {.[[1]]},
params_boosted_tree = params_boosted_tree_trt_2,
params_random_forest = params_random_forest_trt_2,
params_knn = params_knn_trt_2,
params_lasso = params_lasso_trt_2,
params_glm = params_glm_trt_2,
num_spline_knots = 4,
alpha = 0, lambda=0)
params_boosted_tree_outcome_2 <- find_params_boosted_tree_model(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_2 %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
tree_depth = c(1:5),
shrinkage_factor = seq(0.01,0.05,0.01),
num_trees = seq(250,500,50),
num_cores = 10)
params_random_forest_outcome_2 <- find_params_random_forest_model(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_2 %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
num_trees = seq(500,1000,50),
num_vars = seq(1,5,1),
num_cores = 10)
params_knn_outcome_2 <- find_params_knn(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_2 %>% dplyr::select(Y) %>% {.[[1]]},
nfold = 5,
k = seq(11,101,2),
num_cores = 10)
params_lasso_outcome_2 <- find_params_lasso(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_2 %>% dplyr::select(Y) %>% {.[[1]]},
continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_2)[!(names(colon_ncdb_2) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'Y'))],
continuous_var_spline_knots = 10,
degree_of_interactions = 1,
nfold = 5)
params_glm_outcome_2 <- find_params_glm(continuous_vars = c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg'),
binary_vars = names(colon_ncdb_2)[!(names(colon_ncdb_2) %in%
c('distance', 'age', 'tumor_size', 'days_from_dx_to_def_surg', 'Y'))],
continuous_var_spline_knots = 2)
meta_model_outcome_2 <- fit_stacked_classifer_model(covariates_df = colon_ncdb_2 %>% dplyr::select(-c(Y)),
label_vector = colon_ncdb_2 %>% dplyr::select(Y) %>% {.[[1]]},
params_boosted_tree = params_boosted_tree_outcome_2,
params_random_forest = params_random_forest_outcome_2,
params_knn = params_knn_outcome_2,
params_lasso = params_lasso_outcome_2,
params_glm = params_glm_outcome_2,
num_spline_knots = 4,
alpha = 0, lambda=0)
##########################################################
## save parameters
##########################################################
parameters_CRC_real_data <- list(
K = K,
knots = knots,
boundary_knots = boundary_knots,
degree_of_interactions = degree_of_interactions,
polynomial_degree = polynomial_degree,
sigma_1_eff1 = sigma_1_eff1,
sigma_1_eff0 = sigma_1_eff0,
sigma_2_eff1 = sigma_2_eff1,
sigma_2_eff0 = sigma_2_eff0,
params_boosted_tree_trt_1 = params_boosted_tree_trt_1,
params_random_forest_trt_1 = params_random_forest_trt_1,
params_knn_trt_1 = params_knn_trt_1,
params_lasso_trt_1 = params_lasso_trt_1,
params_glm_trt_1 = params_glm_trt_1,
meta_model_trt_1 = meta_model_trt_1,
params_boosted_tree_outcome_1 = params_boosted_tree_outcome_1,
params_random_forest_outcome_1 = params_random_forest_outcome_1,
params_knn_outcome_1 = params_knn_outcome_1,
params_lasso_outcome_1 = params_lasso_outcome_1,
params_glm_outcome_1 = params_glm_outcome_1,
meta_model_outcome_1 = meta_model_outcome_1,
params_boosted_tree_trt_2 = params_boosted_tree_trt_2,
params_random_forest_trt_2 = params_random_forest_trt_2,
params_knn_trt_2 = params_knn_trt_2,
params_lasso_trt_2 = params_lasso_trt_2,
params_glm_trt_2 = params_glm_trt_2,
meta_model_trt_2 = meta_model_trt_2,
params_boosted_tree_outcome_2 = params_boosted_tree_outcome_2,
params_random_forest_outcome_2 = params_random_forest_outcome_2,
params_knn_outcome_2 = params_knn_outcome_2,
params_lasso_outcome_2 = params_lasso_outcome_2,
params_glm_outcome_2 = params_glm_outcome_2,
meta_model_outcome_2 = meta_model_outcome_2
)
save(parameters_CRC_real_data, file=paste0("./params/parameters_CRC_real_data_", params_save_name, ".RData"))
rm(K, degree_of_interactions, polynomial_degree, params_save_name, colon_ncdb, colon_ncdb_split, colon_ncdb_1, colon_ncdb_2,
knots, boundary_knots, basis_1, basis_2,
sigma_1_eff1, sigma_1_eff0, sigma_2_eff1, sigma_2_eff0,
params_boosted_tree_trt_1, params_random_forest_trt_1, params_knn_trt_1, params_lasso_trt_1, params_glm_trt_1, meta_model_trt_1,
params_boosted_tree_outcome_1, params_random_forest_outcome_1, params_knn_outcome_1, params_lasso_outcome_1, params_glm_outcome_1, meta_model_outcome_1,
params_boosted_tree_trt_2, params_random_forest_trt_2, params_knn_trt_2, params_lasso_trt_2, params_glm_trt_2, meta_model_trt_2,
params_boosted_tree_outcome_2, params_random_forest_outcome_2, params_knn_outcome_2, params_lasso_outcome_2, params_glm_outcome_2, meta_model_outcome_2)