Prepare to compare AUC for BNN and linmod

evenmm · Apr 27, 2023 · b652ca8 · b652ca8
1 parent fd9b076
commit b652ca8
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 11 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,28 @@
+FROM ubuntu:20.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Europe/Berlin
+
+RUN apt update \
+    && apt-get install -y \
+    g++ \
+    cmake \
+    libatlas-base-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    swig \
+    git\
+    libhdf5-serial-dev\
+    && ln -sf /usr/bin/swig4.0 /usr/bin/swig
+
+RUN pip3 install python-libsbml>=5.17.0 numpy
+
+COPY container_files.tar.gz /tmp
+
+RUN pip3 install -U --upgrade pip wheel \
+    && mkdir -p /tmp/container_files/ \
+    && cd /tmp/container_files \
+    && tar -xzf ../container_files.tar.gz
+
+RUN pip3 install aesara arviz pymc==5.1.1 theano numpy matplotlib scipy pandas seaborn arviz jupyter
diff --git a/run_jobs.sh b/run_jobs.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 # Open multiple screens, send a command to each screen
 
-screen_names=("test0" "test1" "test2" "test3" "test4" "test5")
-job_names=("module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 python simdata_BNN.py 0\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 python simdata_BNN.py 1\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 python simdata_BNN.py 2\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 python simdata_BNN.py 3\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 python simdata_BNN.py 4\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 python simdata_BNN.py 5\n")
+screen_names=("x0" "x1" "x2" "x3" "x4" "x5")
+job_names=("module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 taskset -c 0-4 python simdata_BNN.py 0\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 taskset -c 5-9 python simdata_BNN.py 1\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 taskset -c 10-14 python simdata_BNN.py 2\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 taskset -c 15-19 python simdata_BNN.py 3\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 taskset -c 20-24 python simdata_BNN.py 4\n" "module purge\n module load GCC/11.2.0\n module load CUDA/11.3.1 \n module load OpenMPI/4.1.1-GCC-11.2.0\n module load Python/3.9.6-GCCcore-11.2.0\n nice -n 10 taskset -c 25-29 python simdata_BNN.py 5\n")
 number_of_jobs=${#screen_names[@]}
 
 for (( job=0; job < $number_of_jobs; job++ ))

diff --git a/simdata_BNN.py b/simdata_BNN.py
@@ -7,7 +7,8 @@
 rng = np.random.default_rng(RANDOM_SEED)
 print(f"Running on PyMC v{pm.__version__}")
 #SAVEDIR = "/data/evenmm/plots/"
-SAVEDIR = "./plots/Bayesian_estimates_simdata_BNN/"
+#SAVEDIR = "./plots/Bayesian_estimates_simdata_BNN/"
+SAVEDIR = "./"
 
 script_index = int(sys.argv[1]) 
 
@@ -65,13 +66,13 @@
 with neural_net_model:
     if ADADELTA: 
         print("------------------- ADADELTA INITIALIZATION -------------------")
-        advi_iterations = 50_000
+        advi_iterations = 100_000
         advi = pm.ADVI()
         tracker = pm.callbacks.Tracker(
             mean=advi.approx.mean.eval,  # callable that returns mean
             std=advi.approx.std.eval,  # callable that returns std
         )
-        approx = advi.fit(advi_iterations, obj_optimizer=pm.adadelta(), obj_n_mc=5, callbacks=[tracker])
+        approx = advi.fit(advi_iterations, obj_optimizer=pm.adadelta(), obj_n_mc=50, callbacks=[tracker])
         #approx = advi.fit(advi_iterations, obj_optimizer=pm.adagrad(), obj_n_mc=5, callbacks=[tracker])
 
         # Plot ELBO and trace
@@ -120,5 +121,8 @@
 plot_all_credible_intervals(idata, patient_dictionary, patient_dictionary_test, X_test, SAVEDIR, name, y_resolution, model_name=model_name, parameter_dictionary=parameter_dictionary, PLOT_PARAMETERS=True, parameter_dictionary_test=parameter_dictionary_test, PLOT_PARAMETERS_test=True, PLOT_TREATMENTS=False, MODEL_RANDOM_EFFECTS=MODEL_RANDOM_EFFECTS, CI_with_obs_noise=CI_with_obs_noise, PLOT_RESISTANT=True)
 print("Finished!")
 
-evaluation_time = 1000 # days
+# At how many days to we want to classify people into recurrence / not recurrence: 
+# Here we make sure that all patients have observations at that time by taking the latest time where every patient has an observation
+evaluation_time = measurement_times[:3][-1] + 1
+print(evaluation_time)
 pfs_auc(evaluation_time, patient_dictionary_test, N_patients_test, idata, X_test, model_name, MODEL_RANDOM_EFFECTS, CI_with_obs_noise, SAVEDIR, name)
diff --git a/utilities.py b/utilities.py
@@ -1303,7 +1303,6 @@ def predict_PFS(args): # Predicts observations of M protein
     return p_recurrence #predicted_PFS, point_predicted_PFS
 
 def pfs_auc(evaluation_time, patient_dictionary_test, N_patients_test, idata, X_test, model_name, MODEL_RANDOM_EFFECTS, CI_with_obs_noise, SAVEDIR, name):
-    import sklearn.metrics as metrics
     N_patients_test = len(patient_dictionary_test)
 
     recurrence_or_not = np.zeros(N_patients_test)
@@ -1318,10 +1317,7 @@ def pfs_auc(evaluation_time, patient_dictionary_test, N_patients_test, idata, X_
     sample_shape = idata.posterior['psi'].shape # [chain, n_samples, dim]
     N_samples = sample_shape[1]
     # Posterior predictive CI for test data
-    if N_samples <= 10:
-        N_rand_eff_pred = 100 # Number of random intercept samples to draw for each idata sample when we make predictions
-        N_rand_obs_pred = 100 # Number of observation noise samples to draw for each parameter sample 
-    elif N_samples <= 100:
+    if N_samples <= 100:
         N_rand_eff_pred = 10 # Number of random intercept samples to draw for each idata sample when we make predictions
         N_rand_obs_pred = 100 # Number of observation noise samples to draw for each parameter sample 
     elif N_samples <= 1000:
@@ -1336,8 +1332,22 @@ def pfs_auc(evaluation_time, patient_dictionary_test, N_patients_test, idata, X_
     p_recurrence = np.zeros(N_patients_test) 
     for ii, elem in enumerate(args):
         p_recurrence[ii] = predict_PFS(elem)
+
+    print("recurrence_or_not", recurrence_or_not)
+    print("p_recurrence", p_recurrence)
+
+    picklefile = open(SAVEDIR+name+'_recurrence_or_not', 'wb')
+    pickle.dump(recurrence_or_not, picklefile)
+    picklefile.close()
 
+    picklefile = open(SAVEDIR+name+'_p_recurrence', 'wb')
+    pickle.dump(p_recurrence, picklefile)
+    picklefile.close()
+
+    """
+    # Commented out only because med-biostat2 does not have sklearn 
     # calculate the fpr and tpr for all thresholds of the classification
+    import sklearn.metrics as metrics
     fpr, tpr, threshold = metrics.roc_curve(recurrence_or_not, p_recurrence) #(y_test, preds)
     roc_auc = metrics.auc(fpr, tpr)
 
@@ -1357,6 +1367,7 @@ def pfs_auc(evaluation_time, patient_dictionary_test, N_patients_test, idata, X_
     plt.savefig(SAVEDIR+"AUC_"+str(N_patients_test)+"_test_patients_"+name+".pdf")
     #plt.show()
     plt.close()
+    """
 
 def plot_all_credible_intervals(idata, patient_dictionary, patient_dictionary_test, X_test, SAVEDIR, name, y_resolution, model_name, parameter_dictionary, PLOT_PARAMETERS, parameter_dictionary_test, PLOT_PARAMETERS_test, PLOT_TREATMENTS, MODEL_RANDOM_EFFECTS, CI_with_obs_noise=True, PARALLELLIZE=True, PLOT_RESISTANT=True, PLOT_MEASUREMENTS_test=False):
     sample_shape = idata.posterior['psi'].shape # [chain, n_samples, dim]