Multiclass: plot fixes, full equalization, config files. (#850)

* Adjust paths in database and analysis type in yml. All steps for warm-up. * Rename output folder after preprocess merge * Rename output folder after test-corr merge * Rename output folder after all-plots merge * Rename output directories * Fixed class equalization. * Different directory for equalization results * Fix nclasses in databases * Deactivate redundant steps for comparison * Settings for running all comparison plots * Check also do cross val * Add model config for multiclass -- to be re-checked and corrected * Add database for multiclass * Disable cross val * Correct objective function for multiclass in config_model_parameters * Nitpick in database * Proper classification type in database * Several fixes in processer * Fixed class equalization. * Fixes for a complete successful run of multiclassifier * New database paths for binary classification comparison * Standard set of steps in default_complete * Fixes to ROC plots * Rename output folders before SHAP * Class labels and order in SHAP plots * Rename output folders after SHAP * Equalization debug * Fixed equalization, remove debugs * Renamed multiclass output folders * Remove local adjustments for PR * Remove leftover comment * Correct font size on OvO plot
alisw · Jan 17, 2024 · 478b97b · 478b97b
1 parent 2f22a49
commit 478b97b
Show file tree

Hide file tree

Showing 8 changed files with 569 additions and 102 deletions.
diff --git a/machine_learning_hep/data/config_model_parameters.yml b/machine_learning_hep/data/config_model_parameters.yml
@@ -19,7 +19,7 @@ BinaryClassification:
         max_features: 1
 
       grid_search:
-        params:     
+        params:
           n_estimators: [3, 10, 50, 100]
           max_features: [2,4,6,8]
           max_depth: [1,4]
@@ -58,7 +58,7 @@ BinaryClassification:
         gamma: 0.
         min_child_weight: 3
         subsample: 0.8
-        colsample_bytree: 0.8 
+        colsample_bytree: 0.8
         colsample_bynode: 1
         random_state: 0
         tree_method: 'hist'
@@ -77,6 +77,85 @@ BinaryClassification:
         scoring: ["AUC", "Accuracy"]
 
 
+MultiClassification:  # model settings for the multi-class case; same per-backend layout as the other top-level sections
+
+  keras:
+    keras_classifier:
+      activate: False  # presumably means this model is skipped -- confirm in the consuming code
+      layers:
+        - {"n_nodes": 12, "activation": "relu"}
+      optimizer: "adam"
+      loss: "binary_crossentropy"  # NOTE(review): binary loss under MultiClassification -- confirm against the model's output layer and label encoding
+      epochs: 30
+      batch_size: 50
+
+  scikit:
+    scikit_random_forest_classifier:
+      activate: False
+      central_params:
+        max_depth: 5
+        n_estimators: 10
+        max_features: 1
+
+      grid_search:
+        params:
+          n_estimators: [3, 10, 50, 100]
+          max_features: [2,4,6,8]
+          max_depth: [1,4]
+        refit: AUC  # NOTE(review): confirm the scorer behind "AUC" supports more than two classes
+        scoring: ["AUC", "Accuracy"]
+
+    scikit_adaboost_classifier:
+      activate: False
+      central_params: {}  # explicitly empty mapping; the keys listed below are commented out
+        #max_depth: 3       # 1 default
+        #n_estimators: 50   # 50 default
+        #learning_rate: 0.5 # 1 default
+
+      grid_search:
+        params:
+          n_estimators: [3, 10, 50, 100]
+          learning_rate: [0.1,0.5,0.9]
+        refit: AUC
+        scoring: ["AUC", "Accuracy"]
+
+    scikit_decision_tree_classifier:
+      activate: False
+      central_params:
+        max_depth: 5
+
+# not default parameters
+  xgboost:
+    xgboost_classifier:
+      activate: True  # the only entry in this section with activate: True
+      central_params:
+        max_depth: 3
+        learning_rate: 0.1
+        n_estimators: 850
+        objective: 'multi:softprob'  # XGBoost multi-class objective that yields per-class probabilities
+        n_jobs: 10
+        gamma: 0.
+        min_child_weight: 3
+        subsample: 0.8
+        colsample_bytree: 0.8
+        colsample_bynode: 1
+        random_state: 0
+        tree_method: 'hist'
+      #      early_stopping_rounds: 10
+      grid_search:
+        params:
+          min_child_weight: [3] #[1, 3]
+          max_depth: [2] #[3, 6]
+          gamma: [0.2]
+          subsample: [0.8] #[0.6, 0.8, 0.9]
+          colsample_bytree: [0.8] #[0.6, 0.8, 0.9]
+          learning_rate: [0.05, 0.15] #[0.05, 0.1, 0.5]
+          n_estimators: [200, 300] #[500, 800, 1000]
+          objective: ["multi:softprob"]  # same objective as central_params, wrapped in a one-element list
+        refit: AUC
+        scoring: ["AUC", "Accuracy"]
+
 Regression:
 
   scikit:

diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi.yml
@@ -236,8 +236,7 @@ LcpKpi:
       data: null
       mc: null
 
-    nbkg: 500000
-    nsig: 500000
+    nclasses: [500000, 500000]
     equalise_sig_bkg: True
     sampletags: [0, 1]
     sel_bkg: fM < 2.22 or fM > 2.35 # for plotting significance; should agree with bkg selection in sel_ml

diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_mult.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_mult.yml
@@ -236,8 +236,7 @@ LcpKpi:
       data: null
       mc: null
 
-    nbkg: 500000
-    nsig: 500000
+    nclasses: [500000, 500000]
     equalise_sig_bkg: True
     sampletags: [0, 1]
     sel_bkg: fM < 2.22 or fM > 2.35 # for plotting significance; should agree with bkg selection in sel_ml