Skip to content

Commit

Permalink
Prepare for Run 3 analysis (#816)
Browse files Browse the repository at this point in the history
* Prepare for run 3 analysis incl. HF jets

* Make pylint happier
  • Loading branch information
qgp authored Aug 4, 2023
1 parent 98110e4 commit ddb5106
Show file tree
Hide file tree
Showing 12 changed files with 952 additions and 118 deletions.
1 change: 0 additions & 1 deletion machine_learning_hep/analysis/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from machine_learning_hep.workflow.workflow_base import WorkflowBase
from machine_learning_hep.io import dump_yaml_from_dict


class Analyzer(WorkflowBase):
def __init__(self, datap, case, typean, period):
super().__init__(datap, case, typean, period)
Expand Down
57 changes: 57 additions & 0 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#############################################################################
## © Copyright CERN 2023. All rights not expressly granted are reserved. ##
## ##
## This program is free software: you can redistribute it and/or modify it ##
## under the terms of the GNU General Public License as published by the ##
## Free Software Foundation, either version 3 of the License, or (at your ##
## option) any later version. This program is distributed in the hope that ##
## it will be useful, but WITHOUT ANY WARRANTY; without even the implied ##
## warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ##
## See the GNU General Public License for more details. ##
## You should have received a copy of the GNU General Public License ##
## along with this program. If not, see <https://www.gnu.org/licenses/>. ##
#############################################################################

import os
import munch # pylint: disable=import-error, no-name-in-module
from ROOT import TFile # pylint: disable=import-error, no-name-in-module

from machine_learning_hep.analysis.analyzer import Analyzer

class AnalyzerJets(Analyzer):
    """Analyzer for heavy-flavour jet analyses (Run 3 preparation).

    Resolves the input/output directories and file paths for the given
    analysis configuration and provides basic quality-assurance checks
    on the processor output.
    """
    # identifier used by the workflow machinery to select this analyzer type
    species = "analyzer"

    def __init__(self, datap, case, typean, period):
        """Set up configuration, directories and input-file paths.

        Args:
            datap: full analysis parameter dictionary (parsed database YAML)
            case: analysis case name
            typean: analysis type key into datap["analysis"]
            period: data-taking period index, or None for the merged
                all-period results
        """
        super().__init__(datap, case, typean, period)

        # attribute-style access to the configuration; cfg.ana points at
        # the section for this specific analysis type
        self.cfg = munch.munchify(datap)
        self.cfg.ana = munch.munchify(datap).analysis[typean]

        # output directories (per-period, or merged when period is None)
        self.d_resultsallpmc = datap["analysis"][typean]["mc"]["results"][period] \
            if period is not None else datap["analysis"][typean]["mc"]["resultsallp"]
        self.d_resultsallpdata = datap["analysis"][typean]["data"]["results"][period] \
            if period is not None else datap["analysis"][typean]["data"]["resultsallp"]

        # input directories (processor output); currently identical to the
        # output directories
        self.d_resultsallpmc_proc = self.d_resultsallpmc
        self.d_resultsallpdata_proc = self.d_resultsallpdata

        # input files: invariant-mass histograms (data and MC),
        # efficiency file and response file (MC only)
        n_filemass_name = datap["files_names"]["histofilename"]
        self.n_filemass = os.path.join(self.d_resultsallpdata_proc, n_filemass_name)
        self.n_filemass_mc = os.path.join(self.d_resultsallpmc_proc, n_filemass_name)
        self.n_fileeff = os.path.join(self.d_resultsallpmc_proc,
                                      datap["files_names"]["efffilename"])
        self.n_fileresp = os.path.join(self.d_resultsallpmc_proc,
                                       datap["files_names"]["respfilename"])

    def qa(self): # pylint: disable=too-many-branches, too-many-locals
        """Run basic quality assurance on the data mass file.

        Reads the event-normalisation histogram and reports the number of
        selected events.
        """
        self.logger.info("Running D0 jet qa")

        with TFile(self.n_filemass) as rfile:
            histonorm = rfile.Get("histonorm")
            if not histonorm:
                self.logger.critical('histonorm not found')
                # bail out: calling GetBinContent on a null ROOT object
                # below would crash if the logger does not abort here
                return
            p_nevents = histonorm.GetBinContent(1)
            self.logger.debug(f'Number of selected event: {p_nevents}')
8 changes: 3 additions & 5 deletions machine_learning_hep/analysis/analyzer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def initialize(self):
self.is_initialized = True


def analyze(self, *ana_steps):
def analyze(self, ana_steps):
"""
    Given a list of analyzers and analysis steps, do each step for each analyzer
Args:
Expand All @@ -96,10 +96,8 @@ def analyze(self, *ana_steps):

self.initialize()

self.logger.info("Run all registered analyzers of type %s for following analysis steps",
self.ana_class.__name__)
for step in ana_steps:
print(f" -> {step}")
self.logger.info("Run all registered analyzers of type %s for following analysis steps: %s",
self.ana_class.__name__, ana_steps)

# Collect potentially failed systematic steps
failed_steps = []
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ LcpKpi:
xlim:
- 0
- 0.0001

files_names:
namefile_unmerged_tree: AnalysisResults_trees.root
namefile_reco: AnalysisResultsReco.pkl
Expand Down Expand Up @@ -196,6 +197,7 @@ LcpKpi:
pkl_skimmed_merge_for_ml_all: /data2/MLhep/prod_LHC22b1b_MC/mltotmc
pkl_evtcounter_all: /data2/MLhep/prod_LHC22b1b_MC/evttotmc
mcreweights: [../Analyses]

ml:
evtsel: null
triggersel:
Expand Down
10 changes: 5 additions & 5 deletions machine_learning_hep/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ class MLLoggerFormatter(logging.Formatter):

level_map = {
logging.DEBUG: (None, 'blue', False),
logging.INFO: (None, 'black', False),
logging.INFO: (None, 'green', False),
logging.WARNING: (None, 'yellow', False),
logging.ERROR: (None, 'red', False),
logging.ERROR: (None, 'orange', False),
logging.CRITICAL: ('red', 'white', True),
}
csi = '\x1b['
reset = '\x1b[0m'

# Define default format string
def __init__(self, fmt='%(levelname)s in %(pathname)s:%(lineno)d:\n%(message)s',
def __init__(self, fmt='%(levelname)s in %(pathname)s:%(lineno)d:\n%(message)s',
datefmt=None, style='%', color=False):
logging.Formatter.__init__(self, fmt, datefmt, style)
self.color = color
Expand Down Expand Up @@ -102,7 +102,7 @@ def configure_logger(debug, logfile=None):
logger.setLevel(logging.INFO)

sh = logging.StreamHandler()
formatter = MLLoggerFormatter(color=lambda : getattr(sh.stream, 'isatty', None)) # pylint: disable=C0326
formatter = MLLoggerFormatter(color=lambda : getattr(sh.stream, 'isatty', None))

sh.setFormatter(formatter)
logger.addHandler(sh)
Expand All @@ -123,5 +123,5 @@ def get_logger():
"""
Get the global logger for this package and set handler together with formatters.
"""
configure_logger(False, None)
# configure_logger(False, None)
return logging.getLogger("MachinelearningHEP")
5 changes: 5 additions & 0 deletions machine_learning_hep/multiprocesser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@
from machine_learning_hep.processer import Processer # pylint: disable=unused-import
from machine_learning_hep.utilities import merge_method, mergerootfiles, get_timestamp_string
from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict
from machine_learning_hep.logger import get_logger

class MultiProcesser: # pylint: disable=too-many-instance-attributes, too-many-statements
species = "multiprocesser"
logger = get_logger()

def __init__(self, case, proc_class, datap, typean, run_param, mcordata):
self.case = case
self.datap = datap
Expand Down Expand Up @@ -174,6 +178,7 @@ def multi_histomass(self):
if self.p_useperiod[indexp] == 1:
self.process_listsample[indexp].process_histomass()
tmp_merged = f"/data/tmp/hadd/{self.case}_{self.typean}/mass/{get_timestamp_string()}/"
self.logger.debug('merging all')
mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged)

def multi_efficiency(self):
Expand Down
Loading

0 comments on commit ddb5106

Please sign in to comment.