Commit

Merge pull request #7 from LPC-HH/collect_samples

Make signal filelists

rkansal47 authored Nov 4, 2024
2 parents c9ffcd1 + 4c64806 commit a99e249
Showing 16 changed files with 2,623 additions and 943 deletions.
18 changes: 1 addition & 17 deletions .github/workflows/ci.yml
@@ -17,30 +17,14 @@ env:
   FORCE_COLOR: 3

 jobs:
-  pre-commit:
-    name: Format
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.x"
-      - uses: pre-commit/[email protected]
-        with:
-          extra_args: --hook-stage manual --all-files
-      - name: Run PyLint
-        run: pipx run nox -s pylint -- --output-format=github

   checks:
     name: Check Python ${{ matrix.python-version }} on ${{ matrix.runs-on }}
     runs-on: ${{ matrix.runs-on }}
-    needs: [pre-commit]
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.13"]
+        python-version: ["3.8", "3.11"]
         runs-on: [ubuntu-latest, windows-latest, macos-14]

         include:
34 changes: 34 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,37 @@ Thumbs.db
# Common editor files
*~
*.swp


*.root
*.parquet
test.ipynb
**/__pycache__
**/.DS_Store
outfiles
plots
cards
docker_stderror
/*.jdl
/*.sh
/*.err
/*.out
/*.log
/condor
src/HH4b/postprocessing/**/*.png
src/HH4b/postprocessing/**/*.pdf
src/HH4b/boosted/**/*.png
src/HH4b/boosted/**/*.pdf
src/HH4b/boosted/**/*roc_dict.pkl
src/HH4b/boosted/**/*.pkl
**/*test.ipynb

running_jobs.txt

.bashrc
.condor_config
.local/

data/.sites_map.json

*.C
4 changes: 2 additions & 2 deletions README.md
@@ -46,7 +46,7 @@ First, create a virtual environment (`micromamba` is recommended):
 # Install: (the micromamba directory can end up taking O(1-10GB) so make sure the directory you're using allows that quota)
 "${SHELL}" <(curl -L micro.mamba.pm/install.sh)
 # You may need to restart your shell
-micromamba create -n hh python=3.10 -c conda-forge
+micromamba env create -f environment.yaml
 micromamba activate hh
 ```

@@ -95,4 +95,4 @@ Clone the repository:
 ```
 git clone https://github.com/LPC-HH/bbtautau/
 pip install -e .
-```
+```
59 changes: 6 additions & 53 deletions data/README.md
@@ -1,59 +1,12 @@
 # Making filelists

-Uses https://github.com/dmwm/DBSClient. Use CMSSW_11_2_0 or later, and run
-`pip3 install dbs3-client --user`.
-
-## NanoAOD versions
-
-PDMV recommendations:
-https://twiki.cern.ch/twiki/bin/viewauth/CMS/PdmVRun3Analysis
-
-```
-Campaign      CMSSW
---------------
-Run3Winter22  CMSSW_12_2_X  POG studies
-Run3Summer22  CMSSW_12_4_X  2022 data analysis
-```
-
-Some instructions on custom nano here:
-https://github.com/cms-jet/PFNano/tree/13_0_7_from124MiniAOD
-
-### Recipe for NanoAODv12
-
-```
-cmsrel CMSSW_13_1_0
-cd CMSSW_13_1_0/src
-eval `scram runtime -sh`
-scram b
-```
-
-#### For data:
-
-2023-Prompt
 e.g.

 ```bash
+python index_private_nano.py --users rkansal --samples HHbbtt
-# taken from: https://cmsweb.cern.ch/couchdb/reqmgr_config_cache/32c5d6d84a05232e68c9abd3937a291e/configFile
-cmsDriver.py --python_filename test_nanoTuples_data2023_PromptNanoAODv12_cfg.py --eventcontent NANOAOD --customise Configuration/DataProcessing/Utils.addMonitoring,PhysicsTools/NanoAOD/nano_cff.nanoL1TrigObjCustomize --datatier NANOAOD \
-    --fileout file:nano_data2023_PromptNanoAODv12.root \
-    --conditions 130X_dataRun3_Prompt_v3 --step NANO --scenario pp \
-    --filein /store/data/Run2023C/JetMET0/MINIAOD/PromptReco-v2/000/367/516/00000/056efdee-d563-4fdc-9d9c-6e9bf5833df7.root \
-    --era Run3 --nThreads 2 --no_exec --data -n 100
 ```

-2023-MC Run3Summer23:
-
+For full list of options:
 ```bash
+python index_private_nano.py -h
-# taken from https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_test/PPD-Run3Summer23NanoAODv12-00002
-cmsDriver.py --python_filename test_nanoTuples_Run3Summer23_PromptNanoAODv12_cfg.py --eventcontent NANOAOD --customise Configuration/DataProcessing/Utils.addMonitoring --datatier NANOAODSIM \
-    --fileout file:nano_mcRun3Summer23_NanoAODv12.root \
-    --conditions 130X_mcRun3_2023_realistic_v8 --step NANO --scenario pp \
-    --filein "dbs:/MinBias_TuneCP5_13p6TeV-pythia8/Run3Summer23MiniAODv4-NoPU_Pilot_130X_mcRun3_2023_realistic_v8-v2/MINIAODSIM" \
-    --era Run3_2023 --no_exec --mc -n 100
 ```
-
-## Cross sections
-
-Reference:
-https://xsdb-temp.app.cern.ch/xsdb/?columns=67108863&currentPage=0&pageSize=30&searchQuery=energy%3D13.6
-
-https://twiki.cern.ch/twiki/bin/viewauth/CMS/MATRIXCrossSectionsat13p6TeV
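The indexing command above writes one JSON file per year (the `data/index_*.json` files added in this commit), mapping sample → subsample → list of XRootD file URLs. A minimal sketch of producing and consuming such an index — the filename and entries below are illustrative placeholders, not taken from the real indexes:

```python
import json
from pathlib import Path

# Illustrative index contents, following the sample -> subsample -> [file URLs] layout
index = {
    "HHbbtt": {
        "GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV": [
            "root://cmseos.fnal.gov//store/user/example/0000/nano_1.root",
            "root://cmseos.fnal.gov//store/user/example/0000/nano_2.root",
        ]
    }
}
Path("index_2022.json").write_text(json.dumps(index, indent=4))

# Load it back and count files per (sample, subsample)
loaded = json.loads(Path("index_2022.json").read_text())
counts = {
    (sample, subsample): len(files)
    for sample, subsamples in loaded.items()
    for subsample, files in subsamples.items()
}
print(counts)
```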
426 changes: 426 additions & 0 deletions data/index_2022.json
452 changes: 452 additions & 0 deletions data/index_2022EE.json
587 changes: 587 additions & 0 deletions data/index_2023.json
444 changes: 444 additions & 0 deletions data/index_2023BPix.json

Large diffs are not rendered by default.
193 changes: 193 additions & 0 deletions data/index_private_nano.py
@@ -0,0 +1,193 @@
"""
Create a JSON list of files of privately produced NanoAOD files.
Author: Raghav Kansal
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from XRootD import client

from bbtautau import hh_vars, utils


def _dirlist(fs, path) -> list:
status, listing = fs.dirlist(str(path))
if not status.ok:
raise FileNotFoundError(f"Failed to list directory: {status}")

return [f.name for f in listing]


def xrootd_index_private_nano(
base_dir: str,
redirector: str = "root://cmseos.fnal.gov/",
users: list[str] = None,
years: list[str] = None,
samples: list[str] = None,
subsamples: list[str] = None,
files: dict[str] = None,
) -> list:
"""Recursively search for privately produced NanoAOD files via XRootD.
Can specify specific users, years, samples, and subsamples to search for;
otherwise, it will search for all by default.
Files are organized as:
MC:
......redirector.......|...............base_dir....................|..user.|year|sample|....................................subsample.......................................|
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/HHbbtt/GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_LHEweights_TuneCP5_13p6TeV_powheg-pythia8/
.............................f1...........................|.....f2......|.f3.|......
GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV/241028_235514/0000/*.root
Data:
TODO
"""
fs = client.FileSystem(redirector)
base_dir = Path(base_dir)

users = _dirlist(fs, base_dir) if users is None else users
years = hh_vars.years if years is None else years

if files is None:
files = {}

for user in users:
print(f"\t{user}")
for year in years:
print(f"\t\t{year}")
if year not in files:
files[year] = {}

ypath = base_dir / user / year
tsamples = _dirlist(fs, ypath) if samples is None else samples
for sample in tsamples:
if sample not in files[year]:
files[year][sample] = {}

print(f"\t\t\t{sample}")
spath = ypath / sample
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
for subsample in tsubsamples:
if subsample in files[year][sample]:
# Change to warning?
raise ValueError(f"Duplicate subsample found! {subsample}")

print(f"\t\t\t\t{subsample}")
sspath = spath / subsample
for f1 in _dirlist(fs, sspath):
f1path = sspath / f1
for f2 in _dirlist(fs, f1path):
f2path = f1path / f2
for f3 in _dirlist(fs, f2path):
f3path = f2path / f3
tfiles = [
f"{redirector}{f3path!s}/{f}"
for f in _dirlist(fs, f3path)
if f.endswith(".root")
]

files[year][sample][subsample] = tfiles
print(f"\t\t\t\t\t{len(tfiles)} files")

return files


def main():
# Set up argument parser
parser = argparse.ArgumentParser()

parser.add_argument(
"--out-name",
type=str,
default="index",
help="Output JSON name (year and .json will automatically be appended)",
)

utils.add_bool_arg(
parser, "append", "Append to existing JSON file versus overwriting it", default=True
)

parser.add_argument(
"--redirector",
type=str,
default="root://cmseos.fnal.gov/",
help="Base XRootD redirector",
)

parser.add_argument(
"--base-dir",
type=str,
default="/store/user/lpcdihiggsboost/NanoAOD_v12_ParT",
help="Base directory for XRootD search",
)

parser.add_argument(
"--users",
nargs="+",
type=str,
help="Which users' directories. By default searches all.",
default=None,
)

parser.add_argument(
"--years",
nargs="+",
type=str,
help="Which years to index. By default searches all.",
default=hh_vars.years,
)

parser.add_argument(
"--samples",
nargs="+",
type=str,
help="Which samples to index. By default searches all.",
default=None,
)

parser.add_argument(
"--subsamples",
nargs="+",
type=str,
help="Which subsamples to index. By default searches all.",
default=None,
)

args = parser.parse_args()

if args.append:
# check if output file exists for each year; if so, load and save to files dict.
files = {}
for year in args.years:
try:
with Path(f"{args.out_name}_{year}.json").open() as f:
files[year] = json.load(f)
except FileNotFoundError:
continue
else:
files = None

files = xrootd_index_private_nano(
args.base_dir,
args.redirector,
args.users,
args.years,
args.samples,
args.subsamples,
files,
)

# save files per year
for year in files:
with Path(f"{args.out_name}_{year}.json").open("w") as f:
json.dump(files[year], f, indent=4)


if __name__ == "__main__":
main()
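The recursive traversal above can be sketched against a local directory tree (standing in for the remote XRootD filesystem) to see how the fixed user/year/sample/subsample/f1/f2/f3 layout becomes the nested files dict. This uses `os.listdir` as a stand-in for `_dirlist`; the directory names are made up for illustration:

```python
import os
import tempfile
from pathlib import Path


def index_tree(base_dir: Path) -> dict:
    """Walk base/user/year/sample/subsample/f1/f2/f3/*.root into a nested dict,
    mirroring the xrootd_index_private_nano logic with os.listdir as a stand-in."""
    files: dict = {}
    for user in os.listdir(base_dir):
        for year in os.listdir(base_dir / user):
            files.setdefault(year, {})
            ypath = base_dir / user / year
            for sample in os.listdir(ypath):
                files[year].setdefault(sample, {})
                spath = ypath / sample
                for subsample in os.listdir(spath):
                    sspath = spath / subsample
                    # Descend the three fixed levels (dataset name / timestamp / chunk)
                    for f1 in os.listdir(sspath):
                        for f2 in os.listdir(sspath / f1):
                            for f3 in os.listdir(sspath / f1 / f2):
                                f3path = sspath / f1 / f2 / f3
                                files[year][sample][subsample] = [
                                    str(f3path / f)
                                    for f in os.listdir(f3path)
                                    if f.endswith(".root")
                                ]
    return files


# Build a toy tree and index it
tmp = Path(tempfile.mkdtemp())
leaf = tmp / "rkansal" / "2022" / "HHbbtt" / "GluGluHHto2B2Tau" / "GluGluHHto2B2Tau" / "241028_235514" / "0000"
leaf.mkdir(parents=True)
(leaf / "nano_1.root").touch()
(leaf / "nano_2.root").touch()

index = index_tree(tmp)
print(len(index["2022"]["HHbbtt"]["GluGluHHto2B2Tau"]))  # prints 2
```

Note the same quirk as the real script: if a subsample has multiple f1/f2/f3 chunks, each leaf's file list overwrites the last, so only the final chunk's files survive.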