From 95ec94d84db4e7fb10e2cc8635c864e781208298 Mon Sep 17 00:00:00 2001
From: Allan Bolipata <allan.bolipata@gmail.com>
Date: Fri, 7 Feb 2025 13:33:12 -0500
Subject: [PATCH 01/16] nucleo sv progress

---
 runner/operator/access/__init__.py            |  73 +++++++---
 .../v2_1_0/structural_variants/__init__.py    | 126 ++++++++++++++++++
 .../input_template.json.jinja2                |  20 +++
 3 files changed, 204 insertions(+), 15 deletions(-)
 mode change 100644 => 100755 runner/operator/access/__init__.py
 create mode 100755 runner/operator/access/v2_1_0/structural_variants/__init__.py
 create mode 100755 runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2

diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py
old mode 100644
new mode 100755
index 2ffa3f994..77a3d6f7f
--- a/runner/operator/access/__init__.py
+++ b/runner/operator/access/__init__.py
@@ -29,29 +29,35 @@ def get_request_id(run_ids, request_id=None):
     raise Exception("Could not get find request id")
 
 
-def get_request_id_runs(request_id):
+def get_request_id_runs(app, run_ids, request_id):
     """
     Get the latest completed bam-generation runs for the given request ID
 
     :param request_id: str - IGO request ID
     :return: List[str] - List of most recent runs from given request ID
     """
-    operator_run_id = (
-        Run.objects.filter(
-            tags__igoRequestId=request_id,
-            app__name__in=["access legacy", "access nucleo"],
-            operator_run__status=RunStatus.COMPLETED,
+
+
+    if not request_id:
+            most_recent_runs_for_request = Run.objects.filter(pk__in=run_ids, status=RunStatus.COMPLETED)
+            request_id = RunStatus[0].tags["igoRequestId"]
+    else:
+        most_recent_runs_for_request = (
+            Run.objects.filter(
+                tags__igoRequestId=request_id,
+                app__name__in=app,
+                status=RunStatus.COMPLETED,
+                operator_run__status=RunStatus.COMPLETED,
+            )
+            .order_by("-created_date")
+            .first()
+            .operator_run.runs.all()
+            .filter(status=RunStatus.COMPLETED)
         )
-        .exclude(finished_date__isnull=True)
-        .order_by("-finished_date")
-        .first()
-        .operator_run_id
-    )
+        if not len(most_recent_runs_for_request):
+            raise Exception("No matching Nucleo runs found for request {}".format(request_id))
 
-    request_id_runs = Run.objects.filter(
-        operator_run_id=operator_run_id, app__name__in=["access legacy", "access nucleo"], status=RunStatus.COMPLETED
-    )
-    return request_id_runs
+    return most_recent_runs_for_request, request_id
 
 
 def create_cwl_file_object(file_path):
@@ -180,3 +186,40 @@ def get_unfiltered_matched_normal(patient_id, request_id=None):
         logger.warning(msg)
 
     return unfiltered_matched_normal_bam, unfiltered_matched_normal_sample_id
+
+
+def parse_nucleo_output_ports(run, port_name):
+    bam_bai = Port.objects.get(name=port_name, run=run.pk)
+    if not len(bam_bai.files.all()) in [1, 2]:
+        raise Exception("Port {} for run {} should have just 1 bam or 1 (bam/bai) pair".format(port_name, run.id))
+    bam = [b for b in bam_bai.files.all() if b.file_name.endswith(".bam")][0]
+    return bam
+
+def find_request_bams(run):
+    """
+    Find simplex and duplex bams from a request's nucleo run
+    - run_ids: run_ids from a request's nucleo run
+
+    :return: list of paired simplex and duplex bams and normal bam
+    """
+    nucleo_output_port_names = [
+        "uncollapsed_bam",
+        "fgbio_group_reads_by_umi_bam",
+        "fgbio_collapsed_bam",
+        "fgbio_filter_consensus_reads_duplex_bam",
+        "fgbio_postprocessing_simplex_bam",
+    ]
+    bams = {}
+    for o in nucleo_output_port_names:
+        # We are running a multi-sample workflow on just one sample,
+        # so we create single-element lists here
+        bam = parse_nucleo_output_ports(run, o)
+        bams[o] = bam
+
+    return bams
+
+def is_tumor_bam(file):
+    if not file.endswith(".bam"):
+        return False
+    t_n_timepoint = file.split("-")[2]
+    return not t_n_timepoint[0] == "N"
\ No newline at end of file
diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py
new file mode 100755
index 000000000..7a7440dc8
--- /dev/null
+++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py
@@ -0,0 +1,126 @@
+import os
+import json
+import logging
+
+from django.conf import settings
+from jinja2 import Template
+from runner.models import Port, RunStatus
+from runner.operator.operator import Operator
+from runner.run.objects.run_creator_object import RunCreator
+from file_system.repository.file_repository import File
+from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam
+from runner.models import RunStatus, Port, Run, Pipeline
+from datetime import datetime
+# DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
+BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam"
+logger = logging.getLogger(__name__)
+WORKDIR = os.path.dirname(os.path.abspath(__file__))
+TUMOR_OR_NORMAL_SEARCH = "-L0"
+SAMPLE_ID_SEP = "_cl_aln"
+ACCESS_DEFAULT_SV_NORMAL_ID = "DONOR22-TP"
+ACCESS_DEFAULT_SV_NORMAL_FILENAME = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR.bam"
+
+
+class AccessV2LegacySVOperator(Operator):
+    """
+    Operator for the ACCESS Legacy Structural Variants workflow:
+
+    http://www.github.com/mskcc/access-pipeline/workflows/subworkflows/manta.cwl
+
+    This Operator will search for Standard Bam files based on an IGO Request ID
+    """
+
+    @staticmethod
+    def is_tumor_bam(file):
+        if not file.file_name.endswith(".bam"):
+            return False
+        t_n_timepoint = file.file_name.split("-")[2]
+        return not t_n_timepoint[0] == "N"
+
+    def get_sample_inputs(self, runs):
+        """
+        Create all sample inputs for all runs triggered in this instance of the operator
+
+        :return: list of json_objects
+        """
+        bams = []
+        for run in runs:
+            bams.append(find_request_bams(run))
+
+        # TUMOR
+        standard_tumor_bams = [
+            b["uncollapsed_bam"]
+            for b in bams
+            if is_tumor_bam(b["uncollapsed_bam"].file_name)
+        ]
+
+        # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+        # sample_ids = [f.file_name.split("_cl_aln")[0] for f in standard_tumor_bams]
+        sample_ids = [b.file_name.replace(BAM_STEM, "") for b in standard_tumor_bams]
+        normal_bam = File.objects.filter(file_name=ACCESS_DEFAULT_SV_NORMAL_FILENAME)
+        normal_bam = normal_bam[0]
+
+        sample_inputs = []
+        for i, b in enumerate(standard_tumor_bams):
+            sample_input = self.construct_sample_inputs(sample_ids[i], b, normal_bam)
+            sample_inputs.append(sample_input)
+
+        return sample_inputs
+
+    def get_jobs(self):
+        """
+        Convert job inputs into serialized jobs
+
+        :return: list[(serialized job info, Job)]
+        """
+        app = self.get_pipeline_id()
+        pipeline = Pipeline.objects.get(id=app)
+        # output_directory = pipeline.output_directory
+        run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f")
+        # If no request_id, get request id from run information
+        # else request_id given directly
+        runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id)
+        sample_inputs = self.get_sample_inputs(runs)
+
+        return [
+            RunCreator(
+                **{
+                    "name": "ACCESS LEGACY SV M1: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)),
+                    "app": self.get_pipeline_id(),
+                    "inputs": job,
+                    "tags": {
+                        settings.REQUEST_ID_METADATA_KEY: self.request_id,
+                        "cmoSampleIds": job["sv_sample_id"],
+                        settings.PATIENT_ID_METADATA_KEY: "-".join(job["sv_sample_id"][0].split("-")[0:2]),
+                    },
+                }
+            )
+            for i, job in enumerate(sample_inputs)
+        ]
+
+    def construct_sample_inputs(self, tumor_sample_id, tumor_bam, normal_bam):
+        """
+        Use sample metadata and json template to create inputs for the CWL run
+
+        :return: JSON format sample inputs
+        """
+        with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file:
+            template = Template(file.read())
+
+            tumor_sample_names = [tumor_sample_id]
+            tumor_path = tumor_bam.path.replace("file://", "juno://")
+            if not tumor_path.startswith("juno://"):
+                tumor_path = "juno://" + tumor_path
+            tumor_bams = [{"class": "File", "location": tumor_path}]
+
+            normal_bam = create_cwl_file_object(normal_bam.path)
+
+            input_file = template.render(
+                tumor_sample_id=tumor_sample_id,
+                tumor_sample_names=json.dumps(tumor_sample_names),
+                tumor_bams=json.dumps(tumor_bams),
+                normal_bam=json.dumps(normal_bam),
+            )
+
+            sample_input = json.loads(input_file)
+            return sample_input
diff --git a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
new file mode 100755
index 000000000..e03a40610
--- /dev/null
+++ b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
@@ -0,0 +1,20 @@
+{
+    "version": "1.3.40",
+    "project_name": "{{ tumor_sample_id }}",
+    "ref_fasta": {
+        "class": "File",
+        "location": "juno:///juno/work/access/production/resources/reference/versions/hg19/Homo_sapiens_assembly19.fasta"
+    },
+
+    "sv_sample_id": {{ tumor_sample_names }},
+    "sv_tumor_bams": {{ tumor_bams }},
+    "sv_normal_bam": {{ normal_bam }},
+
+    "sv_run_tools": {
+        "java_8": "/opt/common/CentOS_6/java/jdk1.8.0_31/bin/java",
+        "r_path": "/opt/common/CentOS_6-dev/R/R-3.5.0/bin/Rscript",
+        "manta_python": "/home/accessbot/miniconda3/envs/ACCESS_1.3.36/bin/python2",
+        "manta": "/opt/common/CentOS_6-dev/manta/1.5.0",
+        "sv_repo": "/work/access/production/resources/tools/ACCESS-SV/versions/ACCESS_SV"
+    }
+}
\ No newline at end of file

From 397fa6e25cf4a0f5d0f7e8245fc0c402e604ffe3 Mon Sep 17 00:00:00 2001
From: Allan Bolipata <allan.bolipata@gmail.com>
Date: Mon, 10 Feb 2025 16:29:33 -0500
Subject: [PATCH 02/16] starting msi and cnv

---
 runner/operator/access/v2_1_0/cnv/__init__.py | 125 ++++++++++++++++
 .../v2_1_0/cnv/input_template.json.jinja2     |  27 ++++
 runner/operator/access/v2_1_0/msi/__init__.py | 135 ++++++++++++++++++
 .../v2_1_0/msi/input_template.json.jinja2     |  18 +++
 .../v2_1_0/structural_variants/__init__.py    |   2 +-
 .../input_template.json.jinja2                |   3 +-
 6 files changed, 308 insertions(+), 2 deletions(-)
 create mode 100644 runner/operator/access/v2_1_0/cnv/__init__.py
 create mode 100644 runner/operator/access/v2_1_0/cnv/input_template.json.jinja2
 create mode 100644 runner/operator/access/v2_1_0/msi/__init__.py
 create mode 100644 runner/operator/access/v2_1_0/msi/input_template.json.jinja2

diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py
new file mode 100644
index 000000000..46d930b89
--- /dev/null
+++ b/runner/operator/access/v2_1_0/cnv/__init__.py
@@ -0,0 +1,125 @@
+import os
+import json
+import logging
+from jinja2 import Template
+
+from django.conf import settings
+from runner.models import Port, RunStatus
+from runner.operator.operator import Operator
+from runner.run.objects.run_creator_object import RunCreator
+from file_system.repository.file_repository import FileRepository
+from runner.operator.access import get_request_id, get_request_id_runs
+
+
+logger = logging.getLogger(__name__)
+
+SAMPLE_ID_SEP = "_cl_aln"
+TUMOR_SEARCH = "-L0"
+NORMAL_SEARCH = "-N0"
+WORKDIR = os.path.dirname(os.path.abspath(__file__))
+ACCESS_DEFAULT_CNV_NORMAL_FILENAME = r"DONOR22-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam$"
+UNFILTERED_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam"
+
+
+class AccessLegacyCNVOperator(Operator):
+    """
+    Operator for the ACCESS Legacy Copy Number Variants workflow:
+
+    http://www.github.com/mskcc/access-pipeline/workflows/subworkflows/call_cnv.cwl
+
+    This Operator will search for ACCESS Unfiltered Bam files based on an IGO Request ID.
+    """
+
+    @staticmethod
+    def is_tumor_bam(file):
+        if not file.file_name.endswith(".bam"):
+            return False
+        t_n_timepoint = file.file_name.split("-")[2]
+        return not t_n_timepoint[0] == "N"
+
+    def get_sample_inputs(self):
+        """
+        Create all sample inputs for all runs triggered in this instance of the operator.
+
+        :return: list of json_objects
+        """
+        run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
+
+        # Get all unfiltered bam ports for these runs
+        unfiltered_bam_ports = Port.objects.filter(
+            name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
+        )
+
+        unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+
+        sample_ids = []
+        tumor_bams = []
+        sample_sexes = []
+
+        for tumor_bam in unfiltered_tumor_bams:
+            sample_id = tumor_bam.file_name.split("_cl_aln")[0]
+            # Use the initial fastq metadata to get the sex of the sample
+            # Todo: Need to store this info on the bams themselves
+            tumor_fastqs = FileRepository.filter(
+                file_type="fastq",
+                metadata={"tumorOrNormal": "Tumor", settings.CMO_SAMPLE_NAME_METADATA_KEY: sample_id},
+                filter_redact=True,
+            )
+            sample_sex = tumor_fastqs[0].metadata["sex"]
+            tumor_bams.append(tumor_bam)
+            sample_sexes.append(sample_sex)
+            sample_ids.append(sample_id)
+
+        sample_inputs = [
+            self.construct_sample_inputs(tumor_bams[i], sample_sexes[i]) for i in range(0, len(tumor_bams))
+        ]
+
+        return sample_inputs, sample_ids
+
+    def get_jobs(self):
+        """
+        Convert job inputs into serialized jobs
+
+        :return: list[(serialized job info, Job)]
+        """
+        self.request_id = get_request_id(self.run_ids, self.request_id)
+        inputs, sample_ids = self.get_sample_inputs()
+
+        return [
+            (
+                RunCreator(
+                    **{
+                        "name": "ACCESS LEGACY CNV M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)),
+                        "app": self.get_pipeline_id(),
+                        "inputs": job,
+                        "tags": {
+                            settings.REQUEST_ID_METADATA_KEY: self.request_id,
+                            "cmoSampleIds": sample_ids[i],
+                            settings.PATIENT_ID_METADATA_KEY: "-".join(sample_ids[i].split("-")[0:2]),
+                        },
+                    }
+                )
+            )
+            for i, job in enumerate(inputs)
+        ]
+
+    def construct_sample_inputs(self, tumor_bam, sample_sex):
+        """
+        Use sample metadata and json template to create inputs for the CWL run
+
+        :return: JSON format sample inputs
+        """
+        with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file:
+            template = Template(file.read())
+
+            tumor_sample_list = tumor_bam.path + "\t" + sample_sex
+            # Todo: need this to work with Nucleo bams:
+            tumor_sample_id = tumor_bam.file_name.split("_cl_aln_srt_MD_IR_FX_BR")[0]
+
+            input_file = template.render(
+                tumor_sample_id=tumor_sample_id,
+                tumor_sample_list_content=json.dumps(tumor_sample_list),
+            )
+
+            sample_input = json.loads(input_file)
+            return sample_input
diff --git a/runner/operator/access/v2_1_0/cnv/input_template.json.jinja2 b/runner/operator/access/v2_1_0/cnv/input_template.json.jinja2
new file mode 100644
index 000000000..9f7570649
--- /dev/null
+++ b/runner/operator/access/v2_1_0/cnv/input_template.json.jinja2
@@ -0,0 +1,27 @@
+{
+   "project_name": "{{ tumor_sample_id }}",
+    "tumor_sample_list": {
+        "contents": {{ tumor_sample_list_content }},
+        "basename": "tumor_manifest.txt",
+        "class": "File"
+    },
+    "normal_sample_list": {
+        "class": "File",
+        "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/novaseq_curated_unfiltered_bams_dmp/versions/v1.0/normal_manifest_access_v2_plasma.txt"
+    },
+    "threads": 8,
+    "tmp_dir": "/scratch",
+    "targets_coverage_bed": {
+        "class": "File",
+        "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/ACCESSv2_targets_coverage.bed"
+    },
+    "targets_coverage_annotation": {
+        "class": "File",
+        "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/ACCESSv2_targets_coverage.txt"
+    },
+    "reference_fasta": {
+        "class": "File",
+        "location": "juno:///juno/work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta"
+    },
+    "version": "1.3.40"
+}
diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
new file mode 100644
index 000000000..c9e5cb71d
--- /dev/null
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -0,0 +1,135 @@
+"""""" """""" """""" """""" """""
+" ACCESS-Pipeline MSI workflow operator
+" http://www.github.com/mskcc/access-pipeline/workflows/msi.cwl
+""" """""" """""" """""" """""" ""
+
+import os
+import json
+import logging
+from jinja2 import Template
+from django.conf import settings
+from file_system.models import File
+from runner.operator.operator import Operator
+from runner.run.objects.run_creator_object import RunCreator
+from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object
+from runner.models import Port, RunStatus
+
+
+logger = logging.getLogger(__name__)
+
+# Todo: needs to work for Nucleo bams as well
+SAMPLE_ID_SEP = "_cl_aln"
+TUMOR_SEARCH = "-L0"
+TUMOR_SEARCH_NEW = "_L0"
+NORMAL_SEARCH = "-N0"
+NORMAL_SEARCH_NEW = "_N0"
+STANDARD_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR.bam"
+WORKDIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class AccessLegacyMSIOperator(Operator):
+    """
+    Operator for the ACCESS Legacy Microsatellite Instability workflow:
+
+    http://www.github.com/mskcc/access-pipeline/workflows/subworkflows/msi.cwl
+
+    This Operator will search for ACCESS Standard Bam files based on an IGO Request ID. It will
+    also find the matched normals based on the patient ID.
+    """
+
+    @staticmethod
+    def is_tumor_bam(file):
+        # Todo: extract to common fn across 4 downstream operators
+        if not file.file_name.endswith(".bam"):
+            return False
+        t_n_timepoint = file.file_name.split("-")[2]
+        return not t_n_timepoint[0] == "N"
+
+    def get_sample_inputs(self):
+        """
+        Create all sample inputs for all runs triggered in this instance of the operator.
+
+        :return: list of json_objects
+        """
+        run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
+
+        # Get all standard bam ports for these runs
+        standard_bam_ports = Port.objects.filter(
+            name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
+        )
+
+        standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+
+        # Dictionary that associates tumor bam with standard bam with tumor_sample_id
+        sample_tumor_normal = {}
+        for standard_tumor_bam in standard_tumor_bams:
+            tumor_sample_id = standard_tumor_bam.file_name.split("_cl_aln")[0]
+            patient_id = "-".join(tumor_sample_id.split("-")[0:2])
+
+            # Find the matched Normal Standard bam (which could be associated with a different request_id)
+            sample_search_start = patient_id + NORMAL_SEARCH
+            matched_normal_bam = File.objects.filter(
+                file_name__startswith=sample_search_start, file_name__endswith=STANDARD_BAM_SEARCH
+            )
+            if not len(matched_normal_bam) > 0:
+                msg = "No matching standard normal Bam found for patient {}".format(patient_id)
+                logger.warning(msg)
+                continue
+
+            matched_normal_bam = matched_normal_bam.order_by("-created_date").first()
+
+            sample_tumor_normal[tumor_sample_id] = {"normal": matched_normal_bam, "tumor": standard_tumor_bam}
+
+        sample_inputs = [
+            self.construct_sample_inputs(key, value["tumor"], value["normal"])
+            for key, value in sample_tumor_normal.items()
+        ]
+
+        return sample_inputs
+
+    def get_jobs(self):
+        """
+        Convert job inputs into serialized jobs
+
+        :return: list[(serialized job info, Job)]
+        """
+        self.request_id = get_request_id(self.run_ids, self.request_id)
+        inputs = self.get_sample_inputs()
+
+        return [
+            RunCreator(
+                **{
+                    "name": "ACCESS LEGACY MSI M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)),
+                    "app": self.get_pipeline_id(),
+                    "inputs": job,
+                    "tags": {
+                        settings.REQUEST_ID_METADATA_KEY: self.request_id,
+                        "cmoSampleIds": job["sample_name"],
+                        settings.PATIENT_ID_METADATA_KEY: "-".join(job["sample_name"][0].split("-")[0:2]),
+                    },
+                }
+            )
+            for i, job in enumerate(inputs)
+        ]
+
+    def construct_sample_inputs(self, sample_name, tumor_bam, matched_normal_bam):
+        """
+        Use sample metadata and json template to create inputs for the CWL run
+
+        :return: JSON format sample inputs
+        """
+        with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file:
+            template = Template(file.read())
+
+        sample_names = [sample_name]
+        matched_normal_bams = [create_cwl_file_object(matched_normal_bam.path)]
+        tumor_bams = [create_cwl_file_object(tumor_bam.path)]
+
+        input_file = template.render(
+            tumor_bams=json.dumps(tumor_bams),
+            normal_bams=json.dumps(matched_normal_bams),
+            sample_names=json.dumps(sample_names),
+        )
+
+        sample_input = json.loads(input_file)
+        return sample_input
diff --git a/runner/operator/access/v2_1_0/msi/input_template.json.jinja2 b/runner/operator/access/v2_1_0/msi/input_template.json.jinja2
new file mode 100644
index 000000000..84b5421c3
--- /dev/null
+++ b/runner/operator/access/v2_1_0/msi/input_template.json.jinja2
@@ -0,0 +1,18 @@
+{
+   "file_path": "/home/accessbot/miniconda3/envs/ACCESS_cmplx_geno_test/lib/python2.7/site-packages/cwl_tools/msi",
+   "project_name": "MSI_test",
+   "tumor_bam": {{tumor_bams}},
+   "normal_bam": {{normal_bams}},
+   "sample_name": {{sample_names}},
+   "threads": 4,
+   "microsatellites": {
+      "class": "File",
+      "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/microsatellites.list"
+   },
+   "model": {
+      "class": "File",
+      "location": "juno:///juno/work/access/production/resources/admie/versions/v1.0/ADMIE.joblib"
+   },
+   "tmp_dir": "/scratch",
+   "version": "1.3.23-13-g07e0d12"
+}
diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py
index 7a7440dc8..d4086995c 100755
--- a/runner/operator/access/v2_1_0/structural_variants/__init__.py
+++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py
@@ -17,7 +17,7 @@
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
 TUMOR_OR_NORMAL_SEARCH = "-L0"
 SAMPLE_ID_SEP = "_cl_aln"
-ACCESS_DEFAULT_SV_NORMAL_ID = "DONOR22-TP"
+ACCESS_DEFAULT_SV_NORMAL_ID = "Donor19F21c2206-TP01"
 ACCESS_DEFAULT_SV_NORMAL_FILENAME = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR.bam"
 
 
diff --git a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
index e03a40610..f465530de 100755
--- a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
+++ b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
@@ -1,9 +1,10 @@
 {
     "version": "1.3.40",
     "project_name": "{{ tumor_sample_id }}",
+    {# TODO: change reference #}
     "ref_fasta": {
         "class": "File",
-        "location": "juno:///juno/work/access/production/resources/reference/versions/hg19/Homo_sapiens_assembly19.fasta"
+        "location": "juno:///juno/work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta"
     },
 
     "sv_sample_id": {{ tumor_sample_names }},

From 62101f448b1770c6d005a165cd1de7242ee00da6 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Tue, 11 Feb 2025 11:11:49 -0500
Subject: [PATCH 03/16] adding msi and cnv

---
 runner/operator/access/__init__.py            |  1 +
 runner/operator/access/v2_1_0/cnv/__init__.py | 31 +++++++++++--------
 runner/operator/access/v2_1_0/msi/__init__.py | 26 +++++++++++-----
 .../v2_1_0/structural_variants/__init__.py    |  2 +-
 4 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py
index 77a3d6f7f..7ff2ce1ed 100755
--- a/runner/operator/access/__init__.py
+++ b/runner/operator/access/__init__.py
@@ -3,6 +3,7 @@
 from django.conf import settings
 from runner.models import Run, RunStatus, Port
 from file_system.models import File, FileMetadata
+from runner.operator.access import get_request_id_runs
 
 
 logger = logging.getLogger(__name__)
diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py
index 46d930b89..a7e4603a4 100644
--- a/runner/operator/access/v2_1_0/cnv/__init__.py
+++ b/runner/operator/access/v2_1_0/cnv/__init__.py
@@ -8,17 +8,12 @@
 from runner.operator.operator import Operator
 from runner.run.objects.run_creator_object import RunCreator
 from file_system.repository.file_repository import FileRepository
-from runner.operator.access import get_request_id, get_request_id_runs
+from runner.operator.access import get_request_id_runs, find_request_bams, is_tumor_bam
 
 
 logger = logging.getLogger(__name__)
 
-SAMPLE_ID_SEP = "_cl_aln"
-TUMOR_SEARCH = "-L0"
-NORMAL_SEARCH = "-N0"
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
-ACCESS_DEFAULT_CNV_NORMAL_FILENAME = r"DONOR22-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam$"
-UNFILTERED_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam"
 
 
 class AccessLegacyCNVOperator(Operator):
@@ -43,14 +38,25 @@ def get_sample_inputs(self):
 
         :return: list of json_objects
         """
-        run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
+        runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id)
+        
+        bams = []
+        for run in runs:
+            bams.append(find_request_bams(run))
+
+        # TUMOR
+        unfiltered_bam_ports = [
+            b[["unfiltered_bams", "fgbio_collapsed_bam"]]
+            for b in bams
+                if is_tumor_bam(b["unfiltered_bams"].file_name)
+        ]
 
         # Get all unfiltered bam ports for these runs
-        unfiltered_bam_ports = Port.objects.filter(
-            name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
-        )
+        # unfiltered_bam_ports = Port.objects.filter(
+        #     name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
+        # )
 
-        unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+        # unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
 
         sample_ids = []
         tumor_bams = []
@@ -89,7 +95,7 @@ def get_jobs(self):
             (
                 RunCreator(
                     **{
-                        "name": "ACCESS LEGACY CNV M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)),
+                        "name": "ACCESS V2 LEGACY CNV M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)),
                         "app": self.get_pipeline_id(),
                         "inputs": job,
                         "tags": {
@@ -113,7 +119,6 @@ def construct_sample_inputs(self, tumor_bam, sample_sex):
             template = Template(file.read())
 
             tumor_sample_list = tumor_bam.path + "\t" + sample_sex
-            # Todo: need this to work with Nucleo bams:
             tumor_sample_id = tumor_bam.file_name.split("_cl_aln_srt_MD_IR_FX_BR")[0]
 
             input_file = template.render(
diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
index c9e5cb71d..d474cccb5 100644
--- a/runner/operator/access/v2_1_0/msi/__init__.py
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -11,8 +11,8 @@
 from file_system.models import File
 from runner.operator.operator import Operator
 from runner.run.objects.run_creator_object import RunCreator
-from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object
 from runner.models import Port, RunStatus
+from runner.operator.access import get_request_id_runs, find_request_bams, is_tumor_bam, create_cwl_file_object
 
 
 logger = logging.getLogger(__name__)
@@ -51,14 +51,26 @@ def get_sample_inputs(self):
 
         :return: list of json_objects
         """
-        run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
+        runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id)
+        
+        bams = []
+        for run in runs:
+            bams.append(find_request_bams(run))
+
+        # TUMOR
+        standard_tumor_bams = [
+            b[["standard_bams", "uncollapsed_bam"]]
+            for b in bams
+                if is_tumor_bam(b["unfiltered_bams"].file_name)
+        ]
+        # run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
 
-        # Get all standard bam ports for these runs
-        standard_bam_ports = Port.objects.filter(
-            name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
-        )
+        # # Get all standard bam ports for these runs
+        # standard_bam_ports = Port.objects.filter(
+        #     name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
+        # )
 
-        standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+        # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
 
         # Dictionary that associates tumor bam with standard bam with tumor_sample_id
         sample_tumor_normal = {}
diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py
index d4086995c..60fea3adc 100755
--- a/runner/operator/access/v2_1_0/structural_variants/__init__.py
+++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py
@@ -85,7 +85,7 @@ def get_jobs(self):
         return [
             RunCreator(
                 **{
-                    "name": "ACCESS LEGACY SV M1: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)),
+                    "name": "ACCESS V2 LEGACY SV M1: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)),
                     "app": self.get_pipeline_id(),
                     "inputs": job,
                     "tags": {

From 8453181ce67d9b8b11268363c01515d7fb17bde0 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Tue, 11 Feb 2025 12:49:32 -0500
Subject: [PATCH 04/16] linting and class name update

---
 runner/operator/access/__init__.py            |  9 +++++----
 runner/operator/access/v2_1_0/cnv/__init__.py | 17 +++++++----------
 runner/operator/access/v2_1_0/msi/__init__.py | 15 ++++++---------
 .../v2_1_0/structural_variants/__init__.py    | 19 ++++++++++++-------
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py
index 7ff2ce1ed..4947235c1 100755
--- a/runner/operator/access/__init__.py
+++ b/runner/operator/access/__init__.py
@@ -38,10 +38,9 @@ def get_request_id_runs(app, run_ids, request_id):
     :return: List[str] - List of most recent runs from given request ID
     """
 
-
     if not request_id:
-            most_recent_runs_for_request = Run.objects.filter(pk__in=run_ids, status=RunStatus.COMPLETED)
-            request_id = RunStatus[0].tags["igoRequestId"]
+        most_recent_runs_for_request = Run.objects.filter(pk__in=run_ids, status=RunStatus.COMPLETED)
+        request_id = RunStatus[0].tags["igoRequestId"]
     else:
         most_recent_runs_for_request = (
             Run.objects.filter(
@@ -196,6 +195,7 @@ def parse_nucleo_output_ports(run, port_name):
     bam = [b for b in bam_bai.files.all() if b.file_name.endswith(".bam")][0]
     return bam
 
+
 def find_request_bams(run):
     """
     Find simplex and duplex bams from a request's nucleo run
@@ -219,8 +219,9 @@ def find_request_bams(run):
 
     return bams
 
+
 def is_tumor_bam(file):
     if not file.endswith(".bam"):
         return False
     t_n_timepoint = file.split("-")[2]
-    return not t_n_timepoint[0] == "N"
\ No newline at end of file
+    return not t_n_timepoint[0] == "N"
diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py
index a7e4603a4..3bb8ebeb0 100644
--- a/runner/operator/access/v2_1_0/cnv/__init__.py
+++ b/runner/operator/access/v2_1_0/cnv/__init__.py
@@ -16,7 +16,7 @@
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
 
 
-class AccessLegacyCNVOperator(Operator):
+class AccessV2LegacyCNVOperator(Operator):
     """
     Operator for the ACCESS Legacy Copy Number Variants workflow:
 
@@ -38,25 +38,23 @@ def get_sample_inputs(self):
 
         :return: list of json_objects
         """
-        runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id)
-        
+        runs, self.request_id = get_request_id_runs(
+            ["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id
+        )
+
         bams = []
         for run in runs:
             bams.append(find_request_bams(run))
 
         # TUMOR
-        unfiltered_bam_ports = [
-            b[["unfiltered_bams", "fgbio_collapsed_bam"]]
-            for b in bams
-                if is_tumor_bam(b["unfiltered_bams"].file_name)
-        ]
+        unfiltered_bam_ports = [b[["unfiltered_bams", "fgbio_collapsed_bam"]] for b in bams]
 
         # Get all unfiltered bam ports for these runs
         # unfiltered_bam_ports = Port.objects.filter(
         #     name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
         # )
 
-        # unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+        unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if is_tumor_bam(f)]
 
         sample_ids = []
         tumor_bams = []
@@ -88,7 +86,6 @@ def get_jobs(self):
 
         :return: list[(serialized job info, Job)]
         """
-        self.request_id = get_request_id(self.run_ids, self.request_id)
         inputs, sample_ids = self.get_sample_inputs()
 
         return [
diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
index d474cccb5..a730505a8 100644
--- a/runner/operator/access/v2_1_0/msi/__init__.py
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -51,18 +51,16 @@ def get_sample_inputs(self):
 
         :return: list of json_objects
         """
-        runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id)
-        
+        runs, self.request_id = get_request_id_runs(
+            ["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id
+        )
+
         bams = []
         for run in runs:
             bams.append(find_request_bams(run))
 
         # TUMOR
-        standard_tumor_bams = [
-            b[["standard_bams", "uncollapsed_bam"]]
-            for b in bams
-                if is_tumor_bam(b["unfiltered_bams"].file_name)
-        ]
+        standard_bam_ports = [b[["standard_bams", "uncollapsed_bam"]] for b in bams]
         # run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
 
         # # Get all standard bam ports for these runs
@@ -70,7 +68,7 @@ def get_sample_inputs(self):
         #     name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
         # )
 
-        # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+        standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
 
         # Dictionary that associates tumor bam with standard bam with tumor_sample_id
         sample_tumor_normal = {}
@@ -105,7 +103,6 @@ def get_jobs(self):
 
         :return: list[(serialized job info, Job)]
         """
-        self.request_id = get_request_id(self.run_ids, self.request_id)
         inputs = self.get_sample_inputs()
 
         return [
diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py
index 60fea3adc..7460168c2 100755
--- a/runner/operator/access/v2_1_0/structural_variants/__init__.py
+++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py
@@ -8,9 +8,16 @@
 from runner.operator.operator import Operator
 from runner.run.objects.run_creator_object import RunCreator
 from file_system.repository.file_repository import File
-from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam
+from runner.operator.access import (
+    get_request_id,
+    get_request_id_runs,
+    create_cwl_file_object,
+    find_request_bams,
+    is_tumor_bam,
+)
 from runner.models import RunStatus, Port, Run, Pipeline
 from datetime import datetime
+
 # DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
 BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam"
 logger = logging.getLogger(__name__)
@@ -48,11 +55,7 @@ def get_sample_inputs(self, runs):
             bams.append(find_request_bams(run))
 
         # TUMOR
-        standard_tumor_bams = [
-            b["uncollapsed_bam"]
-            for b in bams
-            if is_tumor_bam(b["uncollapsed_bam"].file_name)
-        ]
+        standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["uncollapsed_bam"].file_name)]
 
         # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
         # sample_ids = [f.file_name.split("_cl_aln")[0] for f in standard_tumor_bams]
@@ -79,7 +82,9 @@ def get_jobs(self):
         run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f")
         # If no request_id, get request id from run information
         # else request_id given directly
-        runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id)
+        runs, self.request_id = get_request_id_runs(
+            ["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id
+        )
         sample_inputs = self.get_sample_inputs(runs)
 
         return [

From 1e9f45777adccdd86552b320e5409ca43ddf1364 Mon Sep 17 00:00:00 2001
From: Allan Bolipata <allan.bolipata@gmail.com>
Date: Tue, 11 Feb 2025 12:51:12 -0500
Subject: [PATCH 05/16] access helper import update

---
 runner/operator/access/__init__.py            | 2 --
 runner/operator/access/v2_1_0/msi/__init__.py | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 runner/operator/access/v2_1_0/msi/__init__.py

diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py
index 7ff2ce1ed..82c0e5488 100755
--- a/runner/operator/access/__init__.py
+++ b/runner/operator/access/__init__.py
@@ -3,8 +3,6 @@
 from django.conf import settings
 from runner.models import Run, RunStatus, Port
 from file_system.models import File, FileMetadata
-from runner.operator.access import get_request_id_runs
-
 
 logger = logging.getLogger(__name__)
 
diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
old mode 100644
new mode 100755
index d474cccb5..bd8994eda
--- a/runner/operator/access/v2_1_0/msi/__init__.py
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -12,7 +12,7 @@
 from runner.operator.operator import Operator
 from runner.run.objects.run_creator_object import RunCreator
 from runner.models import Port, RunStatus
-from runner.operator.access import get_request_id_runs, find_request_bams, is_tumor_bam, create_cwl_file_object
+from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam
 
 
 logger = logging.getLogger(__name__)
@@ -27,7 +27,7 @@
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
 
 
-class AccessLegacyMSIOperator(Operator):
+class AccessLegacyV2MSIOperator(Operator):
     """
     Operator for the ACCESS Legacy Microsatellite Instability workflow:
 

From f0c66c481697afb3a9975eb22aa03129b4d11f6d Mon Sep 17 00:00:00 2001
From: Allan Bolipata <allan.bolipata@gmail.com>
Date: Wed, 12 Feb 2025 10:03:26 -0500
Subject: [PATCH 06/16] working operators, cnv hits v2 bug

---
 runner/operator/access/v2_1_0/cnv/__init__.py   | 13 +++----------
 runner/operator/access/v2_1_0/msi/__init__.py   | 17 +++--------------
 .../v2_1_0/structural_variants/__init__.py      | 14 +++-----------
 .../input_template.json.jinja2                  |  1 -
 4 files changed, 9 insertions(+), 36 deletions(-)
 mode change 100644 => 100755 runner/operator/access/v2_1_0/cnv/__init__.py

diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py
old mode 100644
new mode 100755
index 3bb8ebeb0..2f82900d3
--- a/runner/operator/access/v2_1_0/cnv/__init__.py
+++ b/runner/operator/access/v2_1_0/cnv/__init__.py
@@ -46,15 +46,8 @@ def get_sample_inputs(self):
         for run in runs:
             bams.append(find_request_bams(run))
 
-        # TUMOR
-        unfiltered_bam_ports = [b[["unfiltered_bams", "fgbio_collapsed_bam"]] for b in bams]
-
-        # Get all unfiltered bam ports for these runs
-        # unfiltered_bam_ports = Port.objects.filter(
-        #     name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
-        # )
-
-        unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if is_tumor_bam(f)]
+        # TUMOR Unfiltered 
+        unfiltered_tumor_bams = [b["fgbio_collapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)]
 
         sample_ids = []
         tumor_bams = []
@@ -122,6 +115,6 @@ def construct_sample_inputs(self, tumor_bam, sample_sex):
                 tumor_sample_id=tumor_sample_id,
                 tumor_sample_list_content=json.dumps(tumor_sample_list),
             )
-
+            print(input_file)
             sample_input = json.loads(input_file)
             return sample_input
diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
index 26b17497a..ecacc6888 100755
--- a/runner/operator/access/v2_1_0/msi/__init__.py
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -18,16 +18,13 @@
 logger = logging.getLogger(__name__)
 
 # Todo: needs to work for Nucleo bams as well
-SAMPLE_ID_SEP = "_cl_aln"
 TUMOR_SEARCH = "-L0"
-TUMOR_SEARCH_NEW = "_L0"
 NORMAL_SEARCH = "-N0"
-NORMAL_SEARCH_NEW = "_N0"
 STANDARD_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR.bam"
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
 
 
-class AccessLegacyV2MSIOperator(Operator):
+class AccessV2LegacyMSIOperator(Operator):
     """
     Operator for the ACCESS Legacy Microsatellite Instability workflow:
 
@@ -59,16 +56,8 @@ def get_sample_inputs(self):
         for run in runs:
             bams.append(find_request_bams(run))
 
-        # TUMOR
-        standard_bam_ports = [b[["standard_bams", "uncollapsed_bam"]] for b in bams]
-        # run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)]
-
-        # # Get all standard bam ports for these runs
-        # standard_bam_ports = Port.objects.filter(
-        #     name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED
-        # )
-
-        standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
+        # TUMOR Uncollapsed 
+        standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)]
 
         # Dictionary that associates tumor bam with standard bam with tumor_sample_id
         sample_tumor_normal = {}
diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py
index 7460168c2..36cf86f22 100755
--- a/runner/operator/access/v2_1_0/structural_variants/__init__.py
+++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py
@@ -4,27 +4,21 @@
 
 from django.conf import settings
 from jinja2 import Template
-from runner.models import Port, RunStatus
 from runner.operator.operator import Operator
 from runner.run.objects.run_creator_object import RunCreator
 from file_system.repository.file_repository import File
 from runner.operator.access import (
-    get_request_id,
     get_request_id_runs,
     create_cwl_file_object,
     find_request_bams,
     is_tumor_bam,
 )
-from runner.models import RunStatus, Port, Run, Pipeline
+from runner.models import Pipeline
 from datetime import datetime
 
-# DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
 BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam"
 logger = logging.getLogger(__name__)
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
-TUMOR_OR_NORMAL_SEARCH = "-L0"
-SAMPLE_ID_SEP = "_cl_aln"
-ACCESS_DEFAULT_SV_NORMAL_ID = "Donor19F21c2206-TP01"
 ACCESS_DEFAULT_SV_NORMAL_FILENAME = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR.bam"
 
 
@@ -54,11 +48,9 @@ def get_sample_inputs(self, runs):
         for run in runs:
             bams.append(find_request_bams(run))
 
-        # TUMOR
+        # Standard TUMOR
         standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["uncollapsed_bam"].file_name)]
 
-        # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)]
-        # sample_ids = [f.file_name.split("_cl_aln")[0] for f in standard_tumor_bams]
         sample_ids = [b.file_name.replace(BAM_STEM, "") for b in standard_tumor_bams]
         normal_bam = File.objects.filter(file_name=ACCESS_DEFAULT_SV_NORMAL_FILENAME)
         normal_bam = normal_bam[0]
@@ -126,6 +118,6 @@ def construct_sample_inputs(self, tumor_sample_id, tumor_bam, normal_bam):
                 tumor_bams=json.dumps(tumor_bams),
                 normal_bam=json.dumps(normal_bam),
             )
-
+            print(input_file)
             sample_input = json.loads(input_file)
             return sample_input
diff --git a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
index f465530de..295355be3 100755
--- a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
+++ b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2
@@ -1,7 +1,6 @@
 {
     "version": "1.3.40",
     "project_name": "{{ tumor_sample_id }}",
-    #TODO change reference 
     "ref_fasta": {
         "class": "File",
         "location": "juno:///juno/work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta"

From 5eaa6017e9d7d8c06c851930ced8fda4380d8432 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Wed, 12 Feb 2025 10:04:37 -0500
Subject: [PATCH 07/16] linting

---
 runner/operator/access/v2_1_0/cnv/__init__.py |  6 ++++--
 runner/operator/access/v2_1_0/msi/__init__.py | 10 ++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py
index 2f82900d3..5c483a375 100755
--- a/runner/operator/access/v2_1_0/cnv/__init__.py
+++ b/runner/operator/access/v2_1_0/cnv/__init__.py
@@ -46,8 +46,10 @@ def get_sample_inputs(self):
         for run in runs:
             bams.append(find_request_bams(run))
 
-        # TUMOR Unfiltered 
-        unfiltered_tumor_bams = [b["fgbio_collapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)]
+        # TUMOR Unfiltered
+        unfiltered_tumor_bams = [
+            b["fgbio_collapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)
+        ]
 
         sample_ids = []
         tumor_bams = []
diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
index ecacc6888..6bcd47557 100755
--- a/runner/operator/access/v2_1_0/msi/__init__.py
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -12,7 +12,13 @@
 from runner.operator.operator import Operator
 from runner.run.objects.run_creator_object import RunCreator
 from runner.models import Port, RunStatus
-from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam
+from runner.operator.access import (
+    get_request_id,
+    get_request_id_runs,
+    create_cwl_file_object,
+    find_request_bams,
+    is_tumor_bam,
+)
 
 
 logger = logging.getLogger(__name__)
@@ -56,7 +62,7 @@ def get_sample_inputs(self):
         for run in runs:
             bams.append(find_request_bams(run))
 
-        # TUMOR Uncollapsed 
+        # TUMOR Uncollapsed
         standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)]
 
         # Dictionary that associates tumor bam with standard bam with tumor_sample_id

From 4852c27a3d404b8c68d646557a8d521b034e6c1a Mon Sep 17 00:00:00 2001
From: Allan Bolipata <allan.bolipata@gmail.com>
Date: Thu, 13 Feb 2025 17:07:01 -0500
Subject: [PATCH 08/16] updated snv

---
 .../input_template.json.jinja2                | 20 ---------
 .../v2_1_0/snps_and_indels/snps_and_indels.py | 42 ++++++-------------
 2 files changed, 13 insertions(+), 49 deletions(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
index d10b5d90a..0dda14c3d 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
+++ b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
@@ -112,26 +112,6 @@
         "class": "File",
         "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/mutect_annotate_concat_header.txt"
     },
-    "curated_duplex_bams": {
-        "novaseq": {
-            "class": "Directory",
-            "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/novaseq_curated_duplex_bams_dmp"
-        },
-        "hiseq": {
-            "class": "Directory",
-            "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/hiseq4000_curated_duplex_bams_dmp"
-        }
-    },
-    "curated_simplex_bams": {
-        "novaseq": {
-            "class": "Directory",
-            "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/novaseq_curated_simplex_bams_dmp"
-        },
-        "hiseq": {
-            "class": "Directory",
-            "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/hiseq4000_curated_simplex_bams_dmp"
-        }
-    },
     "reference_bam_for_VC": {
         "class": "File",
         "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/novaseq_unmatched_normal_plasma_duplex_bams_dmp/versions/v1.0/Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
index e6fcc7943..afc52201a 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
+++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
@@ -26,6 +26,7 @@
 LOGGER = logging.getLogger(__name__)
 ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals_02"
 ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R"
+BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX"
 NORMAL_SAMPLE_SEARCH = "-N0"
 TUMOR_SAMPLE_SEARCH = "-L0"
 DUPLEX_BAM_SEARCH = "__aln_srt_IR_FX-duplex.bam"
@@ -42,7 +43,6 @@
 DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
 SIMPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-simplex.bam"
 UNCOLLAPSED_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam"
-BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX"
 ACCESS_DEFAULT_NORMAL_FILENAME_DUPLEX = (
     "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
 )
@@ -460,14 +460,6 @@ def get_dmp_matched_patient_geno_samples(patient_id):
         metadata__type="T",
         file__path__endswith=DMP_REGEX,
     )
-    # if not matched_tumors_dmp:
-    #     matched_tumors_dmp = FileMetadata.objects.filter(
-    #         file__file_group=DMP_FILE_GROUP,
-    #         metadata__patient__cmo=patient_id.lstrip("C-"),
-    #         metadata__assay="XS1",
-    #         metadata__type="T",
-    #         file__path__endswith=DMP_REGEX,
-    #     )
     matched_tumors_dmp_simplex = [b.file for b in matched_tumors_dmp]
     matched_tumors_dmp_duplex = copy.deepcopy(matched_tumors_dmp_simplex)
 
@@ -681,27 +673,19 @@ def construct_sample_inputs(self, sample_info):
 
             tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)]
             tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)]
-            tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.replace(DUPLEX_BAM_STEM, "")]
+            tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
             tumor_duplex_id = [
-                sample_info["tumor_bam"][0][0].file_name.replace(BAM_STEM, "").replace("-duplex.bam", "")
+                sample_info["tumor_bam"][0][0].file_name.split("_")[0]
             ]
             tumor_simplex_id = [
-                sample_info["tumor_bam"][0][1].file_name.replace(BAM_STEM, "").replace("-simplex.bam", "-SIMPLEX")
+                sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"
             ]
 
             normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)]
             normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)]
-            normal_sample_name = [sample_info["normal_bam"][0][0].file_name.replace(DUPLEX_BAM_STEM, "")]
-            normal_duplex_id = [
-                sample_info["normal_bam"][0][0]
-                .file_name.replace(BAM_STEM, "")
-                .replace("-duplex.bam", "-CURATED-DUPLEX")
-            ]
-            normal_simplex_id = [
-                sample_info["normal_bam"][0][1]
-                .file_name.replace(BAM_STEM, "")
-                .replace("-simplex.bam", "-CURATED-SIMPLEX")
-            ]
+            normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]]
+            normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"]
+            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0]  + "-CURATED-SIMPLEX"]
 
             genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id
 
@@ -712,7 +696,7 @@ def construct_sample_inputs(self, sample_info):
             else:
                 matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)]
                 matched_normal_id = [
-                    sample_info["matched_normal_unfiltered"][0].file_name.replace(UNCOLLAPSED_BAM_STEM, "")
+                    sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]
                 ]
                 genotyping_bams_ids += matched_normal_id
                 genotyping_bams += matched_normal
@@ -721,22 +705,22 @@ def construct_sample_inputs(self, sample_info):
                 for f in files:
                     if key == "geno_samples":
                         # TODO jsut do the replace here
-                        sample_id_duplex = f[0].file_name.replace(BAM_STEM, "").replace("-duplex.bam", "")
-                        sample_id_simplex = f[1].file_name.replace(BAM_STEM, "").replace("-simplex.bam", "-SIMPLEX")
+                        sample_id_duplex = f[0].file_name.split("_")[0]
+                        sample_id_simplex = f[1].file_name.split("_")[0] + "-SIMPLEX"
                         genotyping_bams_ids.append(sample_id_duplex)
                         genotyping_bams_ids.append(sample_id_simplex)
                         genotyping_bams.append(_create_cwl_bam_object(f[0].path))
                         genotyping_bams.append(_create_cwl_bam_object(f[1].path))
                     if key == "geno_samples_normal_unfiltered":
-                        sample_id = f.file_name.replace(UNCOLLAPSED_BAM_STEM, "")
+                        sample_id = f.file_name.split("_")[0]
                         genotyping_bams_ids.append(sample_id)
                         genotyping_bams.append(_create_cwl_bam_object(f.path))
                     if key == "curated_normal_bams":
                         sample_id_duplex = (
-                            f[0].file_name.replace(BAM_STEM, "").replace("-duplex.bam", "-CURATED-DUPLEX")
+                            f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
                         )
                         sample_id_simplex = (
-                            f[1].file_name.replace(BAM_STEM, "").replace("-simplex.bam", "-CURATED-SIMPLEX")
+                            f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
                         )
                         genotyping_bams_ids.append(sample_id_duplex)
                         genotyping_bams_ids.append(sample_id_simplex)

From f5ec280b72b60d733f987d5c4e22ea94a740ab93 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Thu, 13 Feb 2025 17:13:59 -0500
Subject: [PATCH 09/16] Update snps_and_indels.py

---
 .../v2_1_0/snps_and_indels/snps_and_indels.py | 23 +++++--------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
index afc52201a..d16334996 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
+++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
@@ -674,18 +674,14 @@ def construct_sample_inputs(self, sample_info):
             tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)]
             tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)]
             tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
-            tumor_duplex_id = [
-                sample_info["tumor_bam"][0][0].file_name.split("_")[0]
-            ]
-            tumor_simplex_id = [
-                sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"
-            ]
+            tumor_duplex_id = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
+            tumor_simplex_id = [sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"]
 
             normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)]
             normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)]
             normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]]
             normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"]
-            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0]  + "-CURATED-SIMPLEX"]
+            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"]
 
             genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id
 
@@ -695,16 +691,13 @@ def construct_sample_inputs(self, sample_info):
                 matched_normal_id = [""]
             else:
                 matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)]
-                matched_normal_id = [
-                    sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]
-                ]
+                matched_normal_id = [sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]]
                 genotyping_bams_ids += matched_normal_id
                 genotyping_bams += matched_normal
 
             for key, files in sample_info.items():
                 for f in files:
                     if key == "geno_samples":
-                        # TODO jsut do the replace here
                         sample_id_duplex = f[0].file_name.split("_")[0]
                         sample_id_simplex = f[1].file_name.split("_")[0] + "-SIMPLEX"
                         genotyping_bams_ids.append(sample_id_duplex)
@@ -716,12 +709,8 @@ def construct_sample_inputs(self, sample_info):
                         genotyping_bams_ids.append(sample_id)
                         genotyping_bams.append(_create_cwl_bam_object(f.path))
                     if key == "curated_normal_bams":
-                        sample_id_duplex = (
-                            f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
-                        )
-                        sample_id_simplex = (
-                            f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
-                        )
+                        sample_id_duplex = f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
+                        sample_id_simplex = f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
                         genotyping_bams_ids.append(sample_id_duplex)
                         genotyping_bams_ids.append(sample_id_simplex)
                         genotyping_bams.append(_create_cwl_bam_object(f[0].path))

From 814b7e2fe6209704a05175ec8be2e0899744b368 Mon Sep 17 00:00:00 2001
From: Sinisa Ivkovic <sinisa.ivkovic@gmail.com>
Date: Wed, 19 Feb 2025 16:22:39 -0500
Subject: [PATCH 10/16] Version bump 1.91.7

---
 beagle/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beagle/__init__.py b/beagle/__init__.py
index d522dec1c..7f346af82 100644
--- a/beagle/__init__.py
+++ b/beagle/__init__.py
@@ -1 +1 @@
-__version__ = "1.91.6"
+__version__ = "1.91.7"

From ec2daf7ce8de211738d85065c587083bd8634c23 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Mon, 24 Feb 2025 16:51:32 -0500
Subject: [PATCH 11/16] Update snps_and_indels.py: add check_genotype_list validation, remove mapping_bams

---
 .../v2_1_0/snps_and_indels/snps_and_indels.py | 116 +++++-------------
 1 file changed, 34 insertions(+), 82 deletions(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
index d16334996..61f1300b0 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
+++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
@@ -24,7 +24,7 @@
 
 WORKDIR = os.path.dirname(os.path.abspath(__file__))
 LOGGER = logging.getLogger(__name__)
-ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals_02"
+ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals"
 ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R"
 BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX"
 NORMAL_SAMPLE_SEARCH = "-N0"
@@ -51,6 +51,15 @@
 )
 
 
+def check_genotype_list(genotyping_bams, genotyping_bams_ids):
+    if len(genotyping_bams_ids) != len(genotyping_bams):
+            raise Exception(f"list of genotyping bams: {genotyping_bams} is a different length from list of genotyping ids {genotyping_bams_ids}")
+    for id in genotyping_bams_ids:
+        for bam in genotyping_bams:
+            if not id in bam:
+                raise Exception(f"list of genotyping bams: {genotyping_bams} is a different order from list of genotyping ids {genotyping_bams_ids}")
+    return True
+
 def register_file(file):
     fname = os.path.basename(file)
     file_group = FileGroup.objects.get(id=DMP_FILE_GROUP)
@@ -185,7 +194,6 @@ def get_unfiltered_matched_normal(patient_id, fillout_unfiltered_normals, reques
 
     # Case 1
     if request_id:
-        # Todo: Joining to Port -> Run makes this query slow, make use of output_metadata for requestId instead
         for bam in fillout_unfiltered_normals:
             if bam.file_name.startswith(patient_normals_search):
                 unfiltered_matched_normal_bam = bam
@@ -555,11 +563,10 @@ def create_sample_info(
             msg = "ACCESS SNV Operator Error: Duplex sample IDs not matched to Simplex sample IDs"
             raise Exception(msg)
         # Add in any DMP ACCESS samples
-        (dmp_matched_tumors_duplex, dmp_matched_tumors_simplex) = get_dmp_matched_patient_geno_samples(patient_id)
-        # TODO not flipping file name
+        # (dmp_matched_tumors_duplex, dmp_matched_tumors_simplex) = get_dmp_matched_patient_geno_samples(patient_id)
 
-        geno_samples_duplex = geno_samples_duplex  # + dmp_matched_tumors_duplex
-        geno_samples_simplex = geno_samples_simplex  # + dmp_matched_tumors_simplex
+        geno_samples_duplex = geno_samples_duplex  
+        geno_samples_simplex = geno_samples_simplex 
         geno_samples = make_pairs(geno_samples_duplex, geno_samples_simplex)
         sample_info = {
             "matched_normal_unfiltered": [matched_normal_unfiltered_bam],
@@ -569,73 +576,6 @@ def create_sample_info(
 
         return sample_info
 
-    def mapping_bams(self, sample_info):
-        # sample_id,normal_path,duplex_path,simplex_path,type
-        # patient_id,sample_id,type,maf,standard_bam,standard_bai,duplex_bam,duplex_bai,simplex_bam,simplex_bai
-        bams = []
-        aux_bams = []
-        for key, value in sample_info.items():
-            for v in value:
-                map = {}
-                if key == "tumor_bam":
-                    map["patient_id"] = "null"
-                    map["sample_id"] = v[0].file_name.replace(DUPLEX_BAM_STEM, "")
-                    map["maf"] = "null"
-                    map["standard_bam"] = "null"
-                    map["standard_bai"] = "null"
-                    map["duplex_bam"] = _create_cwl_bam_object(v[0].path)
-                    map["duplex_bai"] = _create_cwl_bam_object(v[0].path.replace(".bam", ".bai"))
-                    map["simplex_bam"] = _create_cwl_bam_object(v[1].path)
-                    map["simplex_bai"] = _create_cwl_bam_object(v[1].path.replace(".bam", ".bai"))
-                    map["type"] = "CASE"
-                    bams.append(map)
-                if key == "normal_bam":
-                    map["patient_id"] = "null"
-                    map["sample_id"] = v[0].file_name.replace(
-                        "-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam", ""
-                    )
-                    map["maf"] = "null"
-                    map["standard_bam"] = "null"
-                    map["standard_bai"] = "null"
-                    map["duplex_bam"] = _create_cwl_bam_object(v[0].path)
-                    map["duplex_bai"] = _create_cwl_bam_object(v[0].path.replace(".bam", ".bai"))
-                    map["simplex_bam"] = _create_cwl_bam_object(v[1].path)
-                    map["simplex_bai"] = _create_cwl_bam_object(v[1].path.replace(".bam", ".bai"))
-                    map["type"] = "CONTROL"
-                    bams.append(map)
-                if key == "geno_samples":
-                    # TODO jsut do the replace here
-                    sample_id = v[0].file_name.replace(DUPLEX_BAM_STEM, "")
-                    sample_id = sample_id.replace(DMP_DUPLEX_REGEX, "")
-                    map["sample_id"] = sample_id
-                    map["normal_path"] = "null"
-                    map["duplex_path"] = _create_cwl_bam_object(v[0].path)
-                    map["simplex_path"] = _create_cwl_bam_object(v[1].path)
-                    map["type"] = "PLASMA"
-                    aux_bams.append(map)
-                if key == "geno_samples_normal_unfiltered":
-                    map["sample_id"] = v.file_name.replace(UNCOLLAPSED_BAM_STEM, "")
-                    map["normal_path"] = _create_cwl_bam_object(v.path)
-                    map["duplex_path"] = "null"
-                    map["simplex_path"] = "null"
-                    map["type"] = "UNMATCHED_NORMAL"
-                    aux_bams.append(map)
-                if key == "curated_normal_bams":
-                    map["sample_id"] = v[0].file_name.replace(DUPLEX_BAM_STEM, "")
-                    map["normal_path"] = "null"
-                    map["duplex_path"] = _create_cwl_bam_object(v[0].path)
-                    map["simplex_path"] = _create_cwl_bam_object(v[1].path)
-                    map["type"] = "CURATED"
-                    aux_bams.append(map)
-                if key == "matched_normal_unfiltered":
-                    map["sample_id"] = v.file_name.replace(UNCOLLAPSED_BAM_STEM, "")
-                    map["normal_path"] = _create_cwl_bam_object(v.path)
-                    map["duplex_path"] = "null"
-                    map["simplex_path"] = "null"
-                    map["type"] = "MATCHED_NORMAL"
-                    aux_bams.append(map)
-        return (bams, aux_bams)
-
     def get_request_id_runs(self, app):
         """
         Get the latest completed bam-generation runs for the given request ID
@@ -670,28 +610,33 @@ def construct_sample_inputs(self, sample_info):
             template = Template(file.read())
             genotyping_bams = []
             genotyping_bams_ids = []
-
             tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)]
             tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)]
             tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
-            tumor_duplex_id = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
-            tumor_simplex_id = [sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"]
+            tumor_duplex_id = [
+                sample_info["tumor_bam"][0][0].file_name.split("_")[0]
+            ]
+            tumor_simplex_id = [
+                sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"
+            ]
 
             normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)]
             normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)]
             normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]]
             normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"]
-            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"]
+            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0]  + "-CURATED-SIMPLEX"]
 
             genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id
 
-            genotyping_bams = normal_bam_duplex + normal_bam_simplex + tumor_bam_duplex + tumor_bam_simplex
+            genotyping_bams = tumor_bam_duplex + tumor_bam_simplex + normal_bam_duplex + normal_bam_simplex
 
             if sample_info["matched_normal_unfiltered"][0] == None:
                 matched_normal_id = [""]
             else:
                 matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)]
-                matched_normal_id = [sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]]
+                matched_normal_id = [
+                    sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]
+                ]
                 genotyping_bams_ids += matched_normal_id
                 genotyping_bams += matched_normal
 
@@ -709,13 +654,20 @@ def construct_sample_inputs(self, sample_info):
                         genotyping_bams_ids.append(sample_id)
                         genotyping_bams.append(_create_cwl_bam_object(f.path))
                     if key == "curated_normal_bams":
-                        sample_id_duplex = f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
-                        sample_id_simplex = f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
+                        sample_id_duplex = (
+                            f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
+                        )
+                        sample_id_simplex = (
+                            f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
+                        )
                         genotyping_bams_ids.append(sample_id_duplex)
                         genotyping_bams_ids.append(sample_id_simplex)
                         genotyping_bams.append(_create_cwl_bam_object(f[0].path))
                         genotyping_bams.append(_create_cwl_bam_object(f[1].path))
 
+            # check genotyping bams
+            check_genotype_list(genotyping_bams, genotyping_bams_ids)
+
             input_file = template.render(
                 tumor_bams=json.dumps(tumor_bam_duplex),
                 normal_bams=json.dumps(normal_bam_duplex),
@@ -822,7 +774,7 @@ def get_jobs(self):
                 settings.CMO_SAMPLE_NAME_METADATA_KEY: sample,
             }
             job_json = {
-                "name": "ACCESS LEGACY SNV {sample}: {run_date}".format(sample=sample, run_date=run_date),
+                "name": "ACCESS V2 LEGACY SNV {request_id}: {run_date}".format(request_id=self.request_id, run_date=run_date),
                 "app": app,
                 "inputs": sample_input,
                 "tags": job_tags,

From 583eac47def2a0e9b1ff0762ccde4d721d71d4db Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Mon, 24 Feb 2025 16:51:38 -0500
Subject: [PATCH 12/16] Update msi/__init__.py: rename job to ACCESS V2 LEGACY MSI M1

---
 runner/operator/access/v2_1_0/msi/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py
index 6bcd47557..a6f2df21f 100755
--- a/runner/operator/access/v2_1_0/msi/__init__.py
+++ b/runner/operator/access/v2_1_0/msi/__init__.py
@@ -103,7 +103,7 @@ def get_jobs(self):
         return [
             RunCreator(
                 **{
-                    "name": "ACCESS LEGACY MSI M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)),
+                    "name": "ACCESS V2 LEGACY MSI M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)),
                     "app": self.get_pipeline_id(),
                     "inputs": job,
                     "tags": {

From 46e06e15aa7ffec2d13c8810db420000cbb4d1ab Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Tue, 25 Feb 2025 10:38:06 -0500
Subject: [PATCH 13/16] Update snps_and_indels.py: make check_genotype_list compare IDs and BAM paths pairwise

---
 .../v2_1_0/snps_and_indels/snps_and_indels.py | 46 +++++++++----------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
index 61f1300b0..adccd9d64 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
+++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
@@ -53,13 +53,17 @@
 
 def check_genotype_list(genotyping_bams, genotyping_bams_ids):
     if len(genotyping_bams_ids) != len(genotyping_bams):
-            raise Exception(f"list of genotyping bams: {genotyping_bams} is a different length from list of genotyping ids {genotyping_bams_ids}")
-    for id in genotyping_bams_ids:
-        for bam in genotyping_bams:
-            if not id in bam:
-                raise Exception(f"list of genotyping bams: {genotyping_bams} is a different order from list of genotyping ids {genotyping_bams_ids}")
+        raise Exception(
+            f"list of genotyping bams: {genotyping_bams} is a different length from list of genotyping ids {genotyping_bams_ids}"
+        )
+    for ix in range(len(genotyping_bams_ids)):
+        id = genotyping_bams_ids[ix].replace("-CURATED", "").replace("-DUPLEX", "").replace("-SIMPLEX", "")
+        bam = genotyping_bams[ix]["location"]
+        if id not in bam:
+            raise Exception(f"Sample ID, {id} does not match bam path: {bam}")
     return True
 
+
 def register_file(file):
     fname = os.path.basename(file)
     file_group = FileGroup.objects.get(id=DMP_FILE_GROUP)
@@ -565,8 +569,8 @@ def create_sample_info(
         # Add in any DMP ACCESS samples
         # (dmp_matched_tumors_duplex, dmp_matched_tumors_simplex) = get_dmp_matched_patient_geno_samples(patient_id)
 
-        geno_samples_duplex = geno_samples_duplex  
-        geno_samples_simplex = geno_samples_simplex 
+        geno_samples_duplex = geno_samples_duplex
+        geno_samples_simplex = geno_samples_simplex
         geno_samples = make_pairs(geno_samples_duplex, geno_samples_simplex)
         sample_info = {
             "matched_normal_unfiltered": [matched_normal_unfiltered_bam],
@@ -613,18 +617,14 @@ def construct_sample_inputs(self, sample_info):
             tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)]
             tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)]
             tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
-            tumor_duplex_id = [
-                sample_info["tumor_bam"][0][0].file_name.split("_")[0]
-            ]
-            tumor_simplex_id = [
-                sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"
-            ]
+            tumor_duplex_id = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]]
+            tumor_simplex_id = [sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"]
 
             normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)]
             normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)]
             normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]]
             normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"]
-            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0]  + "-CURATED-SIMPLEX"]
+            normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"]
 
             genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id
 
@@ -634,9 +634,7 @@ def construct_sample_inputs(self, sample_info):
                 matched_normal_id = [""]
             else:
                 matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)]
-                matched_normal_id = [
-                    sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]
-                ]
+                matched_normal_id = [sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]]
                 genotyping_bams_ids += matched_normal_id
                 genotyping_bams += matched_normal
 
@@ -654,12 +652,8 @@ def construct_sample_inputs(self, sample_info):
                         genotyping_bams_ids.append(sample_id)
                         genotyping_bams.append(_create_cwl_bam_object(f.path))
                     if key == "curated_normal_bams":
-                        sample_id_duplex = (
-                            f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
-                        )
-                        sample_id_simplex = (
-                            f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
-                        )
+                        sample_id_duplex = f[0].file_name.split("_")[0] + "-CURATED-DUPLEX"
+                        sample_id_simplex = f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX"
                         genotyping_bams_ids.append(sample_id_duplex)
                         genotyping_bams_ids.append(sample_id_simplex)
                         genotyping_bams.append(_create_cwl_bam_object(f[0].path))
@@ -726,7 +720,7 @@ def get_jobs(self):
             if is_tumor_bam(b["fgbio_filter_consensus_reads_duplex_bam"].file_name)
         ]
         fillout_unfiltered_normals = [
-            b["uncollapsed_bam"] for b in bams if not is_tumor_bam(b["uncollapsed_bam"].file_name)
+            b["fgbio_collapsed_bam"] for b in bams if not is_tumor_bam(b["fgbio_collapsed_bam"].file_name)
         ]
 
         # NORMAL BAM
@@ -774,7 +768,9 @@ def get_jobs(self):
                 settings.CMO_SAMPLE_NAME_METADATA_KEY: sample,
             }
             job_json = {
-                "name": "ACCESS V2 LEGACY SNV {request_id}: {run_date}".format(request_id=self.request_id, run_date=run_date),
+                "name": "ACCESS V2 LEGACY SNV {request_id}: {run_date}".format(
+                    request_id=self.request_id, run_date=run_date
+                ),
                 "app": app,
                 "inputs": sample_input,
                 "tags": job_tags,

From f26266290c90350c2a0e8c5d764d5f93adb0b9a3 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Tue, 25 Feb 2025 13:23:11 -0500
Subject: [PATCH 14/16] Update input_template.json.jinja2: point hotspots and blacklist_file at v2.0 resources

---
 .../access/v2_1_0/snps_and_indels/input_template.json.jinja2  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
index 0dda14c3d..821cc9a47 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
+++ b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
@@ -94,11 +94,11 @@
     },
     "hotspots": {
         "class": "File",
-        "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/hotspot-list-union-v1-v2_with_TERT.txt"
+        "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/hotspot-list-union-v1-v2_with_TERT.txt"
     },
     "blacklist_file": {
         "class": "File",
-        "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/ACCESS_blocklist_26_10_2022.txt"
+        "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/ACCESSv2_blocklist_26_10_2022.txt"
     },
     "custom_enst_file": {
         "class": "File",

From cf9854e6f4d7014c649a0fec4f1ce8f3462e06d8 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Tue, 25 Feb 2025 13:23:26 -0500
Subject: [PATCH 15/16] Update input_template.json.jinja2: point custom_enst_file at v2.0 resources

---
 .../access/v2_1_0/snps_and_indels/input_template.json.jinja2    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
index 821cc9a47..3f998ca12 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
+++ b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2
@@ -102,7 +102,7 @@
     },
     "custom_enst_file": {
         "class": "File",
-        "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/dmp_ACCESS-panelA-v1-isoform-overrides"
+        "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/dmp_isoform_merged_overrides.txt"
     },
     "bed_file": {
         "class": "File",

From ebf9e23345a2efefced8e5f56bd77c3b56c112e5 Mon Sep 17 00:00:00 2001
From: buehlere <buehlere@mskcc.org>
Date: Tue, 25 Feb 2025 14:33:01 -0500
Subject: [PATCH 16/16] Update snps_and_indels.py: remove BAM_STEM constant

---
 runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
index adccd9d64..bdecf1177 100755
--- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
+++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py
@@ -26,7 +26,6 @@
 LOGGER = logging.getLogger(__name__)
 ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals"
 ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R"
-BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX"
 NORMAL_SAMPLE_SEARCH = "-N0"
 TUMOR_SAMPLE_SEARCH = "-L0"
 DUPLEX_BAM_SEARCH = "__aln_srt_IR_FX-duplex.bam"