From 95ec94d84db4e7fb10e2cc8635c864e781208298 Mon Sep 17 00:00:00 2001 From: Allan Bolipata Date: Fri, 7 Feb 2025 13:33:12 -0500 Subject: [PATCH 01/16] nucleo sv progress --- runner/operator/access/__init__.py | 73 +++++++--- .../v2_1_0/structural_variants/__init__.py | 126 ++++++++++++++++++ .../input_template.json.jinja2 | 20 +++ 3 files changed, 204 insertions(+), 15 deletions(-) mode change 100644 => 100755 runner/operator/access/__init__.py create mode 100755 runner/operator/access/v2_1_0/structural_variants/__init__.py create mode 100755 runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py old mode 100644 new mode 100755 index 2ffa3f994..77a3d6f7f --- a/runner/operator/access/__init__.py +++ b/runner/operator/access/__init__.py @@ -29,29 +29,35 @@ def get_request_id(run_ids, request_id=None): raise Exception("Could not get find request id") -def get_request_id_runs(request_id): +def get_request_id_runs(app, run_ids, request_id): """ Get the latest completed bam-generation runs for the given request ID :param request_id: str - IGO request ID :return: List[str] - List of most recent runs from given request ID """ - operator_run_id = ( - Run.objects.filter( - tags__igoRequestId=request_id, - app__name__in=["access legacy", "access nucleo"], - operator_run__status=RunStatus.COMPLETED, + + + if not request_id: + most_recent_runs_for_request = Run.objects.filter(pk__in=run_ids, status=RunStatus.COMPLETED) + request_id = RunStatus[0].tags["igoRequestId"] + else: + most_recent_runs_for_request = ( + Run.objects.filter( + tags__igoRequestId=request_id, + app__name__in=app, + status=RunStatus.COMPLETED, + operator_run__status=RunStatus.COMPLETED, + ) + .order_by("-created_date") + .first() + .operator_run.runs.all() + .filter(status=RunStatus.COMPLETED) ) - .exclude(finished_date__isnull=True) - .order_by("-finished_date") - .first() - .operator_run_id - ) + if not len(most_recent_runs_for_request): + raise Exception("No matching Nucleo runs found for request {}".format(request_id)) - request_id_runs = Run.objects.filter( - operator_run_id=operator_run_id, app__name__in=["access legacy", "access nucleo"], status=RunStatus.COMPLETED - ) - return request_id_runs + return most_recent_runs_for_request, request_id def create_cwl_file_object(file_path): @@ -180,3 +186,40 @@ def get_unfiltered_matched_normal(patient_id, request_id=None): logger.warning(msg) return unfiltered_matched_normal_bam, unfiltered_matched_normal_sample_id + + +def parse_nucleo_output_ports(run, port_name): + bam_bai = Port.objects.get(name=port_name, run=run.pk) + if not len(bam_bai.files.all()) in [1, 2]: + raise Exception("Port {} for run {} should have just 1 bam or 1 (bam/bai) pair".format(port_name, run.id)) + bam = [b for b in bam_bai.files.all() if b.file_name.endswith(".bam")][0] + return bam + +def find_request_bams(run): + """ + Find simplex and duplex bams from a request's nucleo run + - run_ids: run_ids from a request's nucleo run + + :return: list of paired simplex and duplex bams and normal bam + """ + nucleo_output_port_names = [ + "uncollapsed_bam", + "fgbio_group_reads_by_umi_bam", + "fgbio_collapsed_bam", + "fgbio_filter_consensus_reads_duplex_bam", + "fgbio_postprocessing_simplex_bam", + ] + bams = {} + for o in nucleo_output_port_names: + # We are running a multi-sample workflow on just one sample, + # so we create single-element lists here + bam = parse_nucleo_output_ports(run, o) + bams[o] = bam + + return bams + +def is_tumor_bam(file): + if not file.endswith(".bam"): + return False + t_n_timepoint = file.split("-")[2] + return not t_n_timepoint[0] == "N" \ No newline at end of file diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py new file mode 100755 index 000000000..7a7440dc8 --- /dev/null +++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py @@ -0,0 +1,126 @@ +import os +import json +import logging + +from django.conf import settings +from jinja2 import Template +from runner.models import Port, RunStatus +from runner.operator.operator import Operator +from runner.run.objects.run_creator_object import RunCreator +from file_system.repository.file_repository import File +from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam +from runner.models import RunStatus, Port, Run, Pipeline +from datetime import datetime +# DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam" +BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam" +logger = logging.getLogger(__name__) +WORKDIR = os.path.dirname(os.path.abspath(__file__)) +TUMOR_OR_NORMAL_SEARCH = "-L0" +SAMPLE_ID_SEP = "_cl_aln" +ACCESS_DEFAULT_SV_NORMAL_ID = "DONOR22-TP" +ACCESS_DEFAULT_SV_NORMAL_FILENAME = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR.bam" + + +class AccessV2LegacySVOperator(Operator): + """ + Operator for the ACCESS Legacy Structural Variants workflow: + + http://www.github.com/mskcc/access-pipeline/workflows/subworkflows/manta.cwl + + This Operator will search for Standard Bam files based on an IGO Request ID + """ + + @staticmethod + def is_tumor_bam(file): + if not file.file_name.endswith(".bam"): + return False + t_n_timepoint = file.file_name.split("-")[2] + return not t_n_timepoint[0] == "N" + + def get_sample_inputs(self, runs): + """ + Create all sample inputs for all runs triggered in this instance of the operator + + :return: list of json_objects + """ + bams = [] + for run in runs: + bams.append(find_request_bams(run)) + + # TUMOR + standard_tumor_bams = [ + b["uncollapsed_bam"] + for b in bams + if is_tumor_bam(b["uncollapsed_bam"].file_name) + ] + + # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + # sample_ids = [f.file_name.split("_cl_aln")[0] for f in standard_tumor_bams] + sample_ids = [b.file_name.replace(BAM_STEM, "") for b in standard_tumor_bams] + normal_bam = File.objects.filter(file_name=ACCESS_DEFAULT_SV_NORMAL_FILENAME) + normal_bam = normal_bam[0] + + sample_inputs = [] + for i, b in enumerate(standard_tumor_bams): + sample_input = self.construct_sample_inputs(sample_ids[i], b, normal_bam) + sample_inputs.append(sample_input) + + return sample_inputs + + def get_jobs(self): + """ + Convert job inputs into serialized jobs + + :return: list[(serialized job info, Job)] + """ + app = self.get_pipeline_id() + pipeline = Pipeline.objects.get(id=app) + # output_directory = pipeline.output_directory + run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f") + # If no request_id, get request id from run information + # else request_id given directly + runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id) + sample_inputs = self.get_sample_inputs(runs) + + return [ + RunCreator( + **{ + "name": "ACCESS LEGACY SV M1: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)), + "app": self.get_pipeline_id(), + "inputs": job, + "tags": { + settings.REQUEST_ID_METADATA_KEY: self.request_id, + "cmoSampleIds": job["sv_sample_id"], + settings.PATIENT_ID_METADATA_KEY: "-".join(job["sv_sample_id"][0].split("-")[0:2]), + }, + } + ) + for i, job in enumerate(sample_inputs) + ] + + def construct_sample_inputs(self, tumor_sample_id, tumor_bam, normal_bam): + """ + Use sample metadata and json template to create inputs for the CWL run + + :return: JSON format sample inputs + """ + with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file: + template = Template(file.read()) + + tumor_sample_names = [tumor_sample_id] + tumor_path = tumor_bam.path.replace("file://", "juno://") + if not tumor_path.startswith("juno://"): + tumor_path = "juno://" + tumor_path + tumor_bams = [{"class": "File", "location": tumor_path}] + + normal_bam = create_cwl_file_object(normal_bam.path) + + input_file = template.render( + tumor_sample_id=tumor_sample_id, + tumor_sample_names=json.dumps(tumor_sample_names), + tumor_bams=json.dumps(tumor_bams), + normal_bam=json.dumps(normal_bam), + ) + + sample_input = json.loads(input_file) + return sample_input diff --git a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 new file mode 100755 index 000000000..e03a40610 --- /dev/null +++ b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 @@ -0,0 +1,20 @@ +{ + "version": "1.3.40", + "project_name": "{{ tumor_sample_id }}", + "ref_fasta": { + "class": "File", + "location": "juno:///juno/work/access/production/resources/reference/versions/hg19/Homo_sapiens_assembly19.fasta" + }, + + "sv_sample_id": {{ tumor_sample_names }}, + "sv_tumor_bams": {{ tumor_bams }}, + "sv_normal_bam": {{ normal_bam }}, + + "sv_run_tools": { + "java_8": "/opt/common/CentOS_6/java/jdk1.8.0_31/bin/java", + "r_path": "/opt/common/CentOS_6-dev/R/R-3.5.0/bin/Rscript", + "manta_python": "/home/accessbot/miniconda3/envs/ACCESS_1.3.36/bin/python2", + "manta": "/opt/common/CentOS_6-dev/manta/1.5.0", + "sv_repo": "/work/access/production/resources/tools/ACCESS-SV/versions/ACCESS_SV" + } +} \ No newline at end of file From 397fa6e25cf4a0f5d0f7e8245fc0c402e604ffe3 Mon Sep 17 00:00:00 2001 From: Allan Bolipata Date: Mon, 10 Feb 2025 16:29:33 -0500 Subject: [PATCH 02/16] staring msi and cnv --- runner/operator/access/v2_1_0/cnv/__init__.py | 125 ++++++++++++++++ .../v2_1_0/cnv/input_template.json.jinja2 | 27 ++++ runner/operator/access/v2_1_0/msi/__init__.py | 135 ++++++++++++++++++ .../v2_1_0/msi/input_template.json.jinja2 | 18 +++ .../v2_1_0/structural_variants/__init__.py | 2 +- .../input_template.json.jinja2 | 3 +- 6 files changed, 308 insertions(+), 2 deletions(-) create mode 100644 runner/operator/access/v2_1_0/cnv/__init__.py create mode 100644 runner/operator/access/v2_1_0/cnv/input_template.json.jinja2 create mode 100644 runner/operator/access/v2_1_0/msi/__init__.py create mode 100644 runner/operator/access/v2_1_0/msi/input_template.json.jinja2 diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py new file mode 100644 index 000000000..46d930b89 --- /dev/null +++ b/runner/operator/access/v2_1_0/cnv/__init__.py @@ -0,0 +1,125 @@ +import os +import json +import logging +from jinja2 import Template + +from django.conf import settings +from runner.models import Port, RunStatus +from runner.operator.operator import Operator +from runner.run.objects.run_creator_object import RunCreator +from file_system.repository.file_repository import FileRepository +from runner.operator.access import get_request_id, get_request_id_runs + + +logger = logging.getLogger(__name__) + +SAMPLE_ID_SEP = "_cl_aln" +TUMOR_SEARCH = "-L0" +NORMAL_SEARCH = "-N0" +WORKDIR = os.path.dirname(os.path.abspath(__file__)) +ACCESS_DEFAULT_CNV_NORMAL_FILENAME = r"DONOR22-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam$" +UNFILTERED_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam" + + +class AccessLegacyCNVOperator(Operator): + """ + Operator for the ACCESS Legacy Copy Number Variants workflow: + + http://www.github.com/mskcc/access-pipeline/workflows/subworkflows/call_cnv.cwl + + This Operator will search for ACCESS Unfiltered Bam files based on an IGO Request ID. + """ + + @staticmethod + def is_tumor_bam(file): + if not file.file_name.endswith(".bam"): + return False + t_n_timepoint = file.file_name.split("-")[2] + return not t_n_timepoint[0] == "N" + + def get_sample_inputs(self): + """ + Create all sample inputs for all runs triggered in this instance of the operator. + + :return: list of json_objects + """ + run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] + + # Get all unfiltered bam ports for these runs + unfiltered_bam_ports = Port.objects.filter( + name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED + ) + + unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + + sample_ids = [] + tumor_bams = [] + sample_sexes = [] + + for tumor_bam in unfiltered_tumor_bams: + sample_id = tumor_bam.file_name.split("_cl_aln")[0] + # Use the initial fastq metadata to get the sex of the sample + # Todo: Need to store this info on the bams themselves + tumor_fastqs = FileRepository.filter( + file_type="fastq", + metadata={"tumorOrNormal": "Tumor", settings.CMO_SAMPLE_NAME_METADATA_KEY: sample_id}, + filter_redact=True, + ) + sample_sex = tumor_fastqs[0].metadata["sex"] + tumor_bams.append(tumor_bam) + sample_sexes.append(sample_sex) + sample_ids.append(sample_id) + + sample_inputs = [ + self.construct_sample_inputs(tumor_bams[i], sample_sexes[i]) for i in range(0, len(tumor_bams)) + ] + + return sample_inputs, sample_ids + + def get_jobs(self): + """ + Convert job inputs into serialized jobs + + :return: list[(serialized job info, Job)] + """ + self.request_id = get_request_id(self.run_ids, self.request_id) + inputs, sample_ids = self.get_sample_inputs() + + return [ + ( + RunCreator( + **{ + "name": "ACCESS LEGACY CNV M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)), + "app": self.get_pipeline_id(), + "inputs": job, + "tags": { + settings.REQUEST_ID_METADATA_KEY: self.request_id, + "cmoSampleIds": sample_ids[i], + settings.PATIENT_ID_METADATA_KEY: "-".join(sample_ids[i].split("-")[0:2]), + }, + } + ) + ) + for i, job in enumerate(inputs) + ] + + def construct_sample_inputs(self, tumor_bam, sample_sex): + """ + Use sample metadata and json template to create inputs for the CWL run + + :return: JSON format sample inputs + """ + with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file: + template = Template(file.read()) + + tumor_sample_list = tumor_bam.path + "\t" + sample_sex + # Todo: need this to work with Nucleo bams: + tumor_sample_id = tumor_bam.file_name.split("_cl_aln_srt_MD_IR_FX_BR")[0] + + input_file = template.render( + tumor_sample_id=tumor_sample_id, + tumor_sample_list_content=json.dumps(tumor_sample_list), + ) + + sample_input = json.loads(input_file) + return sample_input diff --git a/runner/operator/access/v2_1_0/cnv/input_template.json.jinja2 b/runner/operator/access/v2_1_0/cnv/input_template.json.jinja2 new file mode 100644 index 000000000..9f7570649 --- /dev/null +++ b/runner/operator/access/v2_1_0/cnv/input_template.json.jinja2 @@ -0,0 +1,27 @@ +{ + "project_name": "{{ tumor_sample_id }}", + "tumor_sample_list": { + "contents": {{ tumor_sample_list_content }}, + "basename": "tumor_manifest.txt", + "class": "File" + }, + "normal_sample_list": { + "class": "File", + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/novaseq_curated_unfiltered_bams_dmp/versions/v1.0/normal_manifest_access_v2_plasma.txt" + }, + "threads": 8, + "tmp_dir": "/scratch", + "targets_coverage_bed": { + "class": "File", + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/ACCESSv2_targets_coverage.bed" + }, + "targets_coverage_annotation": { + "class": "File", + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/ACCESSv2_targets_coverage.txt" + }, + "reference_fasta": { + "class": "File", + "location": "juno:///juno/work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta" + }, + "version": "1.3.40" +} diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py new file mode 100644 index 000000000..c9e5cb71d --- /dev/null +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -0,0 +1,135 @@ +"""""" """""" """""" """""" """"" +" ACCESS-Pipeline MSI workflow operator +" http://www.github.com/mskcc/access-pipeline/workflows/msi.cwl +""" """""" """""" """""" """""" "" + +import os +import json +import logging +from jinja2 import Template +from django.conf import settings +from file_system.models import File +from runner.operator.operator import Operator +from runner.run.objects.run_creator_object import RunCreator +from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object +from runner.models import Port, RunStatus + + +logger = logging.getLogger(__name__) + +# Todo: needs to work for Nucleo bams as well +SAMPLE_ID_SEP = "_cl_aln" +TUMOR_SEARCH = "-L0" +TUMOR_SEARCH_NEW = "_L0" +NORMAL_SEARCH = "-N0" +NORMAL_SEARCH_NEW = "_N0" +STANDARD_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR.bam" +WORKDIR = os.path.dirname(os.path.abspath(__file__)) + + +class AccessLegacyMSIOperator(Operator): + """ + Operator for the ACCESS Legacy Microsatellite Instability workflow: + + http://www.github.com/mskcc/access-pipeline/workflows/subworkflows/msi.cwl + + This Operator will search for ACCESS Standard Bam files based on an IGO Request ID. It will + also find the matched normals based on the patient ID. + """ + + @staticmethod + def is_tumor_bam(file): + # Todo: extract to common fn across 4 downstream operators + if not file.file_name.endswith(".bam"): + return False + t_n_timepoint = file.file_name.split("-")[2] + return not t_n_timepoint[0] == "N" + + def get_sample_inputs(self): + """ + Create all sample inputs for all runs triggered in this instance of the operator. + + :return: list of json_objects + """ + run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] + + # Get all standard bam ports for these runs + standard_bam_ports = Port.objects.filter( + name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED + ) + + standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + + # Dictionary that associates tumor bam with standard bam with tumor_sample_id + sample_tumor_normal = {} + for standard_tumor_bam in standard_tumor_bams: + tumor_sample_id = standard_tumor_bam.file_name.split("_cl_aln")[0] + patient_id = "-".join(tumor_sample_id.split("-")[0:2]) + + # Find the matched Normal Standard bam (which could be associated with a different request_id) + sample_search_start = patient_id + NORMAL_SEARCH + matched_normal_bam = File.objects.filter( + file_name__startswith=sample_search_start, file_name__endswith=STANDARD_BAM_SEARCH + ) + if not len(matched_normal_bam) > 0: + msg = "No matching standard normal Bam found for patient {}".format(patient_id) + logger.warning(msg) + continue + + matched_normal_bam = matched_normal_bam.order_by("-created_date").first() + + sample_tumor_normal[tumor_sample_id] = {"normal": matched_normal_bam, "tumor": standard_tumor_bam} + + sample_inputs = [ + self.construct_sample_inputs(key, value["tumor"], value["normal"]) + for key, value in sample_tumor_normal.items() + ] + + return sample_inputs + + def get_jobs(self): + """ + Convert job inputs into serialized jobs + + :return: list[(serialized job info, Job)] + """ + self.request_id = get_request_id(self.run_ids, self.request_id) + inputs = self.get_sample_inputs() + + return [ + RunCreator( + **{ + "name": "ACCESS LEGACY MSI M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)), + "app": self.get_pipeline_id(), + "inputs": job, + "tags": { + settings.REQUEST_ID_METADATA_KEY: self.request_id, + "cmoSampleIds": job["sample_name"], + settings.PATIENT_ID_METADATA_KEY: "-".join(job["sample_name"][0].split("-")[0:2]), + }, + } + ) + for i, job in enumerate(inputs) + ] + + def construct_sample_inputs(self, sample_name, tumor_bam, matched_normal_bam): + """ + Use sample metadata and json template to create inputs for the CWL run + + :return: JSON format sample inputs + """ + with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file: + template = Template(file.read()) + + sample_names = [sample_name] + matched_normal_bams = [create_cwl_file_object(matched_normal_bam.path)] + tumor_bams = [create_cwl_file_object(tumor_bam.path)] + + input_file = template.render( + tumor_bams=json.dumps(tumor_bams), + normal_bams=json.dumps(matched_normal_bams), + sample_names=json.dumps(sample_names), + ) + + sample_input = json.loads(input_file) + return sample_input diff --git a/runner/operator/access/v2_1_0/msi/input_template.json.jinja2 b/runner/operator/access/v2_1_0/msi/input_template.json.jinja2 new file mode 100644 index 000000000..84b5421c3 --- /dev/null +++ b/runner/operator/access/v2_1_0/msi/input_template.json.jinja2 @@ -0,0 +1,18 @@ +{ + "file_path": "/home/accessbot/miniconda3/envs/ACCESS_cmplx_geno_test/lib/python2.7/site-packages/cwl_tools/msi", + "project_name": "MSI_test", + "tumor_bam": {{tumor_bams}}, + "normal_bam": {{normal_bams}}, + "sample_name": {{sample_names}}, + "threads": 4, + "microsatellites": { + "class": "File", + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/microsatellites.list" + }, + "model": { + "class": "File", + "location": "juno:///juno/work/access/production/resources/admie/versions/v1.0/ADMIE.joblib" + }, + "tmp_dir": "/scratch", + "version": "1.3.23-13-g07e0d12" +} diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py index 7a7440dc8..d4086995c 100755 --- a/runner/operator/access/v2_1_0/structural_variants/__init__.py +++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py @@ -17,7 +17,7 @@ WORKDIR = os.path.dirname(os.path.abspath(__file__)) TUMOR_OR_NORMAL_SEARCH = "-L0" SAMPLE_ID_SEP = "_cl_aln" -ACCESS_DEFAULT_SV_NORMAL_ID = "DONOR22-TP" +ACCESS_DEFAULT_SV_NORMAL_ID = "Donor19F21c2206-TP01" ACCESS_DEFAULT_SV_NORMAL_FILENAME = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR.bam" diff --git a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 index e03a40610..f465530de 100755 --- a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 +++ b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 @@ -1,9 +1,10 @@ { "version": "1.3.40", "project_name": "{{ tumor_sample_id }}", + #TODO change reference "ref_fasta": { "class": "File", - "location": "juno:///juno/work/access/production/resources/reference/versions/hg19/Homo_sapiens_assembly19.fasta" + "location": "juno:///juno/work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta" }, "sv_sample_id": {{ tumor_sample_names }}, From 62101f448b1770c6d005a165cd1de7242ee00da6 Mon Sep 17 00:00:00 2001 From: buehlere Date: Tue, 11 Feb 2025 11:11:49 -0500 Subject: [PATCH 03/16] adding msi and cnv --- runner/operator/access/__init__.py | 1 + runner/operator/access/v2_1_0/cnv/__init__.py | 31 +++++++++++-------- runner/operator/access/v2_1_0/msi/__init__.py | 26 +++++++++++----- .../v2_1_0/structural_variants/__init__.py | 2 +- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py index 77a3d6f7f..7ff2ce1ed 100755 --- a/runner/operator/access/__init__.py +++ b/runner/operator/access/__init__.py @@ -3,6 +3,7 @@ from django.conf import settings from runner.models import Run, RunStatus, Port from file_system.models import File, FileMetadata +from runner.operator.access import get_request_id_runs logger = logging.getLogger(__name__) diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py index 46d930b89..a7e4603a4 100644 --- a/runner/operator/access/v2_1_0/cnv/__init__.py +++ b/runner/operator/access/v2_1_0/cnv/__init__.py @@ -8,17 +8,12 @@ from runner.operator.operator import Operator from runner.run.objects.run_creator_object import RunCreator from file_system.repository.file_repository import FileRepository -from runner.operator.access import get_request_id, get_request_id_runs +from runner.operator.access import get_request_id_runs, find_request_bams, is_tumor_bam logger = logging.getLogger(__name__) -SAMPLE_ID_SEP = "_cl_aln" -TUMOR_SEARCH = "-L0" -NORMAL_SEARCH = "-N0" WORKDIR = os.path.dirname(os.path.abspath(__file__)) -ACCESS_DEFAULT_CNV_NORMAL_FILENAME = r"DONOR22-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam$" -UNFILTERED_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam" class AccessLegacyCNVOperator(Operator): @@ -43,14 +38,25 @@ def get_sample_inputs(self): :return: list of json_objects """ - run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] + runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id) + + bams = [] + for run in runs: + bams.append(find_request_bams(run)) + + # TUMOR + unfiltered_bam_ports = [ + b[["unfiltered_bams", "fgbio_collapsed_bam"]] + for b in bams + if is_tumor_bam(b["unfiltered_bams"].file_name) + ] # Get all unfiltered bam ports for these runs - unfiltered_bam_ports = Port.objects.filter( - name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED - ) + # unfiltered_bam_ports = Port.objects.filter( + # name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED + # ) - unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + # unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] sample_ids = [] tumor_bams = [] @@ -89,7 +95,7 @@ def get_jobs(self): ( RunCreator( **{ - "name": "ACCESS LEGACY CNV M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)), + "name": "ACCESS V2 LEGACY CNV M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)), "app": self.get_pipeline_id(), "inputs": job, "tags": { @@ -113,7 +119,6 @@ def construct_sample_inputs(self, tumor_bam, sample_sex): template = Template(file.read()) tumor_sample_list = tumor_bam.path + "\t" + sample_sex - # Todo: need this to work with Nucleo bams: tumor_sample_id = tumor_bam.file_name.split("_cl_aln_srt_MD_IR_FX_BR")[0] input_file = template.render( diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py index c9e5cb71d..d474cccb5 100644 --- a/runner/operator/access/v2_1_0/msi/__init__.py +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -11,8 +11,8 @@ from file_system.models import File from runner.operator.operator import Operator from runner.run.objects.run_creator_object import RunCreator -from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object from runner.models import Port, RunStatus +from runner.operator.access import get_request_id_runs, find_request_bams, is_tumor_bam, create_cwl_file_object logger = logging.getLogger(__name__) @@ -51,14 +51,26 @@ def get_sample_inputs(self): :return: list of json_objects """ - run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] + runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id) + + bams = [] + for run in runs: + bams.append(find_request_bams(run)) + + # TUMOR + standard_tumor_bams = [ + b[["standard_bams", "uncollapsed_bam"]] + for b in bams + if is_tumor_bam(b["unfiltered_bams"].file_name) + ] + # run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] - # Get all standard bam ports for these runs - standard_bam_ports = Port.objects.filter( - name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED - ) + # # Get all standard bam ports for these runs + # standard_bam_ports = Port.objects.filter( + # name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED + # ) - standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] # Dictionary that associates tumor bam with standard bam with tumor_sample_id sample_tumor_normal = {} diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py index d4086995c..60fea3adc 100755 --- a/runner/operator/access/v2_1_0/structural_variants/__init__.py +++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py @@ -85,7 +85,7 @@ def get_jobs(self): return [ RunCreator( **{ - "name": "ACCESS LEGACY SV M1: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)), + "name": "ACCESS V2 LEGACY SV M1: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)), "app": self.get_pipeline_id(), "inputs": job, "tags": { From 8453181ce67d9b8b11268363c01515d7fb17bde0 Mon Sep 17 00:00:00 2001 From: buehlere Date: Tue, 11 Feb 2025 12:49:32 -0500 Subject: [PATCH 04/16] lininting and class name udpate --- runner/operator/access/__init__.py | 9 +++++---- runner/operator/access/v2_1_0/cnv/__init__.py | 17 +++++++---------- runner/operator/access/v2_1_0/msi/__init__.py | 15 ++++++--------- .../v2_1_0/structural_variants/__init__.py | 19 ++++++++++++------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py index 7ff2ce1ed..4947235c1 100755 --- a/runner/operator/access/__init__.py +++ b/runner/operator/access/__init__.py @@ -38,10 +38,9 @@ def get_request_id_runs(app, run_ids, request_id): :return: List[str] - List of most recent runs from given request ID """ - if not request_id: - most_recent_runs_for_request = Run.objects.filter(pk__in=run_ids, status=RunStatus.COMPLETED) - request_id = RunStatus[0].tags["igoRequestId"] + most_recent_runs_for_request = Run.objects.filter(pk__in=run_ids, status=RunStatus.COMPLETED) + request_id = RunStatus[0].tags["igoRequestId"] else: most_recent_runs_for_request = ( Run.objects.filter( @@ -196,6 +195,7 @@ def parse_nucleo_output_ports(run, port_name): bam = [b for b in bam_bai.files.all() if b.file_name.endswith(".bam")][0] return bam + def find_request_bams(run): """ Find simplex and duplex bams from a request's nucleo run @@ -219,8 +219,9 @@ def find_request_bams(run): return bams + def is_tumor_bam(file): if not file.endswith(".bam"): return False t_n_timepoint = file.split("-")[2] - return not t_n_timepoint[0] == "N" \ No newline at end of file + return not t_n_timepoint[0] == "N" diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py index a7e4603a4..3bb8ebeb0 100644 --- a/runner/operator/access/v2_1_0/cnv/__init__.py +++ b/runner/operator/access/v2_1_0/cnv/__init__.py @@ -16,7 +16,7 @@ WORKDIR = os.path.dirname(os.path.abspath(__file__)) -class AccessLegacyCNVOperator(Operator): +class AccessV2LegacyCNVOperator(Operator): """ Operator for the ACCESS Legacy Copy Number Variants workflow: @@ -38,25 +38,23 @@ def get_sample_inputs(self): :return: list of json_objects """ - runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id) - + runs, self.request_id = get_request_id_runs( + ["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id + ) + bams = [] for run in runs: bams.append(find_request_bams(run)) # TUMOR - unfiltered_bam_ports = [ - b[["unfiltered_bams", "fgbio_collapsed_bam"]] - for b in bams - if is_tumor_bam(b["unfiltered_bams"].file_name) - ] + unfiltered_bam_ports = [b[["unfiltered_bams", "fgbio_collapsed_bam"]] for b in bams] # Get all unfiltered bam ports for these runs # unfiltered_bam_ports = Port.objects.filter( # name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED # ) - # unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if is_tumor_bam(f)] sample_ids = [] tumor_bams = [] @@ -88,7 +86,6 @@ def get_jobs(self): :return: list[(serialized job info, Job)] """ - self.request_id = get_request_id(self.run_ids, self.request_id) inputs, sample_ids = self.get_sample_inputs() return [ diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py index d474cccb5..a730505a8 100644 --- a/runner/operator/access/v2_1_0/msi/__init__.py +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -51,18 +51,16 @@ def get_sample_inputs(self): :return: list of json_objects """ - runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id) - + runs, self.request_id = get_request_id_runs( + ["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id + ) + bams = [] for run in runs: bams.append(find_request_bams(run)) # TUMOR - standard_tumor_bams = [ - b[["standard_bams", "uncollapsed_bam"]] - for b in bams - if is_tumor_bam(b["unfiltered_bams"].file_name) - ] + standard_bam_ports = [b[["standard_bams", "uncollapsed_bam"]] for b in bams] # run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] # # Get all standard bam ports for these runs @@ -70,7 +68,7 @@ def get_sample_inputs(self): # name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED # ) - # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] # Dictionary that associates tumor bam with standard bam with tumor_sample_id sample_tumor_normal = {} @@ -105,7 +103,6 @@ def get_jobs(self): :return: list[(serialized job info, Job)] """ - self.request_id = get_request_id(self.run_ids, self.request_id) inputs = self.get_sample_inputs() return [ diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py index 60fea3adc..7460168c2 100755 --- a/runner/operator/access/v2_1_0/structural_variants/__init__.py +++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py @@ -8,9 +8,16 @@ from runner.operator.operator import Operator from runner.run.objects.run_creator_object import RunCreator from file_system.repository.file_repository import File -from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam +from runner.operator.access import ( + get_request_id, + get_request_id_runs, + create_cwl_file_object, + find_request_bams, + is_tumor_bam, +) from runner.models import RunStatus, Port, Run, Pipeline from datetime import datetime + # DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam" BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam" logger = logging.getLogger(__name__) @@ -48,11 +55,7 @@ def get_sample_inputs(self, runs): bams.append(find_request_bams(run)) # TUMOR - standard_tumor_bams = [ - b["uncollapsed_bam"] - for b in bams - if is_tumor_bam(b["uncollapsed_bam"].file_name) - ] + standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["uncollapsed_bam"].file_name)] # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] # sample_ids = [f.file_name.split("_cl_aln")[0] for f in standard_tumor_bams] @@ -79,7 +82,9 @@ def get_jobs(self): run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f") # If no request_id, get request id from run information # else request_id given directly - runs, self.request_id = get_request_id_runs(["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id) + runs, self.request_id = get_request_id_runs( + ["access v2 nucleo", "access nucleo"], self.run_ids, self.request_id + ) sample_inputs = self.get_sample_inputs(runs) return [ From 1e9f45777adccdd86552b320e5409ca43ddf1364 Mon Sep 17 00:00:00 2001 From: Allan Bolipata Date: Tue, 11 Feb 2025 12:51:12 -0500 Subject: [PATCH 05/16] access helper import update --- runner/operator/access/__init__.py | 2 -- runner/operator/access/v2_1_0/msi/__init__.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) mode change 100644 => 100755 runner/operator/access/v2_1_0/msi/__init__.py diff --git a/runner/operator/access/__init__.py b/runner/operator/access/__init__.py index 7ff2ce1ed..82c0e5488 100755 --- a/runner/operator/access/__init__.py +++ b/runner/operator/access/__init__.py @@ -3,8 +3,6 @@ from django.conf import settings from runner.models import Run, RunStatus, Port from file_system.models import File, FileMetadata -from runner.operator.access import get_request_id_runs - logger = logging.getLogger(__name__) diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py old mode 100644 new mode 100755 index d474cccb5..bd8994eda --- a/runner/operator/access/v2_1_0/msi/__init__.py +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -12,7 +12,7 @@ from runner.operator.operator import Operator from runner.run.objects.run_creator_object import RunCreator from runner.models import Port, RunStatus -from runner.operator.access import get_request_id_runs, find_request_bams, is_tumor_bam, create_cwl_file_object +from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam logger = logging.getLogger(__name__) @@ -27,7 +27,7 @@ WORKDIR = os.path.dirname(os.path.abspath(__file__)) -class AccessLegacyMSIOperator(Operator): +class AccessLegacyV2MSIOperator(Operator): """ Operator for the ACCESS Legacy Microsatellite Instability workflow: From f0c66c481697afb3a9975eb22aa03129b4d11f6d Mon Sep 17 00:00:00 2001 From: Allan Bolipata Date: Wed, 12 Feb 2025 10:03:26 -0500 Subject: [PATCH 06/16] working operators, cnv hits v2 bug --- runner/operator/access/v2_1_0/cnv/__init__.py | 13 +++---------- runner/operator/access/v2_1_0/msi/__init__.py | 17 +++-------------- .../v2_1_0/structural_variants/__init__.py | 14 +++----------- .../input_template.json.jinja2 | 1 - 4 files changed, 9 insertions(+), 36 deletions(-) mode change 100644 => 100755 runner/operator/access/v2_1_0/cnv/__init__.py diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py old mode 100644 new mode 100755 index 3bb8ebeb0..2f82900d3 --- a/runner/operator/access/v2_1_0/cnv/__init__.py +++ b/runner/operator/access/v2_1_0/cnv/__init__.py @@ -46,15 +46,8 @@ def get_sample_inputs(self): for run in runs: bams.append(find_request_bams(run)) - # TUMOR - unfiltered_bam_ports = [b[["unfiltered_bams", "fgbio_collapsed_bam"]] for b in bams] - - # Get all unfiltered bam ports for these runs - # unfiltered_bam_ports = Port.objects.filter( - # name__in=["unfiltered_bams", "fgbio_collapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED - # ) - - unfiltered_tumor_bams = [f for p in unfiltered_bam_ports for f in p.files.all() if is_tumor_bam(f)] + # TUMOR Unfiltered + unfiltered_tumor_bams = [b["fgbio_collapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)] sample_ids = [] tumor_bams = [] @@ -122,6 +115,6 @@ def construct_sample_inputs(self, tumor_bam, sample_sex): tumor_sample_id=tumor_sample_id, tumor_sample_list_content=json.dumps(tumor_sample_list), ) - + print(input_file) sample_input = json.loads(input_file) return sample_input diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py index 26b17497a..ecacc6888 100755 --- a/runner/operator/access/v2_1_0/msi/__init__.py +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -18,16 +18,13 @@ logger = logging.getLogger(__name__) # Todo: needs to work for Nucleo bams as well -SAMPLE_ID_SEP = "_cl_aln" TUMOR_SEARCH = "-L0" -TUMOR_SEARCH_NEW = "_L0" NORMAL_SEARCH = "-N0" -NORMAL_SEARCH_NEW = "_N0" STANDARD_BAM_SEARCH = "_cl_aln_srt_MD_IR_FX_BR.bam" WORKDIR = os.path.dirname(os.path.abspath(__file__)) -class AccessLegacyV2MSIOperator(Operator): +class AccessV2LegacyMSIOperator(Operator): """ Operator for the ACCESS Legacy Microsatellite Instability workflow: @@ -59,16 +56,8 @@ def get_sample_inputs(self): for run in runs: bams.append(find_request_bams(run)) - # TUMOR - standard_bam_ports = [b[["standard_bams", "uncollapsed_bam"]] for b in bams] - # run_ids = self.run_ids if self.run_ids else [r.id for r in get_request_id_runs(self.request_id)] - - # # Get all standard bam ports for these runs - # standard_bam_ports = Port.objects.filter( - # name__in=["standard_bams", "uncollapsed_bam"], run__id__in=run_ids, run__status=RunStatus.COMPLETED - # ) - - standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] + # TUMOR Uncollapsed + standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)] # Dictionary that associates tumor bam with standard bam with tumor_sample_id sample_tumor_normal = {} diff --git a/runner/operator/access/v2_1_0/structural_variants/__init__.py b/runner/operator/access/v2_1_0/structural_variants/__init__.py index 7460168c2..36cf86f22 100755 --- a/runner/operator/access/v2_1_0/structural_variants/__init__.py +++ b/runner/operator/access/v2_1_0/structural_variants/__init__.py @@ -4,27 +4,21 @@ from django.conf import settings from jinja2 import Template -from runner.models import Port, RunStatus from runner.operator.operator import Operator from runner.run.objects.run_creator_object import RunCreator from file_system.repository.file_repository import File from runner.operator.access import ( - get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam, ) -from runner.models import RunStatus, Port, Run, Pipeline +from runner.models import Pipeline from datetime import datetime -# DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam" BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam" logger = logging.getLogger(__name__) WORKDIR = os.path.dirname(os.path.abspath(__file__)) -TUMOR_OR_NORMAL_SEARCH = "-L0" -SAMPLE_ID_SEP = "_cl_aln" -ACCESS_DEFAULT_SV_NORMAL_ID = "Donor19F21c2206-TP01" ACCESS_DEFAULT_SV_NORMAL_FILENAME = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR.bam" @@ -54,11 +48,9 @@ def get_sample_inputs(self, runs): for run in runs: bams.append(find_request_bams(run)) - # TUMOR + # Standard TUMOR standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["uncollapsed_bam"].file_name)] - # standard_tumor_bams = [f for p in standard_bam_ports for f in p.files.all() if self.is_tumor_bam(f)] - # sample_ids = [f.file_name.split("_cl_aln")[0] for f in standard_tumor_bams] sample_ids = [b.file_name.replace(BAM_STEM, "") for b in standard_tumor_bams] normal_bam = File.objects.filter(file_name=ACCESS_DEFAULT_SV_NORMAL_FILENAME) normal_bam = normal_bam[0] @@ -126,6 +118,6 @@ def construct_sample_inputs(self, tumor_sample_id, tumor_bam, normal_bam): tumor_bams=json.dumps(tumor_bams), normal_bam=json.dumps(normal_bam), ) - + print(input_file) sample_input = json.loads(input_file) return sample_input diff --git a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 index f465530de..295355be3 100755 --- a/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 +++ b/runner/operator/access/v2_1_0/structural_variants/input_template.json.jinja2 @@ -1,7 +1,6 @@ { "version": "1.3.40", "project_name": "{{ tumor_sample_id }}", - #TODO change reference "ref_fasta": { "class": "File", "location": "juno:///juno/work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta" From 5eaa6017e9d7d8c06c851930ced8fda4380d8432 Mon Sep 17 00:00:00 2001 From: buehlere Date: Wed, 12 Feb 2025 10:04:37 -0500 Subject: [PATCH 07/16] linting --- runner/operator/access/v2_1_0/cnv/__init__.py | 6 ++++-- runner/operator/access/v2_1_0/msi/__init__.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/runner/operator/access/v2_1_0/cnv/__init__.py b/runner/operator/access/v2_1_0/cnv/__init__.py index 2f82900d3..5c483a375 100755 --- a/runner/operator/access/v2_1_0/cnv/__init__.py +++ b/runner/operator/access/v2_1_0/cnv/__init__.py @@ -46,8 +46,10 @@ def get_sample_inputs(self): for run in runs: bams.append(find_request_bams(run)) - # TUMOR Unfiltered - unfiltered_tumor_bams = [b["fgbio_collapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)] + # TUMOR Unfiltered + unfiltered_tumor_bams = [ + b["fgbio_collapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name) + ] sample_ids = [] tumor_bams = [] diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py index ecacc6888..6bcd47557 100755 --- a/runner/operator/access/v2_1_0/msi/__init__.py +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -12,7 +12,13 @@ from runner.operator.operator import Operator from runner.run.objects.run_creator_object import RunCreator from runner.models import Port, RunStatus -from runner.operator.access import get_request_id, get_request_id_runs, create_cwl_file_object, find_request_bams, is_tumor_bam +from runner.operator.access import ( + get_request_id, + get_request_id_runs, + create_cwl_file_object, + find_request_bams, + is_tumor_bam, +) logger = logging.getLogger(__name__) @@ -56,7 +62,7 @@ def get_sample_inputs(self): for run in runs: bams.append(find_request_bams(run)) - # TUMOR Uncollapsed + # TUMOR Uncollapsed standard_tumor_bams = [b["uncollapsed_bam"] for b in bams if is_tumor_bam(b["fgbio_collapsed_bam"].file_name)] # Dictionary that associates tumor bam with standard bam with tumor_sample_id From 4852c27a3d404b8c68d646557a8d521b034e6c1a Mon Sep 17 00:00:00 2001 From: Allan Bolipata Date: Thu, 13 Feb 2025 17:07:01 -0500 Subject: [PATCH 08/16] updated snv --- .../input_template.json.jinja2 | 20 --------- .../v2_1_0/snps_and_indels/snps_and_indels.py | 42 ++++++------------- 2 files changed, 13 insertions(+), 49 deletions(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 index d10b5d90a..0dda14c3d 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 +++ b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 @@ -112,26 +112,6 @@ "class": "File", "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/mutect_annotate_concat_header.txt" }, - "curated_duplex_bams": { - "novaseq": { - "class": "Directory", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/novaseq_curated_duplex_bams_dmp" - }, - "hiseq": { - "class": "Directory", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/hiseq4000_curated_duplex_bams_dmp" - } - }, - "curated_simplex_bams": { - "novaseq": { - "class": "Directory", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/novaseq_curated_simplex_bams_dmp" - }, - "hiseq": { - "class": "Directory", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/hiseq4000_curated_simplex_bams_dmp" - } - }, "reference_bam_for_VC": { "class": "File", "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/novaseq_unmatched_normal_plasma_duplex_bams_dmp/versions/v1.0/Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam" diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py index e6fcc7943..afc52201a 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py +++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py @@ -26,6 +26,7 @@ LOGGER = logging.getLogger(__name__) ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals_02" ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R" +BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX" NORMAL_SAMPLE_SEARCH = "-N0" TUMOR_SAMPLE_SEARCH = "-L0" DUPLEX_BAM_SEARCH = "__aln_srt_IR_FX-duplex.bam" @@ -42,7 +43,6 @@ DUPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam" SIMPLEX_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-simplex.bam" UNCOLLAPSED_BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR.bam" -BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX" ACCESS_DEFAULT_NORMAL_FILENAME_DUPLEX = ( "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam" ) @@ -460,14 +460,6 @@ def get_dmp_matched_patient_geno_samples(patient_id): metadata__type="T", file__path__endswith=DMP_REGEX, ) - # if not matched_tumors_dmp: - # matched_tumors_dmp = FileMetadata.objects.filter( - # file__file_group=DMP_FILE_GROUP, - # metadata__patient__cmo=patient_id.lstrip("C-"), - # metadata__assay="XS1", - # metadata__type="T", - # file__path__endswith=DMP_REGEX, - # ) matched_tumors_dmp_simplex = [b.file for b in matched_tumors_dmp] matched_tumors_dmp_duplex = copy.deepcopy(matched_tumors_dmp_simplex) @@ -681,27 +673,19 @@ def construct_sample_inputs(self, sample_info): tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)] tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)] - tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.replace(DUPLEX_BAM_STEM, "")] + tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] tumor_duplex_id = [ - sample_info["tumor_bam"][0][0].file_name.replace(BAM_STEM, "").replace("-duplex.bam", "") + sample_info["tumor_bam"][0][0].file_name.split("_")[0] ] tumor_simplex_id = [ - sample_info["tumor_bam"][0][1].file_name.replace(BAM_STEM, "").replace("-simplex.bam", "-SIMPLEX") + sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX" ] normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)] normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)] - normal_sample_name = [sample_info["normal_bam"][0][0].file_name.replace(DUPLEX_BAM_STEM, "")] - normal_duplex_id = [ - sample_info["normal_bam"][0][0] - .file_name.replace(BAM_STEM, "") - .replace("-duplex.bam", "-CURATED-DUPLEX") - ] - normal_simplex_id = [ - sample_info["normal_bam"][0][1] - .file_name.replace(BAM_STEM, "") - .replace("-simplex.bam", "-CURATED-SIMPLEX") - ] + normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]] + normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"] + normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id @@ -712,7 +696,7 @@ def construct_sample_inputs(self, sample_info): else: matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)] matched_normal_id = [ - sample_info["matched_normal_unfiltered"][0].file_name.replace(UNCOLLAPSED_BAM_STEM, "") + sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0] ] genotyping_bams_ids += matched_normal_id genotyping_bams += matched_normal @@ -721,22 +705,22 @@ def construct_sample_inputs(self, sample_info): for f in files: if key == "geno_samples": # TODO jsut do the replace here - sample_id_duplex = f[0].file_name.replace(BAM_STEM, "").replace("-duplex.bam", "") - sample_id_simplex = f[1].file_name.replace(BAM_STEM, "").replace("-simplex.bam", "-SIMPLEX") + sample_id_duplex = f[0].file_name.split("_")[0] + sample_id_simplex = f[1].file_name.split("_")[0] + "-SIMPLEX" genotyping_bams_ids.append(sample_id_duplex) genotyping_bams_ids.append(sample_id_simplex) genotyping_bams.append(_create_cwl_bam_object(f[0].path)) genotyping_bams.append(_create_cwl_bam_object(f[1].path)) if key == "geno_samples_normal_unfiltered": - sample_id = f.file_name.replace(UNCOLLAPSED_BAM_STEM, "") + sample_id = f.file_name.split("_")[0] genotyping_bams_ids.append(sample_id) genotyping_bams.append(_create_cwl_bam_object(f.path)) if key == "curated_normal_bams": sample_id_duplex = ( - f[0].file_name.replace(BAM_STEM, "").replace("-duplex.bam", "-CURATED-DUPLEX") + f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" ) sample_id_simplex = ( - f[1].file_name.replace(BAM_STEM, "").replace("-simplex.bam", "-CURATED-SIMPLEX") + f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" ) genotyping_bams_ids.append(sample_id_duplex) genotyping_bams_ids.append(sample_id_simplex) From f5ec280b72b60d733f987d5c4e22ea94a740ab93 Mon Sep 17 00:00:00 2001 From: buehlere Date: Thu, 13 Feb 2025 17:13:59 -0500 Subject: [PATCH 09/16] Update snps_and_indels.py --- .../v2_1_0/snps_and_indels/snps_and_indels.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py index afc52201a..d16334996 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py +++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py @@ -674,18 +674,14 @@ def construct_sample_inputs(self, sample_info): tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)] tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)] tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] - tumor_duplex_id = [ - sample_info["tumor_bam"][0][0].file_name.split("_")[0] - ] - tumor_simplex_id = [ - sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX" - ] + tumor_duplex_id = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] + tumor_simplex_id = [sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"] normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)] normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)] normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]] normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"] - normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] + normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id @@ -695,16 +691,13 @@ def construct_sample_inputs(self, sample_info): matched_normal_id = [""] else: matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)] - matched_normal_id = [ - sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0] - ] + matched_normal_id = [sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]] genotyping_bams_ids += matched_normal_id genotyping_bams += matched_normal for key, files in sample_info.items(): for f in files: if key == "geno_samples": - # TODO jsut do the replace here sample_id_duplex = f[0].file_name.split("_")[0] sample_id_simplex = f[1].file_name.split("_")[0] + "-SIMPLEX" genotyping_bams_ids.append(sample_id_duplex) @@ -716,12 +709,8 @@ def construct_sample_inputs(self, sample_info): genotyping_bams_ids.append(sample_id) genotyping_bams.append(_create_cwl_bam_object(f.path)) if key == "curated_normal_bams": - sample_id_duplex = ( - f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" - ) - sample_id_simplex = ( - f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" - ) + sample_id_duplex = f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" + sample_id_simplex = f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" genotyping_bams_ids.append(sample_id_duplex) genotyping_bams_ids.append(sample_id_simplex) genotyping_bams.append(_create_cwl_bam_object(f[0].path)) From 814b7e2fe6209704a05175ec8be2e0899744b368 Mon Sep 17 00:00:00 2001 From: Sinisa Ivkovic Date: Wed, 19 Feb 2025 16:22:39 -0500 Subject: [PATCH 10/16] Version bump 1.91.7 --- beagle/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beagle/__init__.py b/beagle/__init__.py index d522dec1c..7f346af82 100644 --- a/beagle/__init__.py +++ b/beagle/__init__.py @@ -1 +1 @@ -__version__ = "1.91.6" +__version__ = "1.91.7" From ec2daf7ce8de211738d85065c587083bd8634c23 Mon Sep 17 00:00:00 2001 From: buehlere Date: Mon, 24 Feb 2025 16:51:32 -0500 Subject: [PATCH 11/16] Update snps_and_indels.py --- .../v2_1_0/snps_and_indels/snps_and_indels.py | 116 +++++------------- 1 file changed, 34 insertions(+), 82 deletions(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py index d16334996..61f1300b0 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py +++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py @@ -24,7 +24,7 @@ WORKDIR = os.path.dirname(os.path.abspath(__file__)) LOGGER = logging.getLogger(__name__) -ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals_02" +ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals" ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R" BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX" NORMAL_SAMPLE_SEARCH = "-N0" @@ -51,6 +51,15 @@ ) +def check_genotype_list(genotyping_bams, genotyping_bams_ids): + if len(genotyping_bams_ids) != len(genotyping_bams): + raise Exception(f"list of genotyping bams: {genotyping_bams} is a different length from list of genotyping ids {genotyping_bams_ids}") + for id in genotyping_bams_ids: + for bam in genotyping_bams: + if not id in bam: + raise Exception(f"list of genotyping bams: {genotyping_bams} is a different order from list of genotyping ids {genotyping_bams_ids}") + return True + def register_file(file): fname = os.path.basename(file) file_group = FileGroup.objects.get(id=DMP_FILE_GROUP) @@ -185,7 +194,6 @@ def get_unfiltered_matched_normal(patient_id, fillout_unfiltered_normals, reques # Case 1 if request_id: - # Todo: Joining to Port -> Run makes this query slow, make use of output_metadata for requestId instead for bam in fillout_unfiltered_normals: if bam.file_name.startswith(patient_normals_search): unfiltered_matched_normal_bam = bam @@ -555,11 +563,10 @@ def create_sample_info( msg = "ACCESS SNV Operator Error: Duplex sample IDs not matched to Simplex sample IDs" raise Exception(msg) # Add in any DMP ACCESS samples - (dmp_matched_tumors_duplex, dmp_matched_tumors_simplex) = get_dmp_matched_patient_geno_samples(patient_id) - # TODO not flipping file name + # (dmp_matched_tumors_duplex, dmp_matched_tumors_simplex) = get_dmp_matched_patient_geno_samples(patient_id) - geno_samples_duplex = geno_samples_duplex # + dmp_matched_tumors_duplex - geno_samples_simplex = geno_samples_simplex # + dmp_matched_tumors_simplex + geno_samples_duplex = geno_samples_duplex + geno_samples_simplex = geno_samples_simplex geno_samples = make_pairs(geno_samples_duplex, geno_samples_simplex) sample_info = { "matched_normal_unfiltered": [matched_normal_unfiltered_bam], @@ -569,73 +576,6 @@ def create_sample_info( return sample_info - def mapping_bams(self, sample_info): - # sample_id,normal_path,duplex_path,simplex_path,type - # patient_id,sample_id,type,maf,standard_bam,standard_bai,duplex_bam,duplex_bai,simplex_bam,simplex_bai - bams = [] - aux_bams = [] - for key, value in sample_info.items(): - for v in value: - map = {} - if key == "tumor_bam": - map["patient_id"] = "null" - map["sample_id"] = v[0].file_name.replace(DUPLEX_BAM_STEM, "") - map["maf"] = "null" - map["standard_bam"] = "null" - map["standard_bai"] = "null" - map["duplex_bam"] = _create_cwl_bam_object(v[0].path) - map["duplex_bai"] = _create_cwl_bam_object(v[0].path.replace(".bam", ".bai")) - map["simplex_bam"] = _create_cwl_bam_object(v[1].path) - map["simplex_bai"] = _create_cwl_bam_object(v[1].path.replace(".bam", ".bai")) - map["type"] = "CASE" - bams.append(map) - if key == "normal_bam": - map["patient_id"] = "null" - map["sample_id"] = v[0].file_name.replace( - "-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam", "" - ) - map["maf"] = "null" - map["standard_bam"] = "null" - map["standard_bai"] = "null" - map["duplex_bam"] = _create_cwl_bam_object(v[0].path) - map["duplex_bai"] = _create_cwl_bam_object(v[0].path.replace(".bam", ".bai")) - map["simplex_bam"] = _create_cwl_bam_object(v[1].path) - map["simplex_bai"] = _create_cwl_bam_object(v[1].path.replace(".bam", ".bai")) - map["type"] = "CONTROL" - bams.append(map) - if key == "geno_samples": - # TODO jsut do the replace here - sample_id = v[0].file_name.replace(DUPLEX_BAM_STEM, "") - sample_id = sample_id.replace(DMP_DUPLEX_REGEX, "") - map["sample_id"] = sample_id - map["normal_path"] = "null" - map["duplex_path"] = _create_cwl_bam_object(v[0].path) - map["simplex_path"] = _create_cwl_bam_object(v[1].path) - map["type"] = "PLASMA" - aux_bams.append(map) - if key == "geno_samples_normal_unfiltered": - map["sample_id"] = v.file_name.replace(UNCOLLAPSED_BAM_STEM, "") - map["normal_path"] = _create_cwl_bam_object(v.path) - map["duplex_path"] = "null" - map["simplex_path"] = "null" - map["type"] = "UNMATCHED_NORMAL" - aux_bams.append(map) - if key == "curated_normal_bams": - map["sample_id"] = v[0].file_name.replace(DUPLEX_BAM_STEM, "") - map["normal_path"] = "null" - map["duplex_path"] = _create_cwl_bam_object(v[0].path) - map["simplex_path"] = _create_cwl_bam_object(v[1].path) - map["type"] = "CURATED" - aux_bams.append(map) - if key == "matched_normal_unfiltered": - map["sample_id"] = v.file_name.replace(UNCOLLAPSED_BAM_STEM, "") - map["normal_path"] = _create_cwl_bam_object(v.path) - map["duplex_path"] = "null" - map["simplex_path"] = "null" - map["type"] = "MATCHED_NORMAL" - aux_bams.append(map) - return (bams, aux_bams) - def get_request_id_runs(self, app): """ Get the latest completed bam-generation runs for the given request ID @@ -670,28 +610,33 @@ def construct_sample_inputs(self, sample_info): template = Template(file.read()) genotyping_bams = [] genotyping_bams_ids = [] - tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)] tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)] tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] - tumor_duplex_id = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] - tumor_simplex_id = [sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"] + tumor_duplex_id = [ + sample_info["tumor_bam"][0][0].file_name.split("_")[0] + ] + tumor_simplex_id = [ + sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX" + ] normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)] normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)] normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]] normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"] - normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] + normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id - genotyping_bams = normal_bam_duplex + normal_bam_simplex + tumor_bam_duplex + tumor_bam_simplex + genotyping_bams = tumor_bam_duplex + tumor_bam_simplex + normal_bam_duplex + normal_bam_simplex if sample_info["matched_normal_unfiltered"][0] == None: matched_normal_id = [""] else: matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)] - matched_normal_id = [sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]] + matched_normal_id = [ + sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0] + ] genotyping_bams_ids += matched_normal_id genotyping_bams += matched_normal @@ -709,13 +654,20 @@ def construct_sample_inputs(self, sample_info): genotyping_bams_ids.append(sample_id) genotyping_bams.append(_create_cwl_bam_object(f.path)) if key == "curated_normal_bams": - sample_id_duplex = f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" - sample_id_simplex = f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" + sample_id_duplex = ( + f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" + ) + sample_id_simplex = ( + f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" + ) genotyping_bams_ids.append(sample_id_duplex) genotyping_bams_ids.append(sample_id_simplex) genotyping_bams.append(_create_cwl_bam_object(f[0].path)) genotyping_bams.append(_create_cwl_bam_object(f[1].path)) + # check genotyping bams + check_genotype_list(genotyping_bams, genotyping_bams_ids) + input_file = template.render( tumor_bams=json.dumps(tumor_bam_duplex), normal_bams=json.dumps(normal_bam_duplex), @@ -822,7 +774,7 @@ def get_jobs(self): settings.CMO_SAMPLE_NAME_METADATA_KEY: sample, } job_json = { - "name": "ACCESS LEGACY SNV {sample}: {run_date}".format(sample=sample, run_date=run_date), + "name": "ACCESS V2 LEGACY SNV {request_id}: {run_date}".format(request_id=self.request_id, run_date=run_date), "app": app, "inputs": sample_input, "tags": job_tags, From 583eac47def2a0e9b1ff0762ccde4d721d71d4db Mon Sep 17 00:00:00 2001 From: buehlere Date: Mon, 24 Feb 2025 16:51:38 -0500 Subject: [PATCH 12/16] Update __init__.py --- runner/operator/access/v2_1_0/msi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/operator/access/v2_1_0/msi/__init__.py b/runner/operator/access/v2_1_0/msi/__init__.py index 6bcd47557..a6f2df21f 100755 --- a/runner/operator/access/v2_1_0/msi/__init__.py +++ b/runner/operator/access/v2_1_0/msi/__init__.py @@ -103,7 +103,7 @@ def get_jobs(self): return [ RunCreator( **{ - "name": "ACCESS LEGACY MSI M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)), + "name": "ACCESS V2 LEGACY MSI M1: %s, %i of %i" % (self.request_id, i + 1, len(inputs)), "app": self.get_pipeline_id(), "inputs": job, "tags": { From 46e06e15aa7ffec2d13c8810db420000cbb4d1ab Mon Sep 17 00:00:00 2001 From: buehlere Date: Tue, 25 Feb 2025 10:38:06 -0500 Subject: [PATCH 13/16] Update snps_and_indels.py --- .../v2_1_0/snps_and_indels/snps_and_indels.py | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py index 61f1300b0..adccd9d64 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py +++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py @@ -53,13 +53,17 @@ def check_genotype_list(genotyping_bams, genotyping_bams_ids): if len(genotyping_bams_ids) != len(genotyping_bams): - raise Exception(f"list of genotyping bams: {genotyping_bams} is a different length from list of genotyping ids {genotyping_bams_ids}") - for id in genotyping_bams_ids: - for bam in genotyping_bams: - if not id in bam: - raise Exception(f"list of genotyping bams: {genotyping_bams} is a different order from list of genotyping ids {genotyping_bams_ids}") + raise Exception( + f"list of genotyping bams: {genotyping_bams} is a different length from list of genotyping ids {genotyping_bams_ids}" + ) + for ix in range(len(genotyping_bams_ids)): + id = genotyping_bams_ids[ix].replace("-CURATED", "").replace("-DUPLEX", "").replace("-SIMPLEX", "") + bam = genotyping_bams[ix]["location"] + if id not in bam: + raise Exception(f"Sample ID, {id} does not match bam path: {bam}") return True + def register_file(file): fname = os.path.basename(file) file_group = FileGroup.objects.get(id=DMP_FILE_GROUP) @@ -565,8 +569,8 @@ def create_sample_info( # Add in any DMP ACCESS samples # (dmp_matched_tumors_duplex, dmp_matched_tumors_simplex) = get_dmp_matched_patient_geno_samples(patient_id) - geno_samples_duplex = geno_samples_duplex - geno_samples_simplex = geno_samples_simplex + geno_samples_duplex = geno_samples_duplex + geno_samples_simplex = geno_samples_simplex geno_samples = make_pairs(geno_samples_duplex, geno_samples_simplex) sample_info = { "matched_normal_unfiltered": [matched_normal_unfiltered_bam], @@ -613,18 +617,14 @@ def construct_sample_inputs(self, sample_info): tumor_bam_duplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][0].path)] tumor_bam_simplex = [_create_cwl_bam_object(sample_info["tumor_bam"][0][1].path)] tumor_sample_name = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] - tumor_duplex_id = [ - sample_info["tumor_bam"][0][0].file_name.split("_")[0] - ] - tumor_simplex_id = [ - sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX" - ] + tumor_duplex_id = [sample_info["tumor_bam"][0][0].file_name.split("_")[0]] + tumor_simplex_id = [sample_info["tumor_bam"][0][1].file_name.split("_")[0] + "-SIMPLEX"] normal_bam_duplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][0].path)] normal_bam_simplex = [_create_cwl_bam_object(sample_info["normal_bam"][0][1].path)] normal_sample_name = [sample_info["normal_bam"][0][0].file_name.split("_")[0]] normal_duplex_id = [sample_info["normal_bam"][0][0].file_name.split("_")[0] + "-CURATED-DUPLEX"] - normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] + normal_simplex_id = [sample_info["normal_bam"][0][1].file_name.split("_")[0] + "-CURATED-SIMPLEX"] genotyping_bams_ids = tumor_duplex_id + tumor_simplex_id + normal_duplex_id + normal_simplex_id @@ -634,9 +634,7 @@ def construct_sample_inputs(self, sample_info): matched_normal_id = [""] else: matched_normal = [_create_cwl_bam_object(sample_info["matched_normal_unfiltered"][0].path)] - matched_normal_id = [ - sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0] - ] + matched_normal_id = [sample_info["matched_normal_unfiltered"][0].file_name.split("_")[0]] genotyping_bams_ids += matched_normal_id genotyping_bams += matched_normal @@ -654,12 +652,8 @@ def construct_sample_inputs(self, sample_info): genotyping_bams_ids.append(sample_id) genotyping_bams.append(_create_cwl_bam_object(f.path)) if key == "curated_normal_bams": - sample_id_duplex = ( - f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" - ) - sample_id_simplex = ( - f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" - ) + sample_id_duplex = f[0].file_name.split("_")[0] + "-CURATED-DUPLEX" + sample_id_simplex = f[1].file_name.split("_")[0] + "-CURATED-SIMPLEX" genotyping_bams_ids.append(sample_id_duplex) genotyping_bams_ids.append(sample_id_simplex) genotyping_bams.append(_create_cwl_bam_object(f[0].path)) @@ -726,7 +720,7 @@ def get_jobs(self): if is_tumor_bam(b["fgbio_filter_consensus_reads_duplex_bam"].file_name) ] fillout_unfiltered_normals = [ - b["uncollapsed_bam"] for b in bams if not is_tumor_bam(b["uncollapsed_bam"].file_name) + b["fgbio_collapsed_bam"] for b in bams if not is_tumor_bam(b["fgbio_collapsed_bam"].file_name) ] # NORMAL BAM @@ -774,7 +768,9 @@ def get_jobs(self): settings.CMO_SAMPLE_NAME_METADATA_KEY: sample, } job_json = { - "name": "ACCESS V2 LEGACY SNV {request_id}: {run_date}".format(request_id=self.request_id, run_date=run_date), + "name": "ACCESS V2 LEGACY SNV {request_id}: {run_date}".format( + request_id=self.request_id, run_date=run_date + ), "app": app, "inputs": sample_input, "tags": job_tags, From f26266290c90350c2a0e8c5d764d5f93adb0b9a3 Mon Sep 17 00:00:00 2001 From: buehlere Date: Tue, 25 Feb 2025 13:23:11 -0500 Subject: [PATCH 14/16] Update input_template.json.jinja2 --- .../access/v2_1_0/snps_and_indels/input_template.json.jinja2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 index 0dda14c3d..821cc9a47 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 +++ b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 @@ -94,11 +94,11 @@ }, "hotspots": { "class": "File", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/hotspot-list-union-v1-v2_with_TERT.txt" + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/hotspot-list-union-v1-v2_with_TERT.txt" }, "blacklist_file": { "class": "File", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/ACCESS_blocklist_26_10_2022.txt" + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/ACCESSv2_blocklist_26_10_2022.txt" }, "custom_enst_file": { "class": "File", From cf9854e6f4d7014c649a0fec4f1ce8f3462e06d8 Mon Sep 17 00:00:00 2001 From: buehlere Date: Tue, 25 Feb 2025 13:23:26 -0500 Subject: [PATCH 15/16] Update input_template.json.jinja2 --- .../access/v2_1_0/snps_and_indels/input_template.json.jinja2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 index 821cc9a47..3f998ca12 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 +++ b/runner/operator/access/v2_1_0/snps_and_indels/input_template.json.jinja2 @@ -102,7 +102,7 @@ }, "custom_enst_file": { "class": "File", - "location": "juno:///juno/work/access/production/resources/msk-access/v1.0/regions_of_interest/versions/v1.0/dmp_ACCESS-panelA-v1-isoform-overrides" + "location": "juno:///juno/cmo/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/dmp_isoform_merged_overrides.txt" }, "bed_file": { "class": "File", From ebf9e23345a2efefced8e5f56bd77c3b56c112e5 Mon Sep 17 00:00:00 2001 From: buehlere Date: Tue, 25 Feb 2025 14:33:01 -0500 Subject: [PATCH 16/16] Update snps_and_indels.py --- runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py | 1 - 1 file changed, 1 deletion(-) diff --git a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py index adccd9d64..bdecf1177 100755 --- a/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py +++ b/runner/operator/access/v2_1_0/snps_and_indels/snps_and_indels.py @@ -26,7 +26,6 @@ LOGGER = logging.getLogger(__name__) ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals" ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R" -BAM_STEM = "_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX" NORMAL_SAMPLE_SEARCH = "-N0" TUMOR_SAMPLE_SEARCH = "-L0" DUPLEX_BAM_SEARCH = "__aln_srt_IR_FX-duplex.bam"