Updates to ONT workflows to bring them closer to current conventions (#…

…466) - Documentation updates for `SRWholeGenome.wdl` and `ONTWholeGenome.wdl` - `VariantUtils::MergeAndSortVCFs` now has a fall-back to bcftools concat with normal compression if `--naive` fails. - Fixed `VariantUtils::MergeAndSortVCFs` to use `lr-basic:0.1.2` - `CallVariantsONT.wdl` now takes a `ref_map` instead of reference fasta/fai/dict. - `CallVariantsONT.wdl` now filters out the `mt_chr_name` if `call_small_vars_on_mitochondria` is set to `false`.
broadinstitute · Aug 16, 2024 · 821e81b · 821e81b
1 parent 6e1a2cc
commit 821e81b
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 30 deletions.
diff --git a/wdl/pipelines/ILMN/VariantCalling/SRWholeGenome.wdl b/wdl/pipelines/ILMN/VariantCalling/SRWholeGenome.wdl
@@ -20,7 +20,7 @@ workflow SRWholeGenome {
     parameter_meta {
         aligned_bams:   "Array of aligned bam files to process."
         aligned_bais:   "Array of aligned bam indices to process.  Order must correspond to `aligned_bams`."
-        ref_map_file:  "Reference map file indicating reference sequence and auxillary file locations"
+        ref_map_file:    "Table indicating reference sequence, auxillary file locations, and metadata."
         participant_name:    "The unique identifier of this sample being processed."
 
         enable_hc_pileup_mode:  "If true, will enable `pileup mode` in HaplotypeCaller."
@@ -104,6 +104,7 @@ workflow SRWholeGenome {
         Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"]  ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task!
     }
 
+    # Read ref map into map data type so we can access its fields:
     Map[String, String] ref_map = read_map(ref_map_file)
 
     # gather across (potential multiple) input CCS BAMs

diff --git a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
@@ -16,7 +16,7 @@ workflow ONTWholeGenome {
         aligned_bais:       "GCS path to aligned BAM file indices"
         participant_name:   "name of the participant from whom these samples were obtained"
 
-        ref_map_file:       "table indicating reference sequence and auxillary file locations"
+        ref_map_file:       "Table indicating reference sequence, auxillary file locations, and metadata."
         gcs_out_root_dir:   "GCS bucket to store the reads, variants, and metrics files"
 
         call_svs:               "whether to call SVs"
@@ -60,6 +60,7 @@ workflow ONTWholeGenome {
         File? ref_scatter_interval_list_ids
     }
 
+    # Read ref map into map data type so we can access its fields:
     Map[String, String] ref_map = read_map(ref_map_file)
 
     String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTWholeGenome/~{participant_name}"
@@ -120,9 +121,7 @@ workflow ONTWholeGenome {
                 bam               = usable_bam,
                 bai               = usable_bai,
                 sample_id         = participant_name,
-                ref_fasta         = ref_map['fasta'],
-                ref_fasta_fai     = ref_map['fai'],
-                ref_dict          = ref_map['dict'],
+                ref_map_file      = ref_map_file,
                 tandem_repeat_bed = ref_map['tandem_repeat_bed'],
 
                 prefix = participant_name,

diff --git a/wdl/tasks/Utility/VariantUtils.wdl b/wdl/tasks/Utility/VariantUtils.wdl
@@ -108,13 +108,27 @@ task MergeAndSortVCFs {
         echo "==========================================================="
         echo "starting concatenation" && date
         echo "==========================================================="
+
+        # Try this first.  If it fails, do a regular compression.
+        set +e
         bcftools \
             concat \
             --naive \
             --threads ~{cores-1} \
             -f all_raw_vcfs.txt \
-            --output-type v \
             -o concatedated_raw.vcf.gz  # fast, at the expense of disk space
+        r=$?
+        if [[ $r -ne 0 ]] ; then
+            # Could not naively concatenate, so do a regular compression.
+            bcftools \
+                concat \
+                --threads ~{cores-1} \
+                -f all_raw_vcfs.txt \
+                --output-type z2 \
+                -o concatedated_raw.vcf.gz
+        fi
+        set -e
+
         for vcf in ~{sep=' ' vcfs}; do rm $vcf ; done
 
         # this is another bug in bcftools that's hot fixed but not in official release yet
@@ -153,6 +167,8 @@ task MergeAndSortVCFs {
         else
             mv concatedated_raw.vcf.gz tmp.wgs.vcf.gz
         fi
+
+        # Fix file header:
         bcftools reheader \
             --fai ~{ref_fasta_fai} \
             -o wgs_raw.vcf.gz \
@@ -168,7 +184,10 @@ task MergeAndSortVCFs {
             --output-type z \
             -o ~{prefix}.vcf.gz \
             wgs_raw.vcf.gz
+
+        # Index the output:
         bcftools index --tbi --force ~{prefix}.vcf.gz
+
         echo "==========================================================="
         echo "done sorting" && date
         echo "==========================================================="
@@ -187,7 +206,7 @@ task MergeAndSortVCFs {
         boot_disk_gb:       10,
         preemptible_tries:  1,
         max_retries:        0,
-        docker:             "us.gcr.io/broad-dsp-lrma/lr-basic:latest"
+        docker:             "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.2"
     }
     RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
     runtime {

diff --git a/wdl/tasks/VariantCalling/CallVariantsONT.wdl b/wdl/tasks/VariantCalling/CallVariantsONT.wdl
@@ -19,16 +19,18 @@ workflow CallVariants {
         minsvlen: "Minimum SV length in bp (default: 50)"
         prefix: "Prefix for output files"
         sample_id: "Sample ID"
-        ref_fasta: "Reference FASTA file"
-        ref_fasta_fai: "Reference FASTA index file"
-        ref_dict: "Reference FASTA dictionary file"
+
+        ref_map_file:       "Table indicating reference sequence, auxillary file locations, and metadata."
+
         call_svs: "Call structural variants"
         fast_less_sensitive_sv: "to trade less sensitive SV calling for faster speed"
         tandem_repeat_bed: "BED file containing TRF finder for better PBSV calls (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)"
+
         call_small_variants: "Call small variants"
         call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria"
         sites_vcf: "for use with Clair"
         sites_vcf_tbi: "for use with Clair"
+
         run_dv_pepper_analysis: "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)"
         dvp_threads: "number of threads for DV-Pepper"
         dvp_memory: "memory for DV-Pepper"
@@ -43,9 +45,7 @@ workflow CallVariants {
         String prefix
         String sample_id
 
-        File ref_fasta
-        File ref_fasta_fai
-        File ref_dict
+        File ref_map_file
 
         Boolean call_svs
         Boolean fast_less_sensitive_sv
@@ -63,6 +63,9 @@ workflow CallVariants {
         File? ref_scatter_interval_list_ids
     }
 
+    # Read ref map into map data type so we can access its fields:
+    Map[String, String] ref_map = read_map(ref_map_file)
+
     ######################################################################
     # Block for small variants handling
     ######################################################################
@@ -73,10 +76,10 @@ workflow CallVariants {
     if (call_small_variants) {
         # Scatter by chromosome
         Array[String] default_filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV']
-        Array[String] use_filter = if (call_small_vars_on_mitochondria) then default_filter else flatten([['chrM'],default_filter])
+        Array[String] use_filter = if (call_small_vars_on_mitochondria) then default_filter else flatten([['chrM', ref_map["mt_chr_name"]], default_filter])
         call Utils.MakeChrIntervalList as SmallVariantsScatterPrepp {
             input:
-                ref_dict = ref_dict,
+                ref_dict = ref_map["dict"],
                 filter = use_filter
         }
 
@@ -95,8 +98,8 @@ workflow CallVariants {
                     bam = SmallVariantsScatter.subset_bam,
                     bai = SmallVariantsScatter.subset_bai,
 
-                    ref_fasta     = ref_fasta,
-                    ref_fasta_fai = ref_fasta_fai,
+                    ref_fasta     = ref_map["fasta"],
+                    ref_fasta_fai = ref_map["fai"],
 
                     sites_vcf = sites_vcf,
                     sites_vcf_tbi = sites_vcf_tbi,
@@ -109,14 +112,14 @@ workflow CallVariants {
         call VariantUtils.MergeAndSortVCFs as MergeAndSortClairVCFs {
             input:
                 vcfs = Clair.vcf,
-                ref_fasta_fai = ref_fasta_fai,
+                ref_fasta_fai = ref_map["fai"],
                 prefix = prefix + ".clair"
         }
 
         call VariantUtils.MergeAndSortVCFs as MergeAndSortClair_gVCFs {
             input:
                 vcfs = Clair.gvcf,
-                ref_fasta_fai = ref_fasta_fai,
+                ref_fasta_fai = ref_map["fai"],
                 prefix = prefix + ".clair.g"
         }
 
@@ -142,8 +145,8 @@ workflow CallVariants {
                     input:
                         bam           = size_balanced_scatter.subset_bam,
                         bai           = size_balanced_scatter.subset_bai,
-                        ref_fasta     = ref_fasta,
-                        ref_fasta_fai = ref_fasta_fai,
+                        ref_fasta     = ref_map["fasta"],
+                        ref_fasta_fai = ref_map["fai"],
                         threads       = select_first([dvp_threads]),
                         memory        = select_first([dvp_memory]),
                         zones = arbitrary.zones
@@ -156,21 +159,21 @@ workflow CallVariants {
                 input:
                     vcfs     = Pepper.gVCF,
                     prefix   = dvp_prefix + ".g",
-                    ref_fasta_fai = ref_fasta_fai
+                    ref_fasta_fai = ref_map["fai"]
             }
 
             call VariantUtils.MergeAndSortVCFs as MergeDeepVariantPhasedVCFs {
                 input:
                     vcfs     = Pepper.phasedVCF,
                     prefix   = dvp_prefix + ".phased",
-                    ref_fasta_fai = ref_fasta_fai
+                    ref_fasta_fai = ref_map["fai"]
             }
 
             call VariantUtils.MergeAndSortVCFs as MergeDeepVariantVCFs {
                 input:
                     vcfs     = Pepper.VCF,
                     prefix   = dvp_prefix,
-                    ref_fasta_fai = ref_fasta_fai
+                    ref_fasta_fai = ref_map["fai"]
             }
 
             call Utils.MergeBams {
@@ -188,7 +191,7 @@ workflow CallVariants {
 
             call Utils.MakeChrIntervalList {
             input:
-                ref_dict = ref_dict,
+                ref_dict = ref_map["dict"],
                 filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV']
             }
 
@@ -206,8 +209,8 @@ workflow CallVariants {
                     input:
                         bam = SubsetBam.subset_bam,
                         bai = SubsetBam.subset_bai,
-                        ref_fasta = ref_fasta,
-                        ref_fasta_fai = ref_fasta_fai,
+                        ref_fasta = ref_map["fasta"],
+                        ref_fasta_fai = ref_map["fai"],
                         prefix = prefix,
                         tandem_repeat_bed = tandem_repeat_bed,
                         is_ccs = false,
@@ -225,7 +228,7 @@ workflow CallVariants {
             call VariantUtils.MergePerChrCalls as MergePBSVVCFs {
                 input:
                     vcfs     = RunPBSV.vcf,
-                    ref_dict = ref_dict,
+                    ref_dict = ref_map["dict"],
                     prefix   = prefix + ".pbsv"
             }
 
@@ -236,8 +239,8 @@ workflow CallVariants {
                 input:
                     bam = bam,
                     bai = bai,
-                    ref_fasta = ref_fasta,
-                    ref_fasta_fai = ref_fasta_fai,
+                    ref_fasta = ref_map["fasta"],
+                    ref_fasta_fai = ref_map["fai"],
                     prefix = prefix,
                     tandem_repeat_bed = tandem_repeat_bed,
                     is_ccs = false,