This repository has been archived by the owner on Jan 25, 2020. It is now read-only.

Commit

Wdlupdate (#6)
* Fix for splitRG task. Disk sizing now takes place within tasks.
bshifaw authored Mar 23, 2018
1 parent 84a2325 commit bbfdfb3
Showing 9 changed files with 203 additions and 256 deletions.
78 changes: 11 additions & 67 deletions fc_germline_single_sample_workflow.wdl
@@ -73,29 +73,12 @@ workflow germline_single_sample_workflow {
  Int preemptible_tries
  Int agg_preemptible_tries

- # Optional input to increase all disk sizes in case of outlier sample with strange size behavior
- Int? increase_disk_size
-
  Boolean skip_QC
  Boolean make_gatk4_single_sample_vcf
  Boolean use_gatk4_haplotype_caller

  Float cutoff_for_large_rg_in_gb = 20.0

- # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a
- # Cromwell error from asking for 0 disk when the input is less than 1GB
- Int additional_disk = select_first([increase_disk_size, 20])
- # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the
- # input (1) and the output (1.5), or the input (1), the output (1), and spillage (.5).
- Float bwa_disk_multiplier = 2.5
- # SortSam spills to disk a lot more because we now store only 300000 records in RAM (it's faster for our data),
- # so it needs more disk space. Also it spills to disk in an uncompressed format, so we need to account for that with a
- # larger multiplier.
- Float sort_sam_disk_multiplier = 3.25
-
- # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggle room.
- Float md_disk_multiplier = 2.25
-
  String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta"

  String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated"
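
The block deleted above was the heart of the old scheme: one set of padding and multiplier constants, computed once at the workflow level and threaded into every call. Per the commit message, each task now derives its own disk request from its inputs instead. As a rough illustration, a task following the new pattern might look like the draft-2 WDL sketch below — the task name, command, and jar path are illustrative assumptions, not code from this commit; only the 3.25 multiplier and the 20 GB padding come from the constants being deleted here.

    # Sketch only: a task that sizes its own disk rather than taking a disk_size input.
    task SortSamSketch {
      File input_bam
      Int compression_level
      Int preemptible_tries

      # Same reasoning as the deleted workflow-level constants: SortSam spills to
      # disk in an uncompressed format, so reserve ~3.25x the input size, plus
      # 20 GB of padding so Cromwell is never asked for 0 disk on tiny inputs.
      Float sort_sam_disk_multiplier = 3.25
      Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GB")) + 20

      command {
        # Jar path is a placeholder; point it at your own Picard install.
        java -Dsamjdk.compression_level=${compression_level} -jar /path/to/picard.jar \
          SortSam INPUT=${input_bam} OUTPUT=sorted.bam SORT_ORDER=coordinate CREATE_INDEX=true
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
      output {
        File output_bam = "sorted.bam"
      }
    }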
@@ -106,25 +89,19 @@ workflow germline_single_sample_workflow {
  # by MergeBamAlignment.
  call Alignment.GetBwaVersion

- # Get the size of the standard reference files as well as the additional reference files needed for BWA
- Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
- Float bwa_ref_size = ref_size + size(ref_alt, "GB") + size(ref_amb, "GB") + size(ref_ann, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + size(ref_sa, "GB")
- Float dbsnp_size = size(dbSNP_vcf, "GB")

  # Align flowcell-level unmapped input bams in parallel
  scatter (unmapped_bam in flowcell_unmapped_bams) {

- Float unmapped_bam_size = size(unmapped_bam, "GB")

  String unmapped_bam_basename = basename(unmapped_bam, unmapped_bam_suffix)

  if (!skip_QC) {
  # QC the unmapped BAM
  call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics {
  input:
  input_bam = unmapped_bam,
  metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics",
- disk_size = unmapped_bam_size + additional_disk,
  preemptible_tries = preemptible_tries
  }
  }
@@ -147,12 +124,8 @@ workflow germline_single_sample_workflow {
  ref_bwt = ref_bwt,
  ref_pac = ref_pac,
  ref_sa = ref_sa,
- additional_disk = additional_disk,
  compression_level = compression_level,
- preemptible_tries = preemptible_tries,
- bwa_ref_size = bwa_ref_size,
- disk_multiplier = bwa_disk_multiplier,
- unmapped_bam_size = unmapped_bam_size
+ preemptible_tries = preemptible_tries
  }
  }

@@ -173,9 +146,6 @@ workflow germline_single_sample_workflow {
  ref_pac = ref_pac,
  ref_sa = ref_sa,
  bwa_version = GetBwaVersion.version,
- # The merged bam can be bigger than only the aligned bam,
- # so account for the output size by multiplying the input size by 2.75.
- disk_size = unmapped_bam_size + bwa_ref_size + (bwa_disk_multiplier * unmapped_bam_size) + additional_disk,
  compression_level = compression_level,
  preemptible_tries = preemptible_tries
  }
@@ -192,7 +162,6 @@ workflow germline_single_sample_workflow {
  input:
  input_bam = output_aligned_bam,
  output_bam_prefix = unmapped_bam_basename + ".readgroup",
- disk_size = mapped_bam_size + additional_disk,
  preemptible_tries = preemptible_tries
  }
  }
@@ -213,22 +182,16 @@ workflow germline_single_sample_workflow {
  input_bams = output_aligned_bam,
  output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked",
  metrics_filename = base_file_name + ".duplicate_metrics",
- # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs
- # and the merged output.
- disk_size = (md_disk_multiplier * SumFloats.total_size) + additional_disk,
+ total_input_size = SumFloats.total_size,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }

- Float agg_bam_size = size(MarkDuplicates.output_bam, "GB")
-
  # Sort aggregated+deduped BAM file
  call Processing.SortSam as SortSampleBam {
  input:
  input_bam = MarkDuplicates.output_bam,
  output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted",
- # This task spills to disk so we need space for the input bam, the output bam, and any spillage.
- disk_size = (sort_sam_disk_multiplier * agg_bam_size) + additional_disk,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }
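
MarkDuplicates no longer receives a precomputed disk_size; it gets the summed size of its scattered inputs (total_input_size = SumFloats.total_size) and can apply the 2.25x multiplier internally. A hedged sketch of what the consuming side of that contract could look like — the task body is illustrative, not this repository's code:

    # Sketch only: a gather-style task sized from the summed input size it is handed.
    task MarkDuplicatesSketch {
      Array[File] input_bams
      Float total_input_size   # sum of the readgroup bam sizes, computed upstream
      Int preemptible_tries

      # The deleted workflow-level comment, now task-local: the merged output is
      # slightly smaller than the sum of its parts, so 2.25x leaves ~0.25x slack.
      Float md_disk_multiplier = 2.25
      Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20

      command {
        echo "MarkDuplicates over: ${sep=' ' input_bams}"   # placeholder command
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
    }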
@@ -251,7 +214,6 @@ workflow germline_single_sample_workflow {
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
  output_prefix = base_file_name + ".preBqsr",
- disk_size = agg_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries,
  contamination_underestimation_factor = 0.75
  }
@@ -278,8 +240,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # We need disk to localize the sharded bam due to the scatter.
- disk_size = (agg_bam_size / bqsr_divisor) + ref_size + dbsnp_size + additional_disk,
+ bqsr_scatter = bqsr_divisor,
  preemptible_tries = agg_preemptible_tries
  }
  }
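
BaseRecalibrator runs in a scatter, so instead of the workflow dividing the bam size by bqsr_divisor, the task now receives the scatter width (bqsr_scatter = bqsr_divisor) and can do the division itself; ApplyBQSR below and the hc_scatter input on the HaplotypeCaller calls follow the same convention. A hedged sketch under that assumption, with illustrative names:

    # Sketch only: a sharded task dividing its disk estimate by the scatter width.
    task BaseRecalibratorSketch {
      File input_bam
      File ref_dict
      File ref_fasta
      File ref_fasta_index
      Int bqsr_scatter          # number of shards the bam is split across
      Int preemptible_tries

      Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
      # Each shard localizes only ~1/bqsr_scatter of the bam, plus the full reference.
      Int disk_size = ceil(size(input_bam, "GB") / bqsr_scatter + ref_size) + 20

      command {
        echo "recalibrating one shard with ${disk_size} GB of disk"   # placeholder command
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
    }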
@@ -290,7 +251,6 @@ workflow germline_single_sample_workflow {
  input:
  input_bqsr_reports = BaseRecalibrator.recalibration_report,
  output_report_filename = base_file_name + ".recal_data.csv",
- disk_size = additional_disk,
  preemptible_tries = preemptible_tries
  }

@@ -305,20 +265,20 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # We need disk to localize the sharded bam and the sharded output due to the scatter.
- disk_size = ((agg_bam_size * 3) / bqsr_divisor) + ref_size + additional_disk,
+ bqsr_scatter = bqsr_divisor,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }
  }

+ Float agg_bam_size = size(SortSampleBam.output_bam, "GB")
+
  # Merge the recalibrated BAM files resulting from by-interval recalibration
- call Processing.GatherBamFiles as GatherBamFiles {
+ call Processing.GatherSortedBamFiles as GatherBamFiles {
  input:
  input_bams = ApplyBQSR.recalibrated_bam,
  output_bam_basename = base_file_name,
- # Multiply the input bam size by two to account for the input and output
- disk_size = (2 * agg_bam_size) + additional_disk,
+ total_input_size = agg_bam_size,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }
@@ -338,7 +298,6 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -351,7 +310,6 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -365,7 +323,6 @@ workflow germline_single_sample_workflow {
  ref_fasta_index = ref_fasta_index,
  wgs_coverage_interval_list = wgs_coverage_interval_list,
  read_length = read_length,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -379,7 +336,6 @@ workflow germline_single_sample_workflow {
  ref_fasta_index = ref_fasta_index,
  wgs_coverage_interval_list = wgs_coverage_interval_list,
  read_length = read_length,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -389,14 +345,10 @@ workflow germline_single_sample_workflow {
  input_bam = GatherBamFiles.output_bam,
  input_bam_index = GatherBamFiles.output_bam_index,
  read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5",
- disk_size = binned_qual_bam_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }
  }

- # Germline single sample GVCFs shouldn't get bigger even when the input bam is bigger (after a certain size)
- Int GVCF_disk_size = select_first([increase_disk_size, 30])
-
  # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation
  # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data.
  Float max_duplication_in_reasonable_sample = 0.30
@@ -409,12 +361,9 @@ workflow germline_single_sample_workflow {
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
  output_basename = base_file_name,
- disk_size = (2 * binned_qual_bam_size) + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

- Float cram_size = size(ConvertToCram.output_cram, "GB")
-
  if (!skip_QC) {
  # Check whether the data has massively high duplication or chimerism rates
  call QC.CheckPreValidation as CheckPreValidation {
@@ -441,7 +390,6 @@ workflow germline_single_sample_workflow {
  ignore = ["MISSING_TAG_NM"],
  max_output = 1000000000,
  is_outlier_data = is_outlier_data,
- disk_size = cram_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -474,8 +422,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output.
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk,
+ hc_scatter = hc_divisor,
  preemptible_tries = agg_preemptible_tries
  }

@@ -486,7 +433,6 @@ workflow germline_single_sample_workflow {
  input_vcf_index = HaplotypeCaller4.output_vcf_index,
  vcf_basename = base_file_name,
  interval_list = ScatterIntervalList.out[index],
- disk_size = GVCF_disk_size + GVCF_disk_size + additional_disk,
  preemptible_tries = preemptible_tries
  }
  }
@@ -501,8 +447,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output.
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk,
+ hc_scatter = hc_divisor,
  preemptible_tries = agg_preemptible_tries
  }
  }
@@ -519,7 +464,6 @@ workflow germline_single_sample_workflow {
  input_vcfs = merge_input,
  input_vcfs_indexes = merge_input_index,
  output_vcf_name = final_vcf_base_name + name_token + ".vcf.gz",
- disk_size = GVCF_disk_size,
  preemptible_tries = agg_preemptible_tries
  }

28 changes: 2 additions & 26 deletions germline_single_sample_workflow.wdl
@@ -72,9 +72,6 @@ workflow germline_single_sample_workflow {
  Int preemptible_tries
  Int agg_preemptible_tries

- # Optional input to increase all disk sizes in case of outlier sample with strange size behavior
- Int? increase_disk_size
-
  call ToBam.to_bam_workflow {
  input:
  contamination_sites_ud = contamination_sites_ud,
@@ -103,21 +100,9 @@ workflow germline_single_sample_workflow {
  known_indels_sites_VCFs = known_indels_sites_VCFs,
  known_indels_sites_indices = known_indels_sites_indices,
  preemptible_tries = preemptible_tries,
- agg_preemptible_tries = agg_preemptible_tries,
- increase_disk_size = increase_disk_size
+ agg_preemptible_tries = agg_preemptible_tries
  }

- # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a
- # Cromwell error from asking for 0 disk when the input is less than 1GB
- Int additional_disk = select_first([increase_disk_size, 20])
- # Germline single sample GVCFs shouldn't get bigger even when the input bam is bigger (after a certain size)
- Int GVCF_disk_size = select_first([increase_disk_size, 30])
- #BQSR bins the qualities which makes a significantly smaller bam
- Float binned_qual_bam_size = size(to_bam_workflow.output_bam, "GB")
-
- Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
- Float dbsnp_size = size(dbSNP_vcf, "GB")
-
  # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation
  # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data.
  Float max_duplication_in_reasonable_sample = 0.30
@@ -130,12 +115,9 @@ workflow germline_single_sample_workflow {
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
  output_basename = base_file_name,
- disk_size = (2 * binned_qual_bam_size) + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

- Float cram_size = size(ConvertToCram.output_cram, "GB")
-
  # Check whether the data has massively high duplication or chimerism rates
  call QC.CheckPreValidation as CheckPreValidation {
  input:
@@ -158,7 +140,6 @@ workflow germline_single_sample_workflow {
  ignore = ["MISSING_TAG_NM"],
  max_output = 1000000000,
  is_outlier_data = CheckPreValidation.is_outlier_data,
- disk_size = cram_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -189,8 +170,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output.
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk,
+ hc_scatter = hc_divisor,
  preemptible_tries = agg_preemptible_tries
  }
  }
@@ -201,7 +181,6 @@ workflow germline_single_sample_workflow {
  input_vcfs = HaplotypeCaller.output_gvcf,
  input_vcfs_indexes = HaplotypeCaller.output_gvcf_index,
  output_vcf_name = final_gvcf_base_name + ".g.vcf.gz",
- disk_size = GVCF_disk_size,
  preemptible_tries = agg_preemptible_tries
  }

@@ -218,7 +197,6 @@ workflow germline_single_sample_workflow {
  ref_fasta_index = ref_fasta_index,
  ref_dict = ref_dict,
  wgs_calling_interval_list = wgs_calling_interval_list,
- disk_size = gvcf_size + ref_size + dbsnp_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }
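
The same applies to the metrics tasks: with gvcf_size, ref_size, and dbsnp_size gone from the workflow, each task presumably measures its own inputs at localization time. A hedged sketch under that assumption (names illustrative, not this repository's code):

    # Sketch only: a metrics task measuring its own inputs instead of receiving sizes.
    task CollectGvcfCallingMetricsSketch {
      File input_vcf
      File dbSNP_vcf
      File ref_dict
      Int preemptible_tries

      Int disk_size = ceil(size(input_vcf, "GB") + size(dbSNP_vcf, "GB") + size(ref_dict, "GB")) + 20

      command {
        echo "collecting metrics for ${input_vcf}"   # placeholder command
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
    }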

@@ -232,7 +210,6 @@ workflow germline_single_sample_workflow {
  dbSNP_vcf_index = dbSNP_vcf_index,
  ref_dict = ref_dict,
  wgs_evaluation_interval_list = wgs_evaluation_interval_list,
- disk_size = gvcf_size + dbsnp_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -296,4 +273,3 @@ workflow germline_single_sample_workflow {
  File output_vcf_index = MergeVCFs.output_vcf_index
  }
  }
-
(Diffs for the remaining 7 changed files are collapsed and not shown here.)
