diff --git a/fc_germline_single_sample_workflow.wdl b/fc_germline_single_sample_workflow.wdl index de638f6..d332b10 100644 --- a/fc_germline_single_sample_workflow.wdl +++ b/fc_germline_single_sample_workflow.wdl @@ -73,29 +73,12 @@ workflow germline_single_sample_workflow { Int preemptible_tries Int agg_preemptible_tries - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? increase_disk_size - Boolean skip_QC Boolean make_gatk4_single_sample_vcf Boolean use_gatk4_haplotype_caller Float cutoff_for_large_rg_in_gb = 20.0 - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 20]) - # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the - # input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). - Float bwa_disk_multiplier = 2.5 - # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data - # so it needs more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a - # larger multiplier - Float sort_sam_disk_multiplier = 3.25 - - # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom - Float md_disk_multiplier = 2.25 - String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated" @@ -106,25 +89,19 @@ workflow germline_single_sample_workflow { # by MergeBamAlignment. call Alignment.GetBwaVersion - # Get the size of the standard reference files as well as the additional reference files needed for BWA - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Float bwa_ref_size = ref_size + size(ref_alt, "GB") + size(ref_amb, "GB") + size(ref_ann, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + size(ref_sa, "GB") - Float dbsnp_size = size(dbSNP_vcf, "GB") - # Align flowcell-level unmapped input bams in parallel scatter (unmapped_bam in flowcell_unmapped_bams) { Float unmapped_bam_size = size(unmapped_bam, "GB") String unmapped_bam_basename = basename(unmapped_bam, unmapped_bam_suffix) - + if (!skip_QC) { # QC the unmapped BAM call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { input: input_bam = unmapped_bam, metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", - disk_size = unmapped_bam_size + additional_disk, preemptible_tries = preemptible_tries } } @@ -147,12 +124,8 @@ workflow germline_single_sample_workflow { ref_bwt = ref_bwt, ref_pac = ref_pac, ref_sa = ref_sa, - additional_disk = additional_disk, compression_level = compression_level, - preemptible_tries = preemptible_tries, - bwa_ref_size = bwa_ref_size, - disk_multiplier = bwa_disk_multiplier, - unmapped_bam_size = unmapped_bam_size + preemptible_tries = preemptible_tries } } @@ -173,9 +146,6 @@ workflow germline_single_sample_workflow { ref_pac = ref_pac, ref_sa = ref_sa, bwa_version = GetBwaVersion.version, - # The merged bam can be bigger than only the aligned bam, - # so account for the output size by multiplying the input size by 2.75. 
- disk_size = unmapped_bam_size + bwa_ref_size + (bwa_disk_multiplier * unmapped_bam_size) + additional_disk, compression_level = compression_level, preemptible_tries = preemptible_tries } @@ -192,7 +162,6 @@ workflow germline_single_sample_workflow { input: input_bam = output_aligned_bam, output_bam_prefix = unmapped_bam_basename + ".readgroup", - disk_size = mapped_bam_size + additional_disk, preemptible_tries = preemptible_tries } } @@ -213,22 +182,16 @@ workflow germline_single_sample_workflow { input_bams = output_aligned_bam, output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked", metrics_filename = base_file_name + ".duplicate_metrics", - # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs - # and the merged output. - disk_size = (md_disk_multiplier * SumFloats.total_size) + additional_disk, + total_input_size = SumFloats.total_size, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } - Float agg_bam_size = size(MarkDuplicates.output_bam, "GB") - # Sort aggregated+deduped BAM file call Processing.SortSam as SortSampleBam { input: input_bam = MarkDuplicates.output_bam, output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted", - # This task spills to disk so we need space for the input bam, the output bam, and any spillage. - disk_size = (sort_sam_disk_multiplier * agg_bam_size) + additional_disk, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } @@ -251,7 +214,6 @@ workflow germline_single_sample_workflow { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, output_prefix = base_file_name + ".preBqsr", - disk_size = agg_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries, contamination_underestimation_factor = 0.75 } @@ -278,8 +240,7 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # We need disk to localize the sharded bam due to the scatter. - disk_size = (agg_bam_size / bqsr_divisor) + ref_size + dbsnp_size + additional_disk, + bqsr_scatter = bqsr_divisor, preemptible_tries = agg_preemptible_tries } } @@ -290,7 +251,6 @@ workflow germline_single_sample_workflow { input: input_bqsr_reports = BaseRecalibrator.recalibration_report, output_report_filename = base_file_name + ".recal_data.csv", - disk_size = additional_disk, preemptible_tries = preemptible_tries } @@ -305,20 +265,20 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # We need disk to localize the sharded bam and the sharded output due to the scatter. 
- disk_size = ((agg_bam_size * 3) / bqsr_divisor) + ref_size + additional_disk, + bqsr_scatter = bqsr_divisor, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } } + Float agg_bam_size = size(SortSampleBam.output_bam, "GB") + # Merge the recalibrated BAM files resulting from by-interval recalibration - call Processing.GatherBamFiles as GatherBamFiles { + call Processing.GatherSortedBamFiles as GatherBamFiles { input: input_bams = ApplyBQSR.recalibrated_bam, output_bam_basename = base_file_name, - # Multiply the input bam size by two to account for the input and output - disk_size = (2 * agg_bam_size) + additional_disk, + total_input_size = agg_bam_size, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } @@ -338,7 +298,6 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -351,7 +310,6 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -365,7 +323,6 @@ workflow germline_single_sample_workflow { ref_fasta_index = ref_fasta_index, wgs_coverage_interval_list = wgs_coverage_interval_list, read_length = read_length, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -379,7 +336,6 @@ workflow germline_single_sample_workflow { ref_fasta_index = ref_fasta_index, wgs_coverage_interval_list = wgs_coverage_interval_list, read_length = read_length, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -389,14 +345,10 @@ workflow germline_single_sample_workflow { input_bam = GatherBamFiles.output_bam, input_bam_index = GatherBamFiles.output_bam_index, read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5", - disk_size = binned_qual_bam_size + additional_disk, preemptible_tries = agg_preemptible_tries } } - # Germline single sample GVCFs shouldn't get bigger even when the input bam is bigger (after a certain size) - Int GVCF_disk_size = select_first([increase_disk_size, 30]) - # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 
Float max_duplication_in_reasonable_sample = 0.30 @@ -409,12 +361,9 @@ workflow germline_single_sample_workflow { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, output_basename = base_file_name, - disk_size = (2 * binned_qual_bam_size) + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } - Float cram_size = size(ConvertToCram.output_cram, "GB") - if (!skip_QC) { # Check whether the data has massively high duplication or chimerism rates call QC.CheckPreValidation as CheckPreValidation { @@ -441,7 +390,6 @@ workflow germline_single_sample_workflow { ignore = ["MISSING_TAG_NM"], max_output = 1000000000, is_outlier_data = is_outlier_data, - disk_size = cram_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -474,8 +422,7 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output. - disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk, + hc_scatter = hc_divisor, preemptible_tries = agg_preemptible_tries } @@ -486,7 +433,6 @@ workflow germline_single_sample_workflow { input_vcf_index = HaplotypeCaller4.output_vcf_index, vcf_basename = base_file_name, interval_list = ScatterIntervalList.out[index], - disk_size = GVCF_disk_size + GVCF_disk_size + additional_disk, preemptible_tries = preemptible_tries } } @@ -501,8 +447,7 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output. - disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk, + hc_scatter = hc_divisor, preemptible_tries = agg_preemptible_tries } } @@ -519,7 +464,6 @@ workflow germline_single_sample_workflow { input_vcfs = merge_input, input_vcfs_indexes = merge_input_index, output_vcf_name = final_vcf_base_name + name_token + ".vcf.gz", - disk_size = GVCF_disk_size, preemptible_tries = agg_preemptible_tries } diff --git a/germline_single_sample_workflow.wdl b/germline_single_sample_workflow.wdl index 154072c..04905b8 100644 --- a/germline_single_sample_workflow.wdl +++ b/germline_single_sample_workflow.wdl @@ -72,9 +72,6 @@ workflow germline_single_sample_workflow { Int preemptible_tries Int agg_preemptible_tries - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? 
increase_disk_size - call ToBam.to_bam_workflow { input: contamination_sites_ud = contamination_sites_ud, @@ -103,21 +100,9 @@ workflow germline_single_sample_workflow { known_indels_sites_VCFs = known_indels_sites_VCFs, known_indels_sites_indices = known_indels_sites_indices, preemptible_tries = preemptible_tries, - agg_preemptible_tries = agg_preemptible_tries, - increase_disk_size = increase_disk_size + agg_preemptible_tries = agg_preemptible_tries } - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 20]) - # Germline single sample GVCFs shouldn't get bigger even when the input bam is bigger (after a certain size) - Int GVCF_disk_size = select_first([increase_disk_size, 30]) - #BQSR bins the qualities which makes a significantly smaller bam - Float binned_qual_bam_size = size(to_bam_workflow.output_bam, "GB") - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Float dbsnp_size = size(dbSNP_vcf, "GB") - # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. Float max_duplication_in_reasonable_sample = 0.30 @@ -130,12 +115,9 @@ workflow germline_single_sample_workflow { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, output_basename = base_file_name, - disk_size = (2 * binned_qual_bam_size) + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } - Float cram_size = size(ConvertToCram.output_cram, "GB") - # Check whether the data has massively high duplication or chimerism rates call QC.CheckPreValidation as CheckPreValidation { input: @@ -158,7 +140,6 @@ workflow germline_single_sample_workflow { ignore = ["MISSING_TAG_NM"], max_output = 1000000000, is_outlier_data = CheckPreValidation.is_outlier_data, - disk_size = cram_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -189,8 +170,7 @@ workflow germline_single_sample_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output. 
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk, + hc_scatter = hc_divisor, preemptible_tries = agg_preemptible_tries } } @@ -201,7 +181,6 @@ workflow germline_single_sample_workflow { input_vcfs = HaplotypeCaller.output_gvcf, input_vcfs_indexes = HaplotypeCaller.output_gvcf_index, output_vcf_name = final_gvcf_base_name + ".g.vcf.gz", - disk_size = GVCF_disk_size, preemptible_tries = agg_preemptible_tries } @@ -218,7 +197,6 @@ workflow germline_single_sample_workflow { ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, wgs_calling_interval_list = wgs_calling_interval_list, - disk_size = gvcf_size + ref_size + dbsnp_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -232,7 +210,6 @@ workflow germline_single_sample_workflow { dbSNP_vcf_index = dbSNP_vcf_index, ref_dict = ref_dict, wgs_evaluation_interval_list = wgs_evaluation_interval_list, - disk_size = gvcf_size + dbsnp_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -296,4 +273,3 @@ workflow germline_single_sample_workflow { File output_vcf_index = MergeVCFs.output_vcf_index } } - diff --git a/tasks_pipelines/alignment.wdl b/tasks_pipelines/alignment.wdl index bf57374..0ba0334 100644 --- a/tasks_pipelines/alignment.wdl +++ b/tasks_pipelines/alignment.wdl @@ -23,7 +23,7 @@ task GetBwaVersion { sed 's/Version: //' } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" memory: "1 GB" } output { @@ -50,10 +50,17 @@ task SamToFastqAndBwaMemAndMba { File ref_bwt File ref_pac File ref_sa - Float disk_size Int compression_level Int preemptible_tries + Float unmapped_bam_size = size(input_bam, "GB") + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Float bwa_ref_size = ref_size + size(ref_alt, "GB") + size(ref_amb, "GB") + size(ref_ann, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + size(ref_sa, "GB") + # Sometimes the output is larger than the input, or a task can spill to disk. + # In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). + Float disk_multiplier = 2.5 + Int disk_size = ceil(unmapped_bam_size + bwa_ref_size + (disk_multiplier * unmapped_bam_size) + 20) + command <<< set -o pipefail set -e @@ -107,11 +114,11 @@ task SamToFastqAndBwaMemAndMba { fi >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "14 GB" cpu: "16" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File output_bam = "${output_bam_basename}.bam" @@ -122,10 +129,14 @@ task SamToFastqAndBwaMemAndMba { task SamSplitter { File input_bam Int n_reads - Int disk_size Int preemptible_tries Int compression_level + Float unmapped_bam_size = size(input_bam, "GB") + # Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2. 
+  Float disk_multiplier = 2.5
+  Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20)
+
   command {
     set -e
     mkdir output_dir
diff --git a/tasks_pipelines/bam_processing.wdl b/tasks_pipelines/bam_processing.wdl
index de7c6b3..b97c4d5 100644
--- a/tasks_pipelines/bam_processing.wdl
+++ b/tasks_pipelines/bam_processing.wdl
@@ -13,13 +13,17 @@
 ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
 ## licensing information pertaining to the included programs.
-# Sort BAM file by coordinate order 
+# Sort BAM file by coordinate order
 task SortSam {
   File input_bam
   String output_bam_basename
   Int preemptible_tries
   Int compression_level
-  Float disk_size
+
+  # SortSam spills to disk a lot more because we now store only 300000 records in RAM, which is faster for our data, so it needs
+  # more disk space. It also spills to disk in an uncompressed format, so we need to account for that with a larger multiplier
+  Float sort_sam_disk_multiplier = 3.25
+  Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GB")) + 20
   command {
     java -Dsamjdk.compression_level=${compression_level} -Xms4000m -jar /usr/gitc/picard.jar \
@@ -33,8 +37,8 @@
   }
   runtime {
-    docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735"
-    disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD"
+    docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135"
+    disks: "local-disk " + disk_size + " HDD"
     cpu: "1"
     memory: "5000 MB"
     preemptible: preemptible_tries
@@ -52,26 +56,30 @@
   String output_bam_basename
   Int preemptible_tries
   Int compression_level
-  Float disk_size
+
+  # SortSam spills to disk a lot more because we now store only 300000 records in RAM, which is faster for our data, so it needs
+  # more disk space. It also spills to disk in an uncompressed format, so we need to account for that with a larger multiplier
+  Float sort_sam_disk_multiplier = 3.25
+  Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GB")) + 20
   command {
     set -e
     export GATK_LOCAL_JAR=/root/gatk.jar
-    gatk --java-options "-Dsamjdk.compression_level=${compression_level} -Xms13g -Xmx13g" \
+    gatk --java-options "-Dsamjdk.compression_level=${compression_level} -Xms100g -Xmx100g" \
     SortSamSpark \
     -I ${input_bam} \
     -O ${output_bam_basename}.bam \
     -- --conf spark.local.dir=. --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false'
-    samtools index ${output_bam_basename}.bam ${output_bam_basename}.bai 
+    samtools index ${output_bam_basename}.bam ${output_bam_basename}.bai
   }
   runtime {
-    docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:4.0.1.2-18-g78fbcd88a-ericSortSamEval"
-    disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD"
+    docker: "us.gcr.io/broad-gatk/gatk:4.0.2.1"
+    disks: "local-disk " + disk_size + " HDD"
     bootDiskSizeGb: "15"
     cpu: "16"
-    memory: "14 GB"
+    memory: "102 GB"
     preemptible: preemptible_tries
   }
   output {
@@ -85,10 +93,15 @@ task MarkDuplicates {
   Array[File] input_bams
   String output_bam_basename
   String metrics_filename
-  Float disk_size
+  Float total_input_size
   Int compression_level
   Int preemptible_tries
+  # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output.
+  # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam.
Giving .25 as wiggleroom + Float md_disk_multiplier = 2.25 + Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 + # The program default for READ_NAME_REGEX is appropriate in nearly every case. # Sometimes we wish to supply "null" in order to turn off optical duplicate detection # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing @@ -111,7 +124,7 @@ task MarkDuplicates { ADD_PG_TAG_TO_READS=false } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "7 GB" disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" @@ -134,9 +147,13 @@ task BaseRecalibrator { File ref_dict File ref_fasta File ref_fasta_index - Float disk_size + Int bqsr_scatter Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Float dbsnp_size = size(dbSNP_vcf, "GB") + Int disk_size = ceil((size(input_bam, "GB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 + command { /usr/gitc/gatk4/gatk-launch --javaOptions "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ @@ -151,10 +168,10 @@ task BaseRecalibrator { -L ${sep=" -L " sequence_group_interval} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "6 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File recalibration_report = "${recalibration_report_filename}" @@ -170,10 +187,13 @@ task ApplyBQSR { File ref_dict File ref_fasta File ref_fasta_index - Float disk_size Int compression_level + Int bqsr_scatter Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil((size(input_bam, "GB") * 3 / bqsr_scatter) + ref_size) + 20 + command { /usr/gitc/gatk4/gatk-launch --javaOptions "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ -XX:+PrintGCDetails -Xloggc:gc_log.log \ @@ -190,10 +210,10 @@ task ApplyBQSR { -L ${sep=" -L " sequence_group_interval} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3500 MB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File recalibrated_bam = "${output_bam_basename}.bam" @@ -205,7 +225,6 @@ task ApplyBQSR { task GatherBqsrReports { Array[File] input_bqsr_reports String output_report_filename - Int disk_size Int preemptible_tries command { @@ -215,24 +234,27 @@ task GatherBqsrReports { -O ${output_report_filename} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3500 MB" - disks: "local-disk " + disk_size + " HDD" + disks: "local-disk 20 HDD" } output { File output_bqsr_report = "${output_report_filename}" } } -# Combine multiple recalibrated BAM files -task GatherBamFiles { +# Combine multiple *sorted* BAM files +task GatherSortedBamFiles { Array[File] 
input_bams String output_bam_basename - Float disk_size + Float total_input_size Int compression_level Int preemptible_tries + # Multiply the input bam size by two to account for the input and output + Int disk_size = ceil(2 * total_input_size) + 20 + command { java -Dsamjdk.compression_level=${compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ GatherBamFiles \ @@ -242,10 +264,10 @@ task GatherBamFiles { CREATE_MD5_FILE=true } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File output_bam = "${output_bam_basename}.bam" @@ -254,6 +276,37 @@ task GatherBamFiles { } } +# Combine multiple *unsorted* BAM files +# Note that if/when WDL supports optional outputs, we should merge this task with the sorted version +task GatherUnsortedBamFiles { + Array[File] input_bams + String output_bam_basename + Float total_input_size + Int compression_level + Int preemptible_tries + + # Multiply the input bam size by two to account for the input and output + Int disk_size = ceil(2 * total_input_size) + 20 + + command { + java -Dsamjdk.compression_level=${compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" + preemptible: preemptible_tries + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + } +} + # Notes on the contamination estimate: # The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId # @@ -276,10 +329,11 @@ task CheckContamination { File ref_fasta File ref_fasta_index String output_prefix - Float disk_size Int preemptible_tries Float contamination_underestimation_factor + Int disk_size = ceil(size(input_bam, "GB") + size(ref_fasta, "GB")) + 30 + command <<< set -e @@ -322,7 +376,7 @@ task CheckContamination { runtime { preemptible: preemptible_tries memory: "2 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" docker: "us.gcr.io/broad-gotc-prod/verify-bam-id:c8a66425c312e5f8be46ab0c41f8d7a1942b6e16-1500298351" } output { diff --git a/tasks_pipelines/germline_variant_discovery.wdl b/tasks_pipelines/germline_variant_discovery.wdl index 53f86ae..1a5ce3a 100644 --- a/tasks_pipelines/germline_variant_discovery.wdl +++ b/tasks_pipelines/germline_variant_discovery.wdl @@ -22,8 +22,11 @@ task HaplotypeCaller_GATK35_GVCF { File ref_fasta File ref_fasta_index Float? contamination - Float disk_size Int preemptible_tries + Int hc_scatter + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around # the interval because the assembly uses them. 
@@ -53,11 +56,11 @@ task HaplotypeCaller_GATK35_GVCF { --read_filter OverclippedRead } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "10 GB" cpu: "1" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File output_gvcf = "${gvcf_basename}.vcf.gz" @@ -65,9 +68,6 @@ task HaplotypeCaller_GATK35_GVCF { } } -# TODO -- -# -O ${vcf_basename}.vcf.gz \ -# -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} task HaplotypeCaller_GATK4_VCF { String input_bam File interval_list @@ -77,8 +77,11 @@ task HaplotypeCaller_GATK4_VCF { File ref_fasta_index Float contamination Boolean make_gvcf - Float disk_size Int preemptible_tries + Int hc_scatter + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 command <<< @@ -90,14 +93,15 @@ task HaplotypeCaller_GATK4_VCF { -R ${ref_fasta} \ -I ${input_bam} \ -L ${interval_list} \ - -O ${vcf_basename}.vcf.gz ${true="-ERC GVCF" false="" make_gvcf} + -O ${vcf_basename}.vcf.gz \ + -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} >>> runtime { - docker: "broadinstitute/gatk-nightly:2018-02-08-4.0.1.1-11-g9b93440-SNAPSHOT" + docker: "us.gcr.io/broad-gatk/gatk:4.0.2.1" preemptible: preemptible_tries memory: "6.5 GB" cpu: "1" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File output_vcf = "${vcf_basename}.vcf.gz" @@ -110,7 +114,6 @@ task MergeVCFs { Array[File] input_vcfs Array[File] input_vcfs_indexes String output_vcf_name - Int disk_size Int preemptible_tries # Using MergeVcfs instead of GatherVcfs so we can create indices @@ -122,10 +125,10 @@ task MergeVCFs { OUTPUT=${output_vcf_name} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3 GB" - disks: "local-disk " + disk_size + " HDD" + disks: "local-disk 30 HDD" } output { File output_vcf = "${output_vcf_name}" @@ -138,9 +141,9 @@ task HardFilterVcf { File input_vcf_index String vcf_basename File interval_list - Int disk_size Int preemptible_tries + Int disk_size = ceil(2 * size(input_vcf, "GB")) + 20 String output_vcf_name = vcf_basename + ".filtered.vcf.gz" command { @@ -163,4 +166,3 @@ task HardFilterVcf { disks: "local-disk " + disk_size + " HDD" } } - diff --git a/tasks_pipelines/qc.wdl b/tasks_pipelines/qc.wdl index 4f54099..62225c7 100644 --- a/tasks_pipelines/qc.wdl +++ b/tasks_pipelines/qc.wdl @@ -17,9 +17,10 @@ task CollectQualityYieldMetrics { File input_bam String metrics_filename - Float disk_size Int preemptible_tries + Int disk_size = ceil(size(input_bam, "GB")) + 20 + command { java -Xms2000m -jar /usr/gitc/picard.jar \ CollectQualityYieldMetrics \ @@ -28,8 +29,8 @@ task CollectQualityYieldMetrics { OUTPUT=${metrics_filename} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" + disks: "local-disk " + disk_size + " HDD" memory: "3 GB" preemptible: preemptible_tries } @@ -43,7 +44,8 @@ task 
CollectUnsortedReadgroupBamQualityMetrics { File input_bam String output_bam_prefix Int preemptible_tries - Float disk_size + + Int disk_size = ceil(size(input_bam, "GB")) + 20 command { java -Xms5000m -jar /usr/gitc/picard.jar \ @@ -63,9 +65,9 @@ task CollectUnsortedReadgroupBamQualityMetrics { touch ${output_bam_prefix}.insert_size_histogram.pdf } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" memory: "7 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries } output { @@ -89,7 +91,9 @@ task CollectReadgroupBamQualityMetrics { File ref_fasta File ref_fasta_index Int preemptible_tries - Float disk_size + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 command { java -Xms5000m -jar /usr/gitc/picard.jar \ @@ -105,9 +109,9 @@ task CollectReadgroupBamQualityMetrics { METRIC_ACCUMULATION_LEVEL="READ_GROUP" } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" memory: "7 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries } output { @@ -127,7 +131,9 @@ task CollectAggregationMetrics { File ref_fasta File ref_fasta_index Int preemptible_tries - Float disk_size + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 command { java -Xms5000m -jar /usr/gitc/picard.jar \ @@ -150,9 +156,9 @@ task CollectAggregationMetrics { touch ${output_bam_prefix}.insert_size_histogram.pdf } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" memory: "7 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries } output { @@ -177,9 +183,11 @@ task CrossCheckFingerprints { Array[File] input_bam_indexes File? haplotype_database_file String metrics_filename - Float disk_size + Float total_input_size Int preemptible_tries + Int disk_size = ceil(total_input_size) + 20 + command <<< java -Dsamjdk.buffer_size=131072 \ -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms2000m \ @@ -192,10 +200,10 @@ task CrossCheckFingerprints { LOD_THRESHOLD=-20.0 >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "2 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File cross_check_fingerprints_metrics = "${metrics_filename}" @@ -211,9 +219,10 @@ task CheckFingerprint { File? genotypes File? 
genotypes_index String sample - Float disk_size Int preemptible_tries + Int disk_size = ceil(size(input_bam, "GB")) + 20 + command <<< java -Dsamjdk.buffer_size=131072 \ -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms1024m \ @@ -228,10 +237,10 @@ task CheckFingerprint { >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "1 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File summary_metrics = "${output_basename}.fingerprinting_summary_metrics" @@ -274,7 +283,7 @@ task CheckPreValidation { >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries docker: "python:2.7" memory: "2 GB" @@ -296,9 +305,11 @@ task ValidateSamFile { Int? max_output Array[String]? ignore Boolean? is_outlier_data - Float disk_size Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 + command { java -Xms6000m -jar /usr/gitc/picard.jar \ ValidateSamFile \ @@ -312,10 +323,10 @@ task ValidateSamFile { IS_BISULFITE_SEQUENCED=false } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "7 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File report = "${report_filename}" @@ -331,9 +342,11 @@ task CollectWgsMetrics { File ref_fasta File ref_fasta_index Int read_length - Float disk_size Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 + command { java -Xms2000m -jar /usr/gitc/picard.jar \ CollectWgsMetrics \ @@ -347,10 +360,10 @@ task CollectWgsMetrics { READ_LENGTH=${read_length} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File metrics = "${metrics_filename}" @@ -366,9 +379,11 @@ task CollectRawWgsMetrics { File ref_fasta File ref_fasta_index Int read_length - Float disk_size Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 + command { java -Xms2000m -jar /usr/gitc/picard.jar \ CollectRawWgsMetrics \ @@ -382,10 +397,10 @@ task CollectRawWgsMetrics { READ_LENGTH=${read_length} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File metrics = "${metrics_filename}" @@ -397,9 +412,10 @@ task CalculateReadGroupChecksum { File input_bam File input_bam_index String read_group_md5_filename - Float disk_size Int preemptible_tries + Int disk_size = ceil(size(input_bam, "GB")) + 20 
+ command { java -Xms1000m -jar /usr/gitc/picard.jar \ CalculateReadGroupChecksum \ @@ -407,10 +423,10 @@ task CalculateReadGroupChecksum { OUTPUT=${read_group_md5_filename} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "2 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File md5_file = "${read_group_md5_filename}" @@ -427,9 +443,11 @@ task ValidateGVCF { File dbSNP_vcf File dbSNP_vcf_index File wgs_calling_interval_list - Float disk_size Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_vcf, "GB") + size(dbSNP_vcf, "GB") + ref_size) + 20 + command { /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms3000m" \ ValidateVariants \ @@ -441,10 +459,10 @@ task ValidateGVCF { --dbsnp ${dbSNP_vcf} } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3500 MB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } } @@ -457,9 +475,10 @@ task CollectGvcfCallingMetrics { File dbSNP_vcf_index File ref_dict File wgs_evaluation_interval_list - Float disk_size Int preemptible_tries + Int disk_size = ceil(size(input_vcf, "GB") + size(dbSNP_vcf, "GB")) + 20 + command { java -Xms2000m -jar /usr/gitc/picard.jar \ CollectVariantCallingMetrics \ @@ -471,14 +490,13 @@ task CollectGvcfCallingMetrics { GVCF_INPUT=true } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File summary_metrics = "${metrics_basename}.variant_calling_summary_metrics" File detail_metrics = "${metrics_basename}.variant_calling_detail_metrics" } } - diff --git a/tasks_pipelines/split_large_readgroup.wdl b/tasks_pipelines/split_large_readgroup.wdl index 1eee076..a3044d2 100644 --- a/tasks_pipelines/split_large_readgroup.wdl +++ b/tasks_pipelines/split_large_readgroup.wdl @@ -35,23 +35,14 @@ workflow split_large_readgroup { File ref_bwt File ref_pac File ref_sa - Int additional_disk Int compression_level Int preemptible_tries Int reads_per_file = 48000000 - Float bwa_ref_size - Float disk_multiplier - - Float unmapped_bam_size - call Alignment.SamSplitter as SamSplitter { input : input_bam = input_bam, n_reads = reads_per_file, - # Since the output bams are less compressed than the input bam we need a disk multiplier - # that's larger than 2. - disk_size = ceil(disk_multiplier * unmapped_bam_size + additional_disk), preemptible_tries = preemptible_tries, compression_level = compression_level } @@ -75,9 +66,6 @@ workflow split_large_readgroup { ref_pac = ref_pac, ref_sa = ref_sa, bwa_version = bwa_version, - # The merged bam can be bigger than only the aligned bam, - # so account for the output size by multiplying the input size by 2.75. 
- disk_size = current_unmapped_bam_size + bwa_ref_size + (disk_multiplier * current_unmapped_bam_size) + additional_disk, compression_level = compression_level, preemptible_tries = preemptible_tries } @@ -91,10 +79,10 @@ workflow split_large_readgroup { preemptible_tries = preemptible_tries } - call Processing.GatherBamFiles as GatherMonolithicBamFile { + call Processing.GatherUnsortedBamFiles as GatherMonolithicBamFile { input: input_bams = SamToFastqAndBwaMemAndMba.output_bam, - disk_size = ceil((2 * SumSplitAlignedSizes.total_size) + additional_disk), + total_input_size = SumSplitAlignedSizes.total_size, output_bam_basename = output_bam_basename, preemptible_tries = preemptible_tries, compression_level = compression_level diff --git a/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl b/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl index c1fcc1f..6bb25cd 100644 --- a/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl +++ b/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl @@ -58,24 +58,7 @@ workflow to_bam_workflow { Float cutoff_for_large_rg_in_gb = 20.0 - # Optional input to increase all disk sizes in case of outlier sample with strange size behavior - Int? increase_disk_size - - # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a - # Cromwell error from asking for 0 disk when the input is less than 1GB - Int additional_disk = select_first([increase_disk_size, 20]) - # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the - # input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). - Float bwa_disk_multiplier = 2.5 - # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data - # so it needs more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a - # larger multiplier - Float sort_sam_disk_multiplier = 3.25 - - # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. 
Giving .25 as wiggleroom - Float md_disk_multiplier = 2.25 - - String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" + String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated" @@ -86,9 +69,6 @@ workflow to_bam_workflow { call Alignment.GetBwaVersion # Get the size of the standard reference files as well as the additional reference files needed for BWA - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Float bwa_ref_size = ref_size + size(ref_alt, "GB") + size(ref_amb, "GB") + size(ref_ann, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + size(ref_sa, "GB") - Float dbsnp_size = size(dbSNP_vcf, "GB") # Align flowcell-level unmapped input bams in parallel scatter (unmapped_bam in flowcell_unmapped_bams) { @@ -102,7 +82,6 @@ workflow to_bam_workflow { input: input_bam = unmapped_bam, metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", - disk_size = unmapped_bam_size + additional_disk, preemptible_tries = preemptible_tries } @@ -124,12 +103,8 @@ workflow to_bam_workflow { ref_bwt = ref_bwt, ref_pac = ref_pac, ref_sa = ref_sa, - additional_disk = additional_disk, compression_level = compression_level, - preemptible_tries = preemptible_tries, - bwa_ref_size = bwa_ref_size, - disk_multiplier = bwa_disk_multiplier, - unmapped_bam_size = unmapped_bam_size + preemptible_tries = preemptible_tries } } @@ -150,9 +125,6 @@ workflow to_bam_workflow { ref_pac = ref_pac, ref_sa = ref_sa, bwa_version = GetBwaVersion.version, - # The merged bam can be bigger than only the aligned bam, - # so account for the output size by multiplying the input size by 2.75. - disk_size = unmapped_bam_size + bwa_ref_size + (bwa_disk_multiplier * unmapped_bam_size) + additional_disk, compression_level = compression_level, preemptible_tries = preemptible_tries } @@ -168,7 +140,6 @@ workflow to_bam_workflow { input: input_bam = output_aligned_bam, output_bam_prefix = unmapped_bam_basename + ".readgroup", - disk_size = mapped_bam_size + additional_disk, preemptible_tries = preemptible_tries } } @@ -188,26 +159,22 @@ workflow to_bam_workflow { input_bams = output_aligned_bam, output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked", metrics_filename = base_file_name + ".duplicate_metrics", - # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs - # and the merged output. - disk_size = (md_disk_multiplier * SumFloats.total_size) + additional_disk, + total_input_size = SumFloats.total_size, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } - Float agg_bam_size = size(MarkDuplicates.output_bam, "GB") - # Sort aggregated+deduped BAM file and fix tags call Processing.SortSam as SortSampleBam { input: input_bam = MarkDuplicates.output_bam, output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted", - # This task spills to disk so we need space for the input bam, the output bam, and any spillage. 
- disk_size = (sort_sam_disk_multiplier * agg_bam_size) + additional_disk, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } + Float agg_bam_size = size(SortSampleBam.output_bam, "GB") + if (defined(haplotype_database_file)) { # Check identity of fingerprints across readgroups call QC.CrossCheckFingerprints as CrossCheckFingerprints { @@ -216,7 +183,7 @@ workflow to_bam_workflow { input_bam_indexes = SortSampleBam.output_bam_index, haplotype_database_file = haplotype_database_file, metrics_filename = base_file_name + ".crosscheck", - disk_size = agg_bam_size + additional_disk, + total_input_size = agg_bam_size, preemptible_tries = agg_preemptible_tries } } @@ -239,7 +206,6 @@ workflow to_bam_workflow { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, output_prefix = base_file_name + ".preBqsr", - disk_size = agg_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries, contamination_underestimation_factor = 0.75 } @@ -266,8 +232,7 @@ workflow to_bam_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # We need disk to localize the sharded bam due to the scatter. - disk_size = (agg_bam_size / bqsr_divisor) + ref_size + dbsnp_size + additional_disk, + bqsr_scatter = bqsr_divisor, preemptible_tries = agg_preemptible_tries } } @@ -278,7 +243,6 @@ workflow to_bam_workflow { input: input_bqsr_reports = BaseRecalibrator.recalibration_report, output_report_filename = base_file_name + ".recal_data.csv", - disk_size = additional_disk, preemptible_tries = preemptible_tries } @@ -293,27 +257,22 @@ workflow to_bam_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - # We need disk to localize the sharded bam and the sharded output due to the scatter. 
- disk_size = ((agg_bam_size * 3) / bqsr_divisor) + ref_size + additional_disk, + bqsr_scatter = bqsr_divisor, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } } # Merge the recalibrated BAM files resulting from by-interval recalibration - call Processing.GatherBamFiles as GatherBamFiles { + call Processing.GatherSortedBamFiles as GatherBamFiles { input: input_bams = ApplyBQSR.recalibrated_bam, output_bam_basename = base_file_name, - # Multiply the input bam size by two to account for the input and output - disk_size = (2 * agg_bam_size) + additional_disk, + total_input_size = agg_bam_size, compression_level = compression_level, preemptible_tries = agg_preemptible_tries } - #BQSR bins the qualities which makes a significantly smaller bam - Float binned_qual_bam_size = size(GatherBamFiles.output_bam, "GB") - # QC the final BAM (consolidated after scattered BQSR) call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { input: @@ -323,7 +282,6 @@ workflow to_bam_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -336,7 +294,6 @@ workflow to_bam_workflow { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -351,7 +308,6 @@ workflow to_bam_workflow { genotypes_index = fingerprint_genotypes_index, output_basename = base_file_name, sample = sample_name, - disk_size = binned_qual_bam_size + additional_disk, preemptible_tries = agg_preemptible_tries } } @@ -366,7 +322,6 @@ workflow to_bam_workflow { ref_fasta_index = ref_fasta_index, wgs_coverage_interval_list = wgs_coverage_interval_list, read_length = read_length, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -380,7 +335,6 @@ workflow to_bam_workflow { ref_fasta_index = ref_fasta_index, wgs_coverage_interval_list = wgs_coverage_interval_list, read_length = read_length, - disk_size = binned_qual_bam_size + ref_size + additional_disk, preemptible_tries = agg_preemptible_tries } @@ -390,7 +344,6 @@ workflow to_bam_workflow { input_bam = GatherBamFiles.output_bam, input_bam_index = GatherBamFiles.output_bam_index, read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5", - disk_size = binned_qual_bam_size + additional_disk, preemptible_tries = agg_preemptible_tries } diff --git a/tasks_pipelines/utilities.wdl b/tasks_pipelines/utilities.wdl index d2136aa..7845821 100644 --- a/tasks_pipelines/utilities.wdl +++ b/tasks_pipelines/utilities.wdl @@ -105,7 +105,7 @@ task ScatterIntervalList { Int interval_count = read_int(stdout()) } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" memory: "2 GB" } } @@ -117,9 +117,11 @@ task ConvertToCram { File ref_fasta File ref_fasta_index String output_basename - Float disk_size Int preemptible_tries + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + Int disk_size = ceil(2 * size(input_bam, "GB") + ref_size) + 20 + command <<< set -e set -o pipefail @@ -136,11 +138,11 @@ task ConvertToCram { samtools index ${output_basename}.cram >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: 
"us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: preemptible_tries memory: "3 GB" cpu: "1" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File output_cram = "${output_basename}.cram" @@ -165,7 +167,7 @@ task ConvertToBam { samtools index ${output_basename}.bam >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" preemptible: 3 memory: "3 GB" cpu: "1" @@ -193,4 +195,3 @@ task SumFloats { preemptible: preemptible_tries } } -