This repository has been archived by the owner on Jan 25, 2020. It is now read-only.

Commit

Wdlupdate (#6)
* Fix for splitRG task. Disk sizing now takes place within tasks.
bshifaw authored Mar 23, 2018
1 parent 84a2325 commit bbfdfb3
Showing 9 changed files with 203 additions and 256 deletions.
78 changes: 11 additions & 67 deletions fc_germline_single_sample_workflow.wdl
@@ -73,29 +73,12 @@ workflow germline_single_sample_workflow {
  Int preemptible_tries
  Int agg_preemptible_tries

- # Optional input to increase all disk sizes in case of outlier sample with strange size behavior
- Int? increase_disk_size
-
  Boolean skip_QC
  Boolean make_gatk4_single_sample_vcf
  Boolean use_gatk4_haplotype_caller

  Float cutoff_for_large_rg_in_gb = 20.0

- # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a
- # Cromwell error from asking for 0 disk when the input is less than 1GB
- Int additional_disk = select_first([increase_disk_size, 20])
- # Sometimes the output is larger than the input, or a task can spill to disk. In these cases we need to account for the
- # input (1) and the output (1.5), or the input (1), the output (1), and spillage (.5).
- Float bwa_disk_multiplier = 2.5
- # SortSam spills to disk a lot more because we now store only 300000 records in RAM (it's faster for our data),
- # so it needs more disk space. Also it spills to disk in an uncompressed format, so we need to account for that with a
- # larger multiplier.
- Float sort_sam_disk_multiplier = 3.25
-
- # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggle room.
- Float md_disk_multiplier = 2.25
-
  String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta"

  String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated"
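
The block deleted above was the heart of the old scheme: one set of padding and multiplier constants, computed once at the workflow level and threaded into every call. Per the commit message, each task now derives its own disk request from its inputs instead. As a rough illustration, a task following the new pattern might look like the draft-2 WDL sketch below — the task name, command, and jar path are illustrative assumptions, not code from this commit; only the 3.25 multiplier and the 20 GB padding come from the constants being deleted here.

    # Sketch only: a task that sizes its own disk rather than taking a disk_size input.
    task SortSamSketch {
      File input_bam
      Int compression_level
      Int preemptible_tries

      # Same reasoning as the deleted workflow-level constants: SortSam spills to
      # disk in an uncompressed format, so reserve ~3.25x the input size, plus
      # 20 GB of padding so Cromwell is never asked for 0 disk on tiny inputs.
      Float sort_sam_disk_multiplier = 3.25
      Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GB")) + 20

      command {
        # Jar path is a placeholder; point it at your own Picard install.
        java -Dsamjdk.compression_level=${compression_level} -jar /path/to/picard.jar \
          SortSam INPUT=${input_bam} OUTPUT=sorted.bam SORT_ORDER=coordinate CREATE_INDEX=true
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
      output {
        File output_bam = "sorted.bam"
      }
    }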
@@ -106,25 +89,19 @@ workflow germline_single_sample_workflow {
  # by MergeBamAlignment.
  call Alignment.GetBwaVersion

- # Get the size of the standard reference files as well as the additional reference files needed for BWA
- Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
- Float bwa_ref_size = ref_size + size(ref_alt, "GB") + size(ref_amb, "GB") + size(ref_ann, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + size(ref_sa, "GB")
- Float dbsnp_size = size(dbSNP_vcf, "GB")

  # Align flowcell-level unmapped input bams in parallel
  scatter (unmapped_bam in flowcell_unmapped_bams) {

- Float unmapped_bam_size = size(unmapped_bam, "GB")

  String unmapped_bam_basename = basename(unmapped_bam, unmapped_bam_suffix)

  if (!skip_QC) {
  # QC the unmapped BAM
  call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics {
  input:
  input_bam = unmapped_bam,
  metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics",
- disk_size = unmapped_bam_size + additional_disk,
  preemptible_tries = preemptible_tries
  }
  }
@@ -147,12 +124,8 @@ workflow germline_single_sample_workflow {
  ref_bwt = ref_bwt,
  ref_pac = ref_pac,
  ref_sa = ref_sa,
- additional_disk = additional_disk,
  compression_level = compression_level,
- preemptible_tries = preemptible_tries,
- bwa_ref_size = bwa_ref_size,
- disk_multiplier = bwa_disk_multiplier,
- unmapped_bam_size = unmapped_bam_size
+ preemptible_tries = preemptible_tries
  }
  }

@@ -173,9 +146,6 @@ workflow germline_single_sample_workflow {
  ref_pac = ref_pac,
  ref_sa = ref_sa,
  bwa_version = GetBwaVersion.version,
- # The merged bam can be bigger than only the aligned bam,
- # so account for the output size by multiplying the input size by 2.75.
- disk_size = unmapped_bam_size + bwa_ref_size + (bwa_disk_multiplier * unmapped_bam_size) + additional_disk,
  compression_level = compression_level,
  preemptible_tries = preemptible_tries
  }
@@ -192,7 +162,6 @@ workflow germline_single_sample_workflow {
  input:
  input_bam = output_aligned_bam,
  output_bam_prefix = unmapped_bam_basename + ".readgroup",
- disk_size = mapped_bam_size + additional_disk,
  preemptible_tries = preemptible_tries
  }
  }
@@ -213,22 +182,16 @@ workflow germline_single_sample_workflow {
  input_bams = output_aligned_bam,
  output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked",
  metrics_filename = base_file_name + ".duplicate_metrics",
- # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs
- # and the merged output.
- disk_size = (md_disk_multiplier * SumFloats.total_size) + additional_disk,
+ total_input_size = SumFloats.total_size,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }

- Float agg_bam_size = size(MarkDuplicates.output_bam, "GB")
-
  # Sort aggregated+deduped BAM file
  call Processing.SortSam as SortSampleBam {
  input:
  input_bam = MarkDuplicates.output_bam,
  output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted",
- # This task spills to disk so we need space for the input bam, the output bam, and any spillage.
- disk_size = (sort_sam_disk_multiplier * agg_bam_size) + additional_disk,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }
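
MarkDuplicates no longer receives a precomputed disk_size; it gets the summed size of its scattered inputs (total_input_size = SumFloats.total_size) and can apply the 2.25x multiplier internally. A hedged sketch of what the consuming side of that contract could look like — the task body is illustrative, not this repository's code:

    # Sketch only: a gather-style task sized from the summed input size it is handed.
    task MarkDuplicatesSketch {
      Array[File] input_bams
      Float total_input_size   # sum of the readgroup bam sizes, computed upstream
      Int preemptible_tries

      # The deleted workflow-level comment, now task-local: the merged output is
      # slightly smaller than the sum of its parts, so 2.25x leaves ~0.25x slack.
      Float md_disk_multiplier = 2.25
      Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20

      command {
        echo "MarkDuplicates over: ${sep=' ' input_bams}"   # placeholder command
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
    }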
@@ -251,7 +214,6 @@ workflow germline_single_sample_workflow {
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
  output_prefix = base_file_name + ".preBqsr",
- disk_size = agg_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries,
  contamination_underestimation_factor = 0.75
  }
@@ -278,8 +240,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # We need disk to localize the sharded bam due to the scatter.
- disk_size = (agg_bam_size / bqsr_divisor) + ref_size + dbsnp_size + additional_disk,
+ bqsr_scatter = bqsr_divisor,
  preemptible_tries = agg_preemptible_tries
  }
  }
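
BaseRecalibrator runs in a scatter, so instead of the workflow dividing the bam size by bqsr_divisor, the task now receives the scatter width (bqsr_scatter = bqsr_divisor) and can do the division itself; ApplyBQSR below and the hc_scatter input on the HaplotypeCaller calls follow the same convention. A hedged sketch under that assumption, with illustrative names:

    # Sketch only: a sharded task dividing its disk estimate by the scatter width.
    task BaseRecalibratorSketch {
      File input_bam
      File ref_dict
      File ref_fasta
      File ref_fasta_index
      Int bqsr_scatter          # number of shards the bam is split across
      Int preemptible_tries

      Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
      # Each shard localizes only ~1/bqsr_scatter of the bam, plus the full reference.
      Int disk_size = ceil(size(input_bam, "GB") / bqsr_scatter + ref_size) + 20

      command {
        echo "recalibrating one shard with ${disk_size} GB of disk"   # placeholder command
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
    }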
@@ -290,7 +251,6 @@ workflow germline_single_sample_workflow {
  input:
  input_bqsr_reports = BaseRecalibrator.recalibration_report,
  output_report_filename = base_file_name + ".recal_data.csv",
- disk_size = additional_disk,
  preemptible_tries = preemptible_tries
  }

@@ -305,20 +265,20 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # We need disk to localize the sharded bam and the sharded output due to the scatter.
- disk_size = ((agg_bam_size * 3) / bqsr_divisor) + ref_size + additional_disk,
+ bqsr_scatter = bqsr_divisor,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }
  }

+ Float agg_bam_size = size(SortSampleBam.output_bam, "GB")
+
  # Merge the recalibrated BAM files resulting from by-interval recalibration
- call Processing.GatherBamFiles as GatherBamFiles {
+ call Processing.GatherSortedBamFiles as GatherBamFiles {
  input:
  input_bams = ApplyBQSR.recalibrated_bam,
  output_bam_basename = base_file_name,
- # Multiply the input bam size by two to account for the input and output
- disk_size = (2 * agg_bam_size) + additional_disk,
+ total_input_size = agg_bam_size,
  compression_level = compression_level,
  preemptible_tries = agg_preemptible_tries
  }
@@ -338,7 +298,6 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -351,7 +310,6 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -365,7 +323,6 @@ workflow germline_single_sample_workflow {
  ref_fasta_index = ref_fasta_index,
  wgs_coverage_interval_list = wgs_coverage_interval_list,
  read_length = read_length,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -379,7 +336,6 @@ workflow germline_single_sample_workflow {
  ref_fasta_index = ref_fasta_index,
  wgs_coverage_interval_list = wgs_coverage_interval_list,
  read_length = read_length,
- disk_size = binned_qual_bam_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -389,14 +345,10 @@ workflow germline_single_sample_workflow {
  input_bam = GatherBamFiles.output_bam,
  input_bam_index = GatherBamFiles.output_bam_index,
  read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5",
- disk_size = binned_qual_bam_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }
  }

- # Germline single sample GVCFs shouldn't get bigger even when the input bam is bigger (after a certain size)
- Int GVCF_disk_size = select_first([increase_disk_size, 30])
-
  # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation
  # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data.
  Float max_duplication_in_reasonable_sample = 0.30
@@ -409,12 +361,9 @@ workflow germline_single_sample_workflow {
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
  output_basename = base_file_name,
- disk_size = (2 * binned_qual_bam_size) + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

- Float cram_size = size(ConvertToCram.output_cram, "GB")
-
  if (!skip_QC) {
  # Check whether the data has massively high duplication or chimerism rates
  call QC.CheckPreValidation as CheckPreValidation {
@@ -441,7 +390,6 @@ workflow germline_single_sample_workflow {
  ignore = ["MISSING_TAG_NM"],
  max_output = 1000000000,
  is_outlier_data = is_outlier_data,
- disk_size = cram_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -474,8 +422,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output.
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk,
+ hc_scatter = hc_divisor,
  preemptible_tries = agg_preemptible_tries
  }

@@ -486,7 +433,6 @@ workflow germline_single_sample_workflow {
  input_vcf_index = HaplotypeCaller4.output_vcf_index,
  vcf_basename = base_file_name,
  interval_list = ScatterIntervalList.out[index],
- disk_size = GVCF_disk_size + GVCF_disk_size + additional_disk,
  preemptible_tries = preemptible_tries
  }
  }
@@ -501,8 +447,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output.
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk,
+ hc_scatter = hc_divisor,
  preemptible_tries = agg_preemptible_tries
  }
  }
@@ -519,7 +464,6 @@ workflow germline_single_sample_workflow {
  input_vcfs = merge_input,
  input_vcfs_indexes = merge_input_index,
  output_vcf_name = final_vcf_base_name + name_token + ".vcf.gz",
- disk_size = GVCF_disk_size,
  preemptible_tries = agg_preemptible_tries
  }

28 changes: 2 additions & 26 deletions germline_single_sample_workflow.wdl
@@ -72,9 +72,6 @@ workflow germline_single_sample_workflow {
  Int preemptible_tries
  Int agg_preemptible_tries

- # Optional input to increase all disk sizes in case of outlier sample with strange size behavior
- Int? increase_disk_size
-
  call ToBam.to_bam_workflow {
  input:
  contamination_sites_ud = contamination_sites_ud,
@@ -103,21 +100,9 @@ workflow germline_single_sample_workflow {
  known_indels_sites_VCFs = known_indels_sites_VCFs,
  known_indels_sites_indices = known_indels_sites_indices,
  preemptible_tries = preemptible_tries,
- agg_preemptible_tries = agg_preemptible_tries,
- increase_disk_size = increase_disk_size
+ agg_preemptible_tries = agg_preemptible_tries
  }

- # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a
- # Cromwell error from asking for 0 disk when the input is less than 1GB
- Int additional_disk = select_first([increase_disk_size, 20])
- # Germline single sample GVCFs shouldn't get bigger even when the input bam is bigger (after a certain size)
- Int GVCF_disk_size = select_first([increase_disk_size, 30])
- #BQSR bins the qualities which makes a significantly smaller bam
- Float binned_qual_bam_size = size(to_bam_workflow.output_bam, "GB")
-
- Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
- Float dbsnp_size = size(dbSNP_vcf, "GB")
-
  # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation
  # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data.
  Float max_duplication_in_reasonable_sample = 0.30
@@ -130,12 +115,9 @@ workflow germline_single_sample_workflow {
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
  output_basename = base_file_name,
- disk_size = (2 * binned_qual_bam_size) + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

- Float cram_size = size(ConvertToCram.output_cram, "GB")
-
  # Check whether the data has massively high duplication or chimerism rates
  call QC.CheckPreValidation as CheckPreValidation {
  input:
@@ -158,7 +140,6 @@ workflow germline_single_sample_workflow {
  ignore = ["MISSING_TAG_NM"],
  max_output = 1000000000,
  is_outlier_data = CheckPreValidation.is_outlier_data,
- disk_size = cram_size + ref_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -189,8 +170,7 @@ workflow germline_single_sample_workflow {
  ref_dict = ref_dict,
  ref_fasta = ref_fasta,
  ref_fasta_index = ref_fasta_index,
- # Divide the total output GVCF size and the input bam size to account for the smaller scattered input and output.
- disk_size = ((binned_qual_bam_size + GVCF_disk_size) / hc_divisor) + ref_size + additional_disk,
+ hc_scatter = hc_divisor,
  preemptible_tries = agg_preemptible_tries
  }
  }
@@ -201,7 +181,6 @@ workflow germline_single_sample_workflow {
  input_vcfs = HaplotypeCaller.output_gvcf,
  input_vcfs_indexes = HaplotypeCaller.output_gvcf_index,
  output_vcf_name = final_gvcf_base_name + ".g.vcf.gz",
- disk_size = GVCF_disk_size,
  preemptible_tries = agg_preemptible_tries
  }

@@ -218,7 +197,6 @@ workflow germline_single_sample_workflow {
  ref_fasta_index = ref_fasta_index,
  ref_dict = ref_dict,
  wgs_calling_interval_list = wgs_calling_interval_list,
- disk_size = gvcf_size + ref_size + dbsnp_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }
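
The same applies to the metrics tasks: with gvcf_size, ref_size, and dbsnp_size gone from the workflow, each task presumably measures its own inputs at localization time. A hedged sketch under that assumption (names illustrative, not this repository's code):

    # Sketch only: a metrics task measuring its own inputs instead of receiving sizes.
    task CollectGvcfCallingMetricsSketch {
      File input_vcf
      File dbSNP_vcf
      File ref_dict
      Int preemptible_tries

      Int disk_size = ceil(size(input_vcf, "GB") + size(dbSNP_vcf, "GB") + size(ref_dict, "GB")) + 20

      command {
        echo "collecting metrics for ${input_vcf}"   # placeholder command
      }
      runtime {
        disks: "local-disk " + disk_size + " HDD"
        preemptible: preemptible_tries
      }
    }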

@@ -232,7 +210,6 @@ workflow germline_single_sample_workflow {
  dbSNP_vcf_index = dbSNP_vcf_index,
  ref_dict = ref_dict,
  wgs_evaluation_interval_list = wgs_evaluation_interval_list,
- disk_size = gvcf_size + dbsnp_size + additional_disk,
  preemptible_tries = agg_preemptible_tries
  }

@@ -296,4 +273,3 @@ workflow germline_single_sample_workflow {
  File output_vcf_index = MergeVCFs.output_vcf_index
  }
  }
-
(Diffs for the remaining 7 changed files are collapsed and not shown here.)
