Skip to content

Commit

Permalink
Updates to ONT workflows to bring them closer to current conventions (#…
Browse files Browse the repository at this point in the history
…466)

- Documentation updates for `SRWholeGenome.wdl` and `ONTWholeGenome.wdl`
- `VariantUtils::MergeAndSortVCFs` now has a fall-back to bcftools concat with normal compression if `--naive` fails.
- Fixed `VariantUtils::MergeAndSortVCFs` to use `lr-basic:0.1.2`
- `CallVariantsONT.wdl` now takes a `ref_map` instead of reference fasta/fai/dict.
- `CallVariantsONT.wdl` now filters out the `mt_chr_name` if `call_small_vars_on_mitochondria` is set to `false`.
  • Loading branch information
jonn-smith authored Aug 16, 2024
1 parent 6e1a2cc commit 821e81b
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 30 deletions.
3 changes: 2 additions & 1 deletion wdl/pipelines/ILMN/VariantCalling/SRWholeGenome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ workflow SRWholeGenome {
parameter_meta {
aligned_bams: "Array of aligned bam files to process."
aligned_bais: "Array of aligned bam indices to process. Order must correspond to `aligned_bams`."
ref_map_file: "Reference map file indicating reference sequence and auxillary file locations"
ref_map_file: "Table indicating reference sequence, auxillary file locations, and metadata."
participant_name: "The unique identifier of this sample being processed."

enable_hc_pileup_mode: "If true, will enable `pileup mode` in HaplotypeCaller."
Expand Down Expand Up @@ -104,6 +104,7 @@ workflow SRWholeGenome {
Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task!
}

# Read ref map into map data type so we can access its fields:
Map[String, String] ref_map = read_map(ref_map_file)

# gather across (potential multiple) input CCS BAMs
Expand Down
7 changes: 3 additions & 4 deletions wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ workflow ONTWholeGenome {
aligned_bais: "GCS path to aligned BAM file indices"
participant_name: "name of the participant from whom these samples were obtained"

ref_map_file: "table indicating reference sequence and auxillary file locations"
ref_map_file: "Table indicating reference sequence, auxillary file locations, and metadata."
gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"

call_svs: "whether to call SVs"
Expand Down Expand Up @@ -60,6 +60,7 @@ workflow ONTWholeGenome {
File? ref_scatter_interval_list_ids
}

# Read ref map into map data type so we can access its fields:
Map[String, String] ref_map = read_map(ref_map_file)

String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTWholeGenome/~{participant_name}"
Expand Down Expand Up @@ -120,9 +121,7 @@ workflow ONTWholeGenome {
bam = usable_bam,
bai = usable_bai,
sample_id = participant_name,
ref_fasta = ref_map['fasta'],
ref_fasta_fai = ref_map['fai'],
ref_dict = ref_map['dict'],
ref_map_file = ref_map_file,
tandem_repeat_bed = ref_map['tandem_repeat_bed'],

prefix = participant_name,
Expand Down
23 changes: 21 additions & 2 deletions wdl/tasks/Utility/VariantUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,27 @@ task MergeAndSortVCFs {
echo "==========================================================="
echo "starting concatenation" && date
echo "==========================================================="
# Try this first. If it fails, do a regular compression.
set +e
bcftools \
concat \
--naive \
--threads ~{cores-1} \
-f all_raw_vcfs.txt \
--output-type v \
-o concatedated_raw.vcf.gz # fast, at the expense of disk space
r=$?
if [[ $r -ne 0 ]] ; then
# Could not naively concatenate, so do a regular compression.
bcftools \
concat \
--threads ~{cores-1} \
-f all_raw_vcfs.txt \
--output-type z2 \
-o concatedated_raw.vcf.gz
fi
set -e
for vcf in ~{sep=' ' vcfs}; do rm $vcf ; done
# this is another bug in bcftools that's hot fixed but not in official release yet
Expand Down Expand Up @@ -153,6 +167,8 @@ task MergeAndSortVCFs {
else
mv concatedated_raw.vcf.gz tmp.wgs.vcf.gz
fi
# Fix file header:
bcftools reheader \
--fai ~{ref_fasta_fai} \
-o wgs_raw.vcf.gz \
Expand All @@ -168,7 +184,10 @@ task MergeAndSortVCFs {
--output-type z \
-o ~{prefix}.vcf.gz \
wgs_raw.vcf.gz
# Index the output:
bcftools index --tbi --force ~{prefix}.vcf.gz
echo "==========================================================="
echo "done sorting" && date
echo "==========================================================="
Expand All @@ -187,7 +206,7 @@ task MergeAndSortVCFs {
boot_disk_gb: 10,
preemptible_tries: 1,
max_retries: 0,
docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest"
docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.2"
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
runtime {
Expand Down
49 changes: 26 additions & 23 deletions wdl/tasks/VariantCalling/CallVariantsONT.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,18 @@ workflow CallVariants {
minsvlen: "Minimum SV length in bp (default: 50)"
prefix: "Prefix for output files"
sample_id: "Sample ID"
ref_fasta: "Reference FASTA file"
ref_fasta_fai: "Reference FASTA index file"
ref_dict: "Reference FASTA dictionary file"

ref_map_file: "Table indicating reference sequence, auxillary file locations, and metadata."

call_svs: "Call structural variants"
fast_less_sensitive_sv: "to trade less sensitive SV calling for faster speed"
tandem_repeat_bed: "BED file containing TRF finder for better PBSV calls (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)"

call_small_variants: "Call small variants"
call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria"
sites_vcf: "for use with Clair"
sites_vcf_tbi: "for use with Clair"

run_dv_pepper_analysis: "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)"
dvp_threads: "number of threads for DV-Pepper"
dvp_memory: "memory for DV-Pepper"
Expand All @@ -43,9 +45,7 @@ workflow CallVariants {
String prefix
String sample_id

File ref_fasta
File ref_fasta_fai
File ref_dict
File ref_map_file

Boolean call_svs
Boolean fast_less_sensitive_sv
Expand All @@ -63,6 +63,9 @@ workflow CallVariants {
File? ref_scatter_interval_list_ids
}

# Read ref map into map data type so we can access its fields:
Map[String, String] ref_map = read_map(ref_map_file)

######################################################################
# Block for small variants handling
######################################################################
Expand All @@ -73,10 +76,10 @@ workflow CallVariants {
if (call_small_variants) {
# Scatter by chromosome
Array[String] default_filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV']
Array[String] use_filter = if (call_small_vars_on_mitochondria) then default_filter else flatten([['chrM'],default_filter])
Array[String] use_filter = if (call_small_vars_on_mitochondria) then default_filter else flatten([['chrM', ref_map["mt_chr_name"]], default_filter])
call Utils.MakeChrIntervalList as SmallVariantsScatterPrepp {
input:
ref_dict = ref_dict,
ref_dict = ref_map["dict"],
filter = use_filter
}

Expand All @@ -95,8 +98,8 @@ workflow CallVariants {
bam = SmallVariantsScatter.subset_bam,
bai = SmallVariantsScatter.subset_bai,

ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta = ref_map["fasta"],
ref_fasta_fai = ref_map["fai"],

sites_vcf = sites_vcf,
sites_vcf_tbi = sites_vcf_tbi,
Expand All @@ -109,14 +112,14 @@ workflow CallVariants {
call VariantUtils.MergeAndSortVCFs as MergeAndSortClairVCFs {
input:
vcfs = Clair.vcf,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_fai = ref_map["fai"],
prefix = prefix + ".clair"
}

call VariantUtils.MergeAndSortVCFs as MergeAndSortClair_gVCFs {
input:
vcfs = Clair.gvcf,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_fai = ref_map["fai"],
prefix = prefix + ".clair.g"
}

Expand All @@ -142,8 +145,8 @@ workflow CallVariants {
input:
bam = size_balanced_scatter.subset_bam,
bai = size_balanced_scatter.subset_bai,
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta = ref_map["fasta"],
ref_fasta_fai = ref_map["fai"],
threads = select_first([dvp_threads]),
memory = select_first([dvp_memory]),
zones = arbitrary.zones
Expand All @@ -156,21 +159,21 @@ workflow CallVariants {
input:
vcfs = Pepper.gVCF,
prefix = dvp_prefix + ".g",
ref_fasta_fai = ref_fasta_fai
ref_fasta_fai = ref_map["fai"]
}

call VariantUtils.MergeAndSortVCFs as MergeDeepVariantPhasedVCFs {
input:
vcfs = Pepper.phasedVCF,
prefix = dvp_prefix + ".phased",
ref_fasta_fai = ref_fasta_fai
ref_fasta_fai = ref_map["fai"]
}

call VariantUtils.MergeAndSortVCFs as MergeDeepVariantVCFs {
input:
vcfs = Pepper.VCF,
prefix = dvp_prefix,
ref_fasta_fai = ref_fasta_fai
ref_fasta_fai = ref_map["fai"]
}

call Utils.MergeBams {
Expand All @@ -188,7 +191,7 @@ workflow CallVariants {

call Utils.MakeChrIntervalList {
input:
ref_dict = ref_dict,
ref_dict = ref_map["dict"],
filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV']
}

Expand All @@ -206,8 +209,8 @@ workflow CallVariants {
input:
bam = SubsetBam.subset_bam,
bai = SubsetBam.subset_bai,
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta = ref_map["fasta"],
ref_fasta_fai = ref_map["fai"],
prefix = prefix,
tandem_repeat_bed = tandem_repeat_bed,
is_ccs = false,
Expand All @@ -225,7 +228,7 @@ workflow CallVariants {
call VariantUtils.MergePerChrCalls as MergePBSVVCFs {
input:
vcfs = RunPBSV.vcf,
ref_dict = ref_dict,
ref_dict = ref_map["dict"],
prefix = prefix + ".pbsv"
}

Expand All @@ -236,8 +239,8 @@ workflow CallVariants {
input:
bam = bam,
bai = bai,
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta = ref_map["fasta"],
ref_fasta_fai = ref_map["fai"],
prefix = prefix,
tandem_repeat_bed = tandem_repeat_bed,
is_ccs = false,
Expand Down

0 comments on commit 821e81b

Please sign in to comment.