From fe4d92623850dc48aaf1ec024b41a7b3d981f7a5 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Mon, 12 Sep 2022 23:14:59 +0000
Subject: [PATCH 01/45] Create first working version with vcf2maf

---
 .gitignore      |  3 +++
 README.md       | 34 ++++++++++++++++++++++++++++++++++
 main.nf         | 35 +++++++++++++++++++++++++++++++++++
 nextflow.config |  1 +
 4 files changed, 73 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 main.nf
 create mode 100644 nextflow.config

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dcd7af8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+# Nextflow files
+.nextflow*
+work/
diff --git a/README.md b/README.md
index 663c567..ed52637 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,36 @@
 # nf-vcf2maf
+
 Annotate VCF files using VEP and generate MAF files using vcf2maf
+
+## Setup
+
+```console
+# Install tmux for long-running commands
+sudo yum install -y tmux
+
+# Download Ensembl VEP cache (expands to 22G)
+mkdir -p $HOME/ref/ $HOME/.vep/
+rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz $HOME/ref/
+tar -zvxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
+
+# Download reference genome FASTA file
+mkdir -p $HOME/ref/fasta/
+aws --no-sign-request s3 sync s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/ $HOME/ref/fasta/
+
+# Get example VCF file for testing purposes
+mkdir -p $HOME/data
+echo "Log into AWS using credentials from JumpCloud"
+aws s3 cp s3://ntap-add5-project-tower-bucket/outputs/sarek_syn29793238/VariantCalling/2-001_Plexiform_Neurofibroma/DeepVariant/2-001_Plexiform_Neurofibroma.vcf.gz ~/data/test.vcf.gz
+gunzip ~/data/test.vcf.gz
+head -n 4000 ~/data/test.vcf > ~/data/test-small.vcf
+
+# Install Nextflow
+sudo yum install -y java
+(cd .local/bin && wget -qO- https://get.nextflow.io | bash)
+
+# Stage reference files in memory
+mkdir -p /dev/shm/vep/ /dev/shm/fasta/
+sudo mount -o remount,size=30G /dev/shm  # Increase /dev/shm size
+rsync -avhP $HOME/.vep/ /dev/shm/vep/
+rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/
+```
diff --git a/main.nf b/main.nf
new file mode 100644
index 0000000..f58e631
--- /dev/null
+++ b/main.nf
@@ -0,0 +1,35 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+params.input_vcf = "$HOME/data/test-small.vcf"
+params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta"
+params.vep_data = "/dev/shm/vep/"
+
+process VCF2MAF {
+
+    container 'nfosi/vcf2maf:latest'
+
+    input:
+    path input_vcf
+    path reference_fasta
+    path vep_data
+
+    output:
+    path 'output.maf'
+
+    script:
+    """
+    perl /nf-osi-vcf2maf-*/vcf2maf.pl \
+        --input-vcf ${input_vcf} --output-maf output.maf --ref-fasta ${reference_fasta} \
+        --vep-data ${vep_data} --ncbi-build GRCh38 --max-subpop-af 0.0005 \
+        --vep-path /root/miniconda3/envs/vcf2maf/bin --maf-center "Sage Bionetworks"
+    """
+
+}
+
+workflow {
+    input_vcf_ch = Channel.fromPath(params.input_vcf)
+    VCF2MAF(input_vcf_ch, params.reference_fasta, params.vep_data)
+    VCF2MAF.out.view()
+}
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..d3af3ea
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1 @@
+docker.enabled = true

From 8d180e84ad167b9e130dd84197acd76986531c16 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Tue, 13 Sep 2022 03:52:30 +0000
Subject: [PATCH 02/45] Switch to `gnomad-genomes` container

---
 main.nf | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index
f58e631..aeeedd6 100644 --- a/main.nf +++ b/main.nf @@ -2,13 +2,18 @@ nextflow.enable.dsl=2 -params.input_vcf = "$HOME/data/test-small.vcf" +params.input_vcf = "$HOME/data/test-small*.vcf" params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" params.vep_data = "/dev/shm/vep/" +params.maf_center = "Sage Bionetworks" +params.max_subpop_af = 0.0005 +params.ncbi_build = "GRCh38" + + process VCF2MAF { - container 'nfosi/vcf2maf:latest' + container "sagebionetworks/vcf2maf:gnomad-genomes" input: path input_vcf @@ -19,17 +24,19 @@ process VCF2MAF { path 'output.maf' script: + vep_path = "/root/miniconda3/envs/vep/bin" """ - perl /nf-osi-vcf2maf-*/vcf2maf.pl \ + vcf2maf.pl \ --input-vcf ${input_vcf} --output-maf output.maf --ref-fasta ${reference_fasta} \ - --vep-data ${vep_data} --ncbi-build GRCh38 --max-subpop-af 0.0005 \ - --vep-path /root/miniconda3/envs/vcf2maf/bin --maf-center "Sage Bionetworks" + --vep-data ${vep_data} --ncbi-build ${params.ncbi_build} --max-subpop-af ${params.max_subpop_af} \ + --vep-path ${vep_path} --maf-center ${params.maf_center} """ } + workflow { - input_vcf_ch = Channel.fromPath(params.input_vcf) + input_vcf_ch = Channel.fromPath(params.input_vcf, checkIfExists: true) VCF2MAF(input_vcf_ch, params.reference_fasta, params.vep_data) VCF2MAF.out.view() } From b9c342a6bb0b8cd953fcd3be1787cd2eac0a8c58 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 05:37:20 +0000 Subject: [PATCH 03/45] Add `MERGE_MAFS` process --- main.nf | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/main.nf b/main.nf index aeeedd6..324f1a2 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 -params.input_vcf = "$HOME/data/test-small*.vcf" +params.input = "$HOME/data/example.csv" params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" params.vep_data = "/dev/shm/vep/" @@ -16,27 +16,88 @@ process VCF2MAF { container "sagebionetworks/vcf2maf:gnomad-genomes" input: - path input_vcf + tuple val(meta), path(input_vcf) path reference_fasta path vep_data output: - path 'output.maf' + tuple val(meta), path("${input_vcf}.maf") script: vep_path = "/root/miniconda3/envs/vep/bin" """ vcf2maf.pl \ - --input-vcf ${input_vcf} --output-maf output.maf --ref-fasta ${reference_fasta} \ - --vep-data ${vep_data} --ncbi-build ${params.ncbi_build} --max-subpop-af ${params.max_subpop_af} \ - --vep-path ${vep_path} --maf-center ${params.maf_center} + --input-vcf '${input_vcf}' --output-maf ${input_vcf}.maf --ref-fasta '${reference_fasta}' \ + --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ + --vep-path '${vep_path}' --maf-center '${params.maf_center}' + """ + +} + + +process MERGE_MAFS { + + container "python:3.10.4" + + input: + tuple val(meta), path(input_mafs) + + output: + tuple val(meta), path("${meta.study_id}.maf") + + script: + script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" + """ + wget ${script_url} + python3 merge_mafs.py -o ${meta.study_id}.maf -i ${input_mafs.join(',')} """ } workflow { - input_vcf_ch = Channel.fromPath(params.input_vcf, checkIfExists: true) - VCF2MAF(input_vcf_ch, params.reference_fasta, params.vep_data) - VCF2MAF.out.view() + + input_ch = Channel + .fromPath ( params.input ) + .splitCsv ( header:true, sep:',' ) + .map { create_vcf_channel(it) } + + VCF2MAF(input_ch, params.reference_fasta, params.vep_data) + + merged_inputs_ch = VCF2MAF.out 
+ .filter { meta, maf -> meta.is_releasable } + .map { + vcf_meta, maf -> + def study_meta = [:] + study_meta.merged_parent_id = vcf_meta.merged_parent_id + study_meta.study_id = vcf_meta.study_id + study_meta.variant_class = vcf_meta.variant_class + study_meta.variant_caller = vcf_meta.variant_caller + [ study_meta, maf ] + } + .groupTuple( by: 0 ) + + MERGE_MAFS(merged_inputs_ch) + + MERGE_MAFS.out.view() + +} + + +// Function to get list of [ meta, vcf ] +def create_vcf_channel(LinkedHashMap row) { + + // Create metadata element + def meta = [:] + meta.sample_parent_id = row.sample_parent_id + meta.merged_parent_id = row.merged_parent_id + meta.study_id = row.study_id + meta.variant_class = row.variant_class + meta.variant_caller = row.variant_caller + meta.is_releasable = row.is_releasable.toBoolean() + + // Combine with VCF file element + def vcf_meta = [meta, file(row.vcf_file)] + + return vcf_meta } From b81a9926eeaa745ed3b82fc90e85032ae83fe7fe Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:02:14 +0000 Subject: [PATCH 04/45] Add `FILTER_MAF` process --- main.nf | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 324f1a2..b432708 100644 --- a/main.nf +++ b/main.nf @@ -55,6 +55,37 @@ process MERGE_MAFS { } +process FILTER_MAF { + + container "python:3.10.4" + + input: + tuple val(meta), path(input_maf) + + output: + tuple val(meta), path("${input_maf}.filt.maf") + + script: + """ + #!/usr/bin/env python3 + + import csv + + with ( + open('${input_maf}', newline='') as infile, + open('${input_maf}.filt.maf', "w", newline='') as outfile + ): + reader = csv.DictReader(infile, delimiter='\t') + writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') + for row in reader: + print(row) + if row['FILTER'] == 'PASS': + writer.writerow(row) + """ + +} + + workflow { input_ch = Channel @@ -79,7 +110,9 @@ workflow { MERGE_MAFS(merged_inputs_ch) - MERGE_MAFS.out.view() + FILTER_MAF(MERGE_MAFS.out) + + FILTER_MAF.out.view() } From afbcb7ffe76adfc7ac666db7ebeff300f49559e0 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:02:40 +0000 Subject: [PATCH 05/45] Remove unnecessary print statement --- main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.nf b/main.nf index b432708..2f3b393 100644 --- a/main.nf +++ b/main.nf @@ -78,7 +78,6 @@ process FILTER_MAF { reader = csv.DictReader(infile, delimiter='\t') writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') for row in reader: - print(row) if row['FILTER'] == 'PASS': writer.writerow(row) """ From 5209817bc1d0d7f07b143a200e9ee1c6309fb614 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:29:07 +0000 Subject: [PATCH 06/45] Add `SYNAPSE_STORE` process --- main.nf | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 2f3b393..cc9b423 100644 --- a/main.nf +++ b/main.nf @@ -85,6 +85,23 @@ process FILTER_MAF { } +process SYNAPSE_STORE { + + container "sagebionetworks/synapsepythonclient:v2.6.0" + + secret "SYNAPSE_AUTH_TOKEN" + + input: + tuple path(input), val(parent_id) + + script: + """ + synapse store --parentId '${parent_id}' '${input}' + """ + +} + + workflow { input_ch = Channel @@ -111,7 +128,10 @@ workflow { FILTER_MAF(MERGE_MAFS.out) - FILTER_MAF.out.view() + merged_mafs_ch = MERGE_MAFS.out + .map { meta, maf -> [ maf, meta.merged_parent_id ] } + + SYNAPSE_STORE(merged_mafs_ch) } From 61ac39b6a85208ef77693d2ff1c8bd61d6afc819 
Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:50:29 +0000 Subject: [PATCH 07/45] Add `SYNAPSE_GET` and `SYNAPSE_STORE` processes --- main.nf | 105 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/main.nf b/main.nf index cc9b423..0528d2d 100644 --- a/main.nf +++ b/main.nf @@ -11,6 +11,26 @@ params.max_subpop_af = 0.0005 params.ncbi_build = "GRCh38" +process SYNAPSE_GET { + + container "sagebionetworks/synapsepythonclient:v2.6.0" + + secret "SYNAPSE_AUTH_TOKEN" + + input: + tuple val(meta), val(synapse_id) + + output: + tuple val(meta), path('*') + + script: + """ + synapse get '${synapse_id}' + """ + +} + + process VCF2MAF { container "sagebionetworks/vcf2maf:gnomad-genomes" @@ -102,36 +122,73 @@ process SYNAPSE_STORE { } +workflow SAMPLE_MAFS { + + take: + sample_vcfs + + main: + SYNAPSE_GET(sample_vcfs) + + VCF2MAF(SYNAPSE_GET.out, params.reference_fasta, params.vep_data) + + sample_mafs_ch = VCF2MAF.out + .map { meta, maf -> [ maf, meta.sample_parent_id ] } + + SYNAPSE_STORE(sample_mafs_ch) + + emit: + VCF2MAF.out + +} + + +workflow MERGED_MAFS { + + take: + sample_mafs + + main: + merged_inputs_ch = sample_mafs + .filter { meta, maf -> meta.is_releasable } + .map { + vcf_meta, maf -> + def study_meta = [:] + study_meta.merged_parent_id = vcf_meta.merged_parent_id + study_meta.study_id = vcf_meta.study_id + study_meta.variant_class = vcf_meta.variant_class + study_meta.variant_caller = vcf_meta.variant_caller + [ study_meta, maf ] + } + .groupTuple( by: 0 ) + + MERGE_MAFS(merged_inputs_ch) + + FILTER_MAF(MERGE_MAFS.out) + + merged_mafs_ch = MERGE_MAFS.out + .map { meta, maf -> [ maf, meta.merged_parent_id ] } + + filtered_mafs_ch = FILTER_MAF.out + .map { meta, maf -> [ maf, meta.merged_parent_id ] } + + both_mafs_ch = merged_mafs_ch.mix(filtered_mafs_ch) + + SYNAPSE_STORE(both_mafs_ch) + +} + + workflow { - input_ch = Channel + input_vcfs_ch = Channel .fromPath ( params.input ) .splitCsv ( header:true, sep:',' ) .map { create_vcf_channel(it) } - VCF2MAF(input_ch, params.reference_fasta, params.vep_data) - - merged_inputs_ch = VCF2MAF.out - .filter { meta, maf -> meta.is_releasable } - .map { - vcf_meta, maf -> - def study_meta = [:] - study_meta.merged_parent_id = vcf_meta.merged_parent_id - study_meta.study_id = vcf_meta.study_id - study_meta.variant_class = vcf_meta.variant_class - study_meta.variant_caller = vcf_meta.variant_caller - [ study_meta, maf ] - } - .groupTuple( by: 0 ) - - MERGE_MAFS(merged_inputs_ch) - - FILTER_MAF(MERGE_MAFS.out) - - merged_mafs_ch = MERGE_MAFS.out - .map { meta, maf -> [ maf, meta.merged_parent_id ] } + SAMPLE_MAFS(input_vcfs_ch) - SYNAPSE_STORE(merged_mafs_ch) + MERGED_MAFS(SAMPLE_MAFS.out) } @@ -149,7 +206,7 @@ def create_vcf_channel(LinkedHashMap row) { meta.is_releasable = row.is_releasable.toBoolean() // Combine with VCF file element - def vcf_meta = [meta, file(row.vcf_file)] + def vcf_meta = [meta, row.synapse_id] return vcf_meta } From e81c2afca85de593562522cf46da5e36d4ebbbec Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 08:02:08 +0000 Subject: [PATCH 08/45] Improve output file naming --- main.nf | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 0528d2d..0a081c7 100644 --- a/main.nf +++ b/main.nf @@ -41,13 +41,13 @@ process VCF2MAF { path vep_data output: - tuple val(meta), path("${input_vcf}.maf") + tuple val(meta), path("*.maf") script: vep_path = 
"/root/miniconda3/envs/vep/bin" """ vcf2maf.pl \ - --input-vcf '${input_vcf}' --output-maf ${input_vcf}.maf --ref-fasta '${reference_fasta}' \ + --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf --ref-fasta '${reference_fasta}' \ --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' """ @@ -63,13 +63,15 @@ process MERGE_MAFS { tuple val(meta), path(input_mafs) output: - tuple val(meta), path("${meta.study_id}.maf") + tuple val(meta), path("*.merged.maf") script: script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" """ wget ${script_url} - python3 merge_mafs.py -o ${meta.study_id}.maf -i ${input_mafs.join(',')} + python3 merge_mafs.py \ + -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ + -i ${input_mafs.join(',')} """ } @@ -83,7 +85,7 @@ process FILTER_MAF { tuple val(meta), path(input_maf) output: - tuple val(meta), path("${input_maf}.filt.maf") + tuple val(meta), path("*.passed.maf") script: """ @@ -93,7 +95,7 @@ process FILTER_MAF { with ( open('${input_maf}', newline='') as infile, - open('${input_maf}.filt.maf', "w", newline='') as outfile + open('${input_maf.baseName}.passed.maf', "w", newline='') as outfile ): reader = csv.DictReader(infile, delimiter='\t') writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') From b55a6b29713ebfd847945d4de73b481f0ad98e8f Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 08:07:12 +0000 Subject: [PATCH 09/45] Add TODOs for later --- main.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.nf b/main.nf index 0a081c7..4841964 100644 --- a/main.nf +++ b/main.nf @@ -31,6 +31,7 @@ process SYNAPSE_GET { } +// TODO: Look at other vcf2maf options (like the sample names) process VCF2MAF { container "sagebionetworks/vcf2maf:gnomad-genomes" @@ -55,6 +56,8 @@ process VCF2MAF { } +// TODO: Sanity check output +// TODO: Incorporate script inside container (or copy script to this repo) process MERGE_MAFS { container "python:3.10.4" @@ -77,6 +80,7 @@ process MERGE_MAFS { } +// TODO: Store Python script in a separate file process FILTER_MAF { container "python:3.10.4" @@ -124,6 +128,7 @@ process SYNAPSE_STORE { } +// TODO: Add comments workflow SAMPLE_MAFS { take: @@ -145,6 +150,7 @@ workflow SAMPLE_MAFS { } +// TODO: Add comments workflow MERGED_MAFS { take: From e128c3009154665214357a3c7030b3241d2b1c86 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 21:22:02 +0000 Subject: [PATCH 10/45] Update Docker container tag to official release --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 4841964..0bfeb5d 100644 --- a/main.nf +++ b/main.nf @@ -34,7 +34,7 @@ process SYNAPSE_GET { // TODO: Look at other vcf2maf options (like the sample names) process VCF2MAF { - container "sagebionetworks/vcf2maf:gnomad-genomes" + container "sagebionetworks/vcf2maf:107.1" input: tuple val(meta), path(input_vcf) From 8a735bba40297d8ed23102ecc44213e9c4b42647 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Wed, 14 Sep 2022 00:27:10 +0000 Subject: [PATCH 11/45] Filter individual MAF files before merging --- main.nf | 65 +++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/main.nf b/main.nf index 0bfeb5d..8eb476a 100644 --- a/main.nf +++ b/main.nf @@ -48,33 +48,10 @@ process VCF2MAF { vep_path = 
"/root/miniconda3/envs/vep/bin" """ vcf2maf.pl \ - --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf --ref-fasta '${reference_fasta}' \ + --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf.raw --ref-fasta '${reference_fasta}' \ --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' - """ - -} - - -// TODO: Sanity check output -// TODO: Incorporate script inside container (or copy script to this repo) -process MERGE_MAFS { - - container "python:3.10.4" - - input: - tuple val(meta), path(input_mafs) - - output: - tuple val(meta), path("*.merged.maf") - - script: - script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" - """ - wget ${script_url} - python3 merge_mafs.py \ - -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ - -i ${input_mafs.join(',')} + grep -v '^#' ${input_vcf.baseName}.maf.raw > ${input_vcf.baseName}.maf """ } @@ -103,6 +80,7 @@ process FILTER_MAF { ): reader = csv.DictReader(infile, delimiter='\t') writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') + writer.writeheader() for row in reader: if row['FILTER'] == 'PASS': writer.writerow(row) @@ -111,6 +89,30 @@ process FILTER_MAF { } +// TODO: Sanity check output +// TODO: Incorporate script inside container (or copy script to this repo) +process MERGE_MAFS { + + container "python:3.10.4" + + input: + tuple val(meta), path(input_mafs) + + output: + tuple val(meta), path("*.merged.maf") + + script: + script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" + """ + wget ${script_url} + python3 merge_mafs.py \ + -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ + -i ${input_mafs.join(',')} + """ +} + + + process SYNAPSE_STORE { container "sagebionetworks/synapsepythonclient:v2.6.0" @@ -143,9 +145,11 @@ workflow SAMPLE_MAFS { .map { meta, maf -> [ maf, meta.sample_parent_id ] } SYNAPSE_STORE(sample_mafs_ch) + + FILTER_MAF(VCF2MAF.out) emit: - VCF2MAF.out + FILTER_MAF.out } @@ -172,17 +176,10 @@ workflow MERGED_MAFS { MERGE_MAFS(merged_inputs_ch) - FILTER_MAF(MERGE_MAFS.out) - merged_mafs_ch = MERGE_MAFS.out .map { meta, maf -> [ maf, meta.merged_parent_id ] } - filtered_mafs_ch = FILTER_MAF.out - .map { meta, maf -> [ maf, meta.merged_parent_id ] } - - both_mafs_ch = merged_mafs_ch.mix(filtered_mafs_ch) - - SYNAPSE_STORE(both_mafs_ch) + SYNAPSE_STORE(merged_mafs_ch) } From 4e6b7d15d673e87e9dc9ceac397ca5c68d3f8e70 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 15:57:08 +0000 Subject: [PATCH 12/45] Add process tags and handle gzipped VCF files --- bin/filter_maf.py | 18 ++++++ bin/merge_mafs.py | 153 ++++++++++++++++++++++++++++++++++++++++++++++ main.nf | 69 ++++++++++++--------- 3 files changed, 212 insertions(+), 28 deletions(-) create mode 100755 bin/filter_maf.py create mode 100755 bin/merge_mafs.py diff --git a/bin/filter_maf.py b/bin/filter_maf.py new file mode 100755 index 0000000..d675584 --- /dev/null +++ b/bin/filter_maf.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import csv +import sys + +input_maf = sys.argv[1] +output_maf = sys.argv[2] + +with ( + open(input_maf, newline='') as infile, + open(output_maf, "w", newline='') as outfile +): + reader = csv.DictReader(infile, delimiter='\t') + writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') + writer.writeheader() + for row in reader: + 
if row['FILTER'] == 'PASS': + writer.writerow(row) diff --git a/bin/merge_mafs.py b/bin/merge_mafs.py new file mode 100755 index 0000000..cff5948 --- /dev/null +++ b/bin/merge_mafs.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +# Copied from the genome-nexus/annotation-tools GitHub repository: +# https://github.com/genome-nexus/annotation-tools/blob/2e315735/merge_mafs.py + +# Copyright (c) 2020 Memorial Sloan Kettering Cancer Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import sys +import argparse + +# Script for merging MAFs +# Creates one aggregate header taken from column names acorss MAFs +# Writes out rows while mapping to header +# NAs and equivalents replaced with '' + +TUMOR_SAMPLE_BARCODE_COLUMN = "Tumor_Sample_Barcode" +HUGO_SYMBOL_COLUMN = "Hugo_Symbol" +ENTREZ_GENE_ID_COLUMN = "Entrez_Gene_Id" + +NULL_OR_MISSING_VALUES = ["", "NA", "N/A", None] + +def process_datum(val): + """ Cleans up datum. """ + try: + vfixed = val.strip() + except AttributeError: + vfixed = "" + if vfixed in NULL_OR_MISSING_VALUES: + return "" + return vfixed + +def get_file_header(data_file): + """ + Returns header from MAF + Assumes file is tab-delimited + Assumes header is first non-commented line + """ + header = [] + with open(data_file, "r") as header_source: + for line in header_source: + if not line.startswith("#"): + header = list(map(str.strip, line.rstrip().split("\t"))) + break + if not header: + print("Could not extract header from mutation data file: %s - Exiting..." % (filename)) + sys.exit(2) + return header + +def merge_headers(data_filenames): + """ + Generates a merged header from all input files. + * Also ensures that "Hugo_Symbol" and "Entrez_Gene_Id" + are the first 2 columns in the merged MAF header. + """ + merged_header = [HUGO_SYMBOL_COLUMN, ENTREZ_GENE_ID_COLUMN] + for fname in data_filenames: + columns_to_add = [column for column in get_file_header(fname) if column not in merged_header] + merged_header.extend(columns_to_add) + return merged_header + +def merge_input_mafs(input_mafs, output_maf_filename): + """ Generates a merged MAF given a list of input MAFs. 
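+    Rows from every input are remapped onto the merged header built by
+    merge_headers() (the union of all input headers, with Hugo_Symbol and
+    Entrez_Gene_Id first); columns missing from a given MAF are left blank.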
""" + merged_header = merge_headers(input_mafs) + + rows_to_write = [] + for input_maf in input_mafs: + print('Loading data from ' + input_maf) + header_processed = False + file_header = get_file_header(input_maf) + with open(input_maf, "r") as maf: + for line in maf: + if line.startswith("#"): + continue + if not header_processed: + header_processed = True + continue + # map row values to current header columns + mapped_row = dict(zip(file_header, list(map(lambda x: process_datum(x), line.split("\t"))))) + sample_id = mapped_row[TUMOR_SAMPLE_BARCODE_COLUMN] + + # full record for the merged MAF (if mapped_row does not contain a column in merged_header, it is blank) + normalized_row = list(map(lambda x: mapped_row.get(x, ""), merged_header)) + rows_to_write.append("\t".join(normalized_row)) + + with open(output_maf_filename, "w") as output_maf: + output_maf.write("\t".join(merged_header)) + output_maf.write("\n") + output_maf.write("\n".join(rows_to_write)) + output_maf.write("\n") + print('Merged MAF written to: %s' % (output_maf_filename)) + +def verify_input_mafs(input_mafs): + for maf in input_mafs: + if not os.path.isfile(maf): + print("Could not find MAF by filename: %s" % (maf)) + exit(1) + +def usage(parser): + parser.print_help() + sys.exit(2) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument("-i", "--input-mafs-list", action = "store", dest = "input_mafs_list", help = "comma-delimited list of MAFs to merge") + parser.add_argument("-d", "--input-mafs-directory", action = "store", dest = "input_mafs_directory", help = "directory containing all MAFs to merge") + parser.add_argument("-o", "--output-maf", action = "store", dest = "output_maf", help = "output filename for merged MAF [REQUIRED]") + + args = parser.parse_args() + input_mafs_list = args.input_mafs_list + input_mafs_directory = args.input_mafs_directory + output_maf = args.output_maf + + if not output_maf: + print("Missing required argument: -o | --output-maf") + usage(parser) + + if (input_mafs_list and input_mafs_directory) or (not input_mafs_list and not input_mafs_directory): + print("Please choose only one of the following options when running script: --input-mafs-list | --input-mafs-directory") + usage(parser) + + if input_mafs_list: + input_mafs = list(map(str.strip, input_mafs_list.split(","))) + else: + input_mafs = [] + for maf in os.listdir(input_mafs_directory): + full_maf_path = os.path.join(input_mafs_directory, maf) + if os.path.isfile(full_maf_path): + input_mafs.append(full_maf_path) + else: + print("Skipping non-file element in directory... 
'%s'" % (full_maf_path)) + + verify_input_mafs(input_mafs) + merge_input_mafs(input_mafs, output_maf) diff --git a/main.nf b/main.nf index 8eb476a..b2a2483 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,7 @@ nextflow.enable.dsl=2 params.input = "$HOME/data/example.csv" params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" +params.reference_fasta_fai = "${params.reference_fasta}.fai" params.vep_data = "/dev/shm/vep/" params.maf_center = "Sage Bionetworks" @@ -13,6 +14,8 @@ params.ncbi_build = "GRCh38" process SYNAPSE_GET { + tag "${meta.synapse_id}" + container "sagebionetworks/synapsepythonclient:v2.6.0" secret "SYNAPSE_AUTH_TOKEN" @@ -31,14 +34,21 @@ process SYNAPSE_GET { } -// TODO: Look at other vcf2maf options (like the sample names) +// TODO: Handle VCF genotype columns per variant caller +// TODO: Add appropriate memory allocation process VCF2MAF { - container "sagebionetworks/vcf2maf:107.1" + tag "${meta.synapse_id}" + + container "sagebionetworks/vcf2maf:107.1-debug2" + + cpus 8 + + afterScript "rm -f intermediate*" input: tuple val(meta), path(input_vcf) - path reference_fasta + tuple path(reference_fasta), path(reference_fasta_fai) path vep_data output: @@ -47,19 +57,29 @@ process VCF2MAF { script: vep_path = "/root/miniconda3/envs/vep/bin" """ + if [[ ${input_vcf} == *.gz ]]; then + zcat '${input_vcf}' > 'intermediate.vcf' + else + mv '${input_vcf}' 'intermediate.vcf' + fi + vcf2maf.pl \ - --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf.raw --ref-fasta '${reference_fasta}' \ - --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ - --vep-path '${vep_path}' --maf-center '${params.maf_center}' - grep -v '^#' ${input_vcf.baseName}.maf.raw > ${input_vcf.baseName}.maf + --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ + --ref-fasta '${reference_fasta}' --vep-data '${vep_data}' \ + --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ + --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ + --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' + + grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ } -// TODO: Store Python script in a separate file process FILTER_MAF { + tag "${input_maf.name}" + container "python:3.10.4" input: @@ -70,29 +90,17 @@ process FILTER_MAF { script: """ - #!/usr/bin/env python3 - - import csv - - with ( - open('${input_maf}', newline='') as infile, - open('${input_maf.baseName}.passed.maf', "w", newline='') as outfile - ): - reader = csv.DictReader(infile, delimiter='\t') - writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') - writer.writeheader() - for row in reader: - if row['FILTER'] == 'PASS': - writer.writerow(row) + filter_maf.py ${input_maf} ${input_maf.baseName}.passed.maf """ } // TODO: Sanity check output -// TODO: Incorporate script inside container (or copy script to this repo) process MERGE_MAFS { + tag "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}" + container "python:3.10.4" input: @@ -102,19 +110,18 @@ process MERGE_MAFS { tuple val(meta), path("*.merged.maf") script: - script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" """ - wget ${script_url} - python3 merge_mafs.py \ + merge_mafs.py \ -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ -i ${input_mafs.join(',')} """ } - process SYNAPSE_STORE { + tag 
"${parent_id}/${input.name}" + container "sagebionetworks/synapsepythonclient:v2.6.0" secret "SYNAPSE_AUTH_TOKEN" @@ -137,9 +144,13 @@ workflow SAMPLE_MAFS { sample_vcfs main: + reference_fasta_pair = [ + params.reference_fasta, params.reference_fasta_fai + ] + SYNAPSE_GET(sample_vcfs) - VCF2MAF(SYNAPSE_GET.out, params.reference_fasta, params.vep_data) + VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, params.vep_data) sample_mafs_ch = VCF2MAF.out .map { meta, maf -> [ maf, meta.sample_parent_id ] } @@ -203,6 +214,8 @@ def create_vcf_channel(LinkedHashMap row) { // Create metadata element def meta = [:] + meta.synapse_id = row.synapse_id + meta.biospecimen_id = row.biospecimen_id meta.sample_parent_id = row.sample_parent_id meta.merged_parent_id = row.merged_parent_id meta.study_id = row.study_id From e32b68c2c5c2fc66b82af8ad9e7c6ff838a3c70e Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 16:18:26 +0000 Subject: [PATCH 13/45] Add steps for Nextflow secrets --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed52637..8991cb4 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,12 @@ aws s3 cp s3://ntap-add5-project-tower-bucket/outputs/sarek_syn29793238/VariantC gunzip ~/data/test.vcf.gz head -n 4000 ~/data/test.vcf ~/data/test-small.vcf -# Install Nextflow +# Install and setup Nextflow sudo yum install -y java (cd .local/bin && wget -qO- https://get.nextflow.io | bash) +echo 'export NXF_ENABLE_SECRETS=true' >> ~/.bashrc +source ~/.bashrc +nextflow secrets put -n SYNAPSE_AUTH_TOKEN -v "" # Stage reference files in memory mkdir -p /dev/shm/vep/ /dev/shm/fasta/ From 1203985d2e6ed60a42e5502a5a31537ceddefcf2 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:30:43 +0000 Subject: [PATCH 14/45] Switch to downloading the VEP cache tarball --- main.nf | 26 +++++++++++--------------- nextflow.config | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/main.nf b/main.nf index b2a2483..0e3726d 100644 --- a/main.nf +++ b/main.nf @@ -2,15 +2,6 @@ nextflow.enable.dsl=2 -params.input = "$HOME/data/example.csv" -params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" -params.reference_fasta_fai = "${params.reference_fasta}.fai" -params.vep_data = "/dev/shm/vep/" - -params.maf_center = "Sage Bionetworks" -params.max_subpop_af = 0.0005 -params.ncbi_build = "GRCh38" - process SYNAPSE_GET { @@ -35,14 +26,15 @@ process SYNAPSE_GET { // TODO: Handle VCF genotype columns per variant caller -// TODO: Add appropriate memory allocation +// TODO: Improve handling of vep_path process VCF2MAF { tag "${meta.synapse_id}" - container "sagebionetworks/vcf2maf:107.1-debug2" + container "sagebionetworks/vcf2maf:107.1" cpus 8 + memory '16.GB' afterScript "rm -f intermediate*" @@ -58,17 +50,21 @@ process VCF2MAF { vep_path = "/root/miniconda3/envs/vep/bin" """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' > 'intermediate.vcf' + zcat '${input_vcf}' | head 10000 > 'intermediate.vcf' else - mv '${input_vcf}' 'intermediate.vcf' + cat '${input_vcf}' | head 10000 > 'intermediate.vcf' fi + mkdir -p 'vep_data/' + tar -zxf '${vep_data}' -C 'vep_data/' + vcf2maf.pl \ --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ - --ref-fasta '${reference_fasta}' --vep-data '${vep_data}' \ + --ref-fasta '${reference_fasta}' --vep-data 'vep_data/' \ --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' 
--maf-center '${params.maf_center}' \ - --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' + --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' \ + --species ${params.species} grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ diff --git a/nextflow.config b/nextflow.config index d3af3ea..bd4251c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1 +1,33 @@ -docker.enabled = true +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + nf-core/rnaseq Nextflow config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Default config options for all compute environments +---------------------------------------------------------------------------------------- +*/ + +// Input parameters +params.input = null +params.reference_fasta = "s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" +params.reference_fasta_fai = "${params.reference_fasta}.fai" +params.vep_data = "s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz" + +// vcf2maf parameters +params.maf_center = "Sage Bionetworks" +params.max_subpop_af = 0.0005 +params.ncbi_build = "GRCh38" +params.species = "homo_sapiens" + +// Profiles +profiles { + + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + +} From 2617be492328f4a5724f6761132b52fa97bb5ed6 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:30:53 +0000 Subject: [PATCH 15/45] Add performance benchmarks --- README.md | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/README.md b/README.md index 8991cb4..735fe4c 100644 --- a/README.md +++ b/README.md @@ -37,3 +37,127 @@ sudo mount -o remount,size=30G /dev/shm # Increase /dev/shm size rsync -avhP $HOME/.vep/ /dev/shm/vep/ rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/ ``` + +## Benchmarks + +### Ensembl VEP cache + +I want to see what's the fastest way of downloading the VEP cache. 
Based on the tests below, here are some options:

- Download tarball from Ensembl and extract: 16m34s (10m23s + 6m11s)
- Download tarball from S3 and extract: 9m25s (3m14s + 6m11s)
- Download extracted tarball from S3: 4m5s

#### Using rsync with Ensembl's servers

```console
$ time rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./

real 10m23.648s
user 0m50.824s
sys 0m44.770s
```

#### Using S3 with our public bucket

```console
$ time aws --no-sign-request s3 cp s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./s3-homo_sapiens_vep_107_GRCh38.tar.gz

real 3m14.885s
user 3m31.085s
sys 4m1.697s
```

#### Extracting the tarball

```console
$ time tar -zxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/

real 6m11.087s
user 2m59.190s
sys 0m40.660s
```

#### Downloading the extracted tarball

```console
$ time aws --no-sign-request s3 sync s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/ ./s3-107_GRCh38/

real 4m5.307s
user 5m13.736s
sys 3m24.302s
```

### Nextflow workflow runs

#### Original (all reference files in `/dev/shm`)

```console
$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv

real 3m43.240s
user 0m29.439s
sys 0m1.415s
```

#### VEP in shm and FASTA in S3

Note that I needed to set `aws.client.anonymous = true` in the Nextflow config.

```console
$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta

real 4m9.529s
user 0m56.803s
sys 0m5.996s
```

#### VEP in non-shm and FASTA in S3

I'm curious about the difference in performance between SHM and non-SHM.

```console
$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/.vep/homo_sapiens/107_GRCh38/

real 3m43.073s
user 0m27.599s
sys 0m1.449s
```

#### VEP folder and FASTA in S3

I had to kill this job because there was barely any CPU being used. It seemed stuck on staging the `107_GRCh38/` folder. Nextflow seems to stage individual files much more efficiently than whole folders, whereas the AWS CLI syncs the same folder in about four minutes.
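As an aside, the anonymous S3 access mentioned above should boil down to a single config line; this is only a sketch, assuming no other `aws` client options are needed:

```groovy
// nextflow.config: read public buckets (e.g., sage-igenomes) without AWS credentials
aws.client.anonymous = true
```

With that in place, the `s3://sage-igenomes` paths used in these runs can be staged without credentials.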
+ +```console +$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/ + +^C + +real 17m7.522s +user 2m28.423s +sys 0m38.765s +``` + +#### VEP tarball in non-shm and FASTA in S3 + + +```console +$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/ref/homo_sapiens_vep_107_GRCh38.tar.gz + +real 8m39.798s +user 0m29.715s +sys 0m1.455s +``` + + +#### VEP tarball and FASTA in S3 + + +```console +$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz + +real 8m38.703s +user 2m47.076s +sys 0m50.287s +``` + From 06c92886e3b5fb39d7e1512e94db235b1ae77848 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 00:09:38 +0000 Subject: [PATCH 16/45] Split off tarball extraction --- main.nf | 32 +++++++++++++++++++++++++------- nextflow.config | 2 +- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 0e3726d..999d4ac 100644 --- a/main.nf +++ b/main.nf @@ -25,6 +25,25 @@ process SYNAPSE_GET { } +process EXTRACT_TAR_GZ { + + container "sagebionetworks/vcf2maf:107.1" + + input: + path vep_tarball + + output: + path "vep_data" + + script: + """ + mkdir -p 'vep_data/' + tar -zxf '${vep_tarball}' -C 'vep_data/' + """ + +} + + // TODO: Handle VCF genotype columns per variant caller // TODO: Improve handling of vep_path process VCF2MAF { @@ -50,17 +69,14 @@ process VCF2MAF { vep_path = "/root/miniconda3/envs/vep/bin" """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' | head 10000 > 'intermediate.vcf' + zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' else - cat '${input_vcf}' | head 10000 > 'intermediate.vcf' + cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' fi - mkdir -p 'vep_data/' - tar -zxf '${vep_data}' -C 'vep_data/' - vcf2maf.pl \ --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ - --ref-fasta '${reference_fasta}' --vep-data 'vep_data/' \ + --ref-fasta '${reference_fasta}' --vep-data '${vep_data}/' \ --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' \ @@ -146,7 +162,9 @@ workflow SAMPLE_MAFS { SYNAPSE_GET(sample_vcfs) - VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, params.vep_data) + EXTRACT_TAR_GZ(params.vep_tarball) + + VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, EXTRACT_TAR_GZ.out) sample_mafs_ch = VCF2MAF.out .map { meta, maf -> [ maf, meta.sample_parent_id ] } diff --git a/nextflow.config b/nextflow.config index bd4251c..1b7503e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,7 @@ params.input = null params.reference_fasta = "s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" params.reference_fasta_fai = "${params.reference_fasta}.fai" -params.vep_data = "s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz" +params.vep_tarball = "s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz" // vcf2maf 
parameters params.maf_center = "Sage Bionetworks" From 9fd3071ee9dc7d59566fca3df2a3269368f940b9 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 17:37:01 -0700 Subject: [PATCH 17/45] Add missing dependency for Tower --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 999d4ac..0c9f16a 100644 --- a/main.nf +++ b/main.nf @@ -27,7 +27,7 @@ process SYNAPSE_GET { process EXTRACT_TAR_GZ { - container "sagebionetworks/vcf2maf:107.1" + container "sagebionetworks/vcf2maf:107.2" input: path vep_tarball From b2efbb7c96c5a1f6ff4914cba9d168d3a5fdfb7e Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 20:43:33 -0700 Subject: [PATCH 18/45] Add missing dependency for Tower (take 2) --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0c9f16a..6f64dc7 100644 --- a/main.nf +++ b/main.nf @@ -50,7 +50,7 @@ process VCF2MAF { tag "${meta.synapse_id}" - container "sagebionetworks/vcf2maf:107.1" + container "sagebionetworks/vcf2maf:107.2" cpus 8 memory '16.GB' From 238d41e69172391d99bf7a5058096be0d21dd168 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:07:06 -0700 Subject: [PATCH 19/45] Quote all files in commands --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 6f64dc7..7596e4f 100644 --- a/main.nf +++ b/main.nf @@ -102,7 +102,7 @@ process FILTER_MAF { script: """ - filter_maf.py ${input_maf} ${input_maf.baseName}.passed.maf + filter_maf.py '${input_maf}' '${input_maf.baseName}.passed.maf' """ } @@ -124,8 +124,8 @@ process MERGE_MAFS { script: """ merge_mafs.py \ - -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ - -i ${input_mafs.join(',')} + -o '${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf' \ + -i '${input_mafs.join(',')}' """ } From 44332dfc1f5034852e2d3f22beb5ed9044ad51c8 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:54:49 -0700 Subject: [PATCH 20/45] Remove `head` command --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 7596e4f..f9fbda2 100644 --- a/main.nf +++ b/main.nf @@ -69,9 +69,9 @@ process VCF2MAF { vep_path = "/root/miniconda3/envs/vep/bin" """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + zcat '${input_vcf}' > 'intermediate.vcf' else - cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + cat '${input_vcf}' > 'intermediate.vcf' fi vcf2maf.pl \ From 580cce223f9b948a0a4c76c2fece3a04b3764ef7 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 06:35:29 -0700 Subject: [PATCH 21/45] Optimize vcf2maf resources --- main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index f9fbda2..d23987a 100644 --- a/main.nf +++ b/main.nf @@ -52,8 +52,8 @@ process VCF2MAF { container "sagebionetworks/vcf2maf:107.2" - cpus 8 - memory '16.GB' + cpus 6 + memory 8.GB * task.attempt afterScript "rm -f intermediate*" @@ -67,6 +67,7 @@ process VCF2MAF { script: vep_path = "/root/miniconda3/envs/vep/bin" + vep_forks = task.cpus + 2 """ if [[ ${input_vcf} == *.gz ]]; then zcat '${input_vcf}' > 'intermediate.vcf' @@ -79,7 +80,7 @@ process VCF2MAF { --ref-fasta '${reference_fasta}' --vep-data '${vep_data}/' \ --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ - --tumor-id 
'${meta.biospecimen_id}' --vep-forks '${task.cpus}' \ + --tumor-id '${meta.biospecimen_id}' --vep-forks '${vep_forks}' \ --species ${params.species} grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' From 8b3700a279dc6c77f64779811308bdacff39c27a Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 06:40:28 -0700 Subject: [PATCH 22/45] Add missing command quotes --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index d23987a..19fadda 100644 --- a/main.nf +++ b/main.nf @@ -81,7 +81,7 @@ process VCF2MAF { --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ --tumor-id '${meta.biospecimen_id}' --vep-forks '${vep_forks}' \ - --species ${params.species} + --species '${params.species}' grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ From 878b519057f8269e90d5aa38d4762a580c78148c Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 07:04:55 -0700 Subject: [PATCH 23/45] Polish workflow and add comments --- main.nf | 77 ++++++++++++++++++++++++++++++------------------- nextflow.config | 8 ----- 2 files changed, 48 insertions(+), 37 deletions(-) diff --git a/main.nf b/main.nf index 19fadda..85db611 100644 --- a/main.nf +++ b/main.nf @@ -3,6 +3,7 @@ nextflow.enable.dsl=2 +// Process for downloading files from Synapse process SYNAPSE_GET { tag "${meta.synapse_id}" @@ -25,6 +26,7 @@ process SYNAPSE_GET { } +// Process for decompressing and extracting the VEP cache tarball process EXTRACT_TAR_GZ { container "sagebionetworks/vcf2maf:107.2" @@ -44,8 +46,7 @@ process EXTRACT_TAR_GZ { } -// TODO: Handle VCF genotype columns per variant caller -// TODO: Improve handling of vep_path +// Process for annotating VCF file and converting to MAF process VCF2MAF { tag "${meta.synapse_id}" @@ -65,14 +66,16 @@ process VCF2MAF { output: tuple val(meta), path("*.maf") + // TODO: Remove hard-coded VEP path + // TODO: Handle VCF genotype columns per variant caller script: vep_path = "/root/miniconda3/envs/vep/bin" vep_forks = task.cpus + 2 """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' > 'intermediate.vcf' + zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' else - cat '${input_vcf}' > 'intermediate.vcf' + cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' fi vcf2maf.pl \ @@ -89,6 +92,7 @@ process VCF2MAF { } +// Process for filtering MAF files for passed variants process FILTER_MAF { tag "${input_maf.name}" @@ -109,7 +113,7 @@ process FILTER_MAF { } -// TODO: Sanity check output +// Process for merging study MAF files process MERGE_MAFS { tag "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}" @@ -131,6 +135,7 @@ process MERGE_MAFS { } +// Process for uploading files to Synapse process SYNAPSE_STORE { tag "${parent_id}/${input.name}" @@ -150,76 +155,75 @@ process SYNAPSE_STORE { } -// TODO: Add comments +// Workflow for generating sample-level MAF files workflow SAMPLE_MAFS { take: sample_vcfs main: - reference_fasta_pair = [ - params.reference_fasta, params.reference_fasta_fai - ] + // Pair up FASTA and FAI reference files + ref_fasta_pair = [params.reference_fasta, params.reference_fasta_fai] + // Download VCF files from Synapse SYNAPSE_GET(sample_vcfs) + // Decompress and extract VEP cache tarball EXTRACT_TAR_GZ(params.vep_tarball) + // Run vcf2maf on each vcf file VCF2MAF(SYNAPSE_GET.out, 
reference_fasta_pair, EXTRACT_TAR_GZ.out)
 
+    // Upload MAF files to Synapse
     sample_mafs_ch = VCF2MAF.out
         .map { meta, maf -> [ maf, meta.sample_parent_id ] }
-
     SYNAPSE_STORE(sample_mafs_ch)
-
-    FILTER_MAF(VCF2MAF.out)
 
     emit:
-    FILTER_MAF.out
+    VCF2MAF.out
 
 }
 
 
-// TODO: Add comments
-workflow MERGED_MAFS {
+// Workflow for generating study-level MAF files
+workflow STUDY_MAFS {
 
     take:
     sample_mafs
 
     main:
-    merged_inputs_ch = sample_mafs
+    // Filter MAF files for passed variants
+    FILTER_MAF(sample_mafs)
+
+    // Group MAF files by study and merge
+    mafs_by_study_ch = FILTER_MAF.out
         .filter { meta, maf -> meta.is_releasable }
-        .map {
-            vcf_meta, maf ->
-            def study_meta = [:]
-            study_meta.merged_parent_id = vcf_meta.merged_parent_id
-            study_meta.study_id = vcf_meta.study_id
-            study_meta.variant_class = vcf_meta.variant_class
-            study_meta.variant_caller = vcf_meta.variant_caller
-            [ study_meta, maf ]
-        }
+        .map { meta, maf -> subset_study_meta(meta, maf) }
         .groupTuple( by: 0 )
-
-    MERGE_MAFS(merged_inputs_ch)
+    MERGE_MAFS(mafs_by_study_ch)
 
+    // Upload study MAF files to Synapse
     merged_mafs_ch = MERGE_MAFS.out
         .map { meta, maf -> [ maf, meta.merged_parent_id ] }
-
     SYNAPSE_STORE(merged_mafs_ch)
 
 }
 
 
+// Entrypoint workflow
 workflow {
 
+    // Parse input CSV sample sheet
     input_vcfs_ch = Channel
         .fromPath ( params.input )
         .splitCsv ( header:true, sep:',' )
         .map { create_vcf_channel(it) }
 
+    // Process individual sample VCF files
     SAMPLE_MAFS(input_vcfs_ch)
 
-    MERGED_MAFS(SAMPLE_MAFS.out)
+    // Filter and merge MAF files by study
+    STUDY_MAFS(SAMPLE_MAFS.out)
 
 }
+
+
+// Function to get list of [ study_meta, maf ]
+def subset_study_meta(vcf_meta, maf) {
+
+    // Subset metadata element
+    def study_meta = [:]
+    study_meta.merged_parent_id = vcf_meta.merged_parent_id
+    study_meta.study_id = vcf_meta.study_id
+    study_meta.variant_class = vcf_meta.variant_class
+    study_meta.variant_caller = vcf_meta.variant_caller
+
+    return [ study_meta, maf ]
+}
diff --git a/nextflow.config b/nextflow.config
index 1b7503e..6a39361 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,11 +1,3 @@
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    nf-core/rnaseq Nextflow config file
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Default config options for all compute environments
-----------------------------------------------------------------------------------------
-*/
-
 // Input parameters
 params.input = null

From 13fcb6b8dfff73b83e566cec98fba3691dd00326 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 07:50:22 -0700
Subject: [PATCH 24/45] Fix variable name mismatch

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 85db611..9bc2f6e 100644
--- a/main.nf
+++ b/main.nf
@@ -172,7 +172,7 @@ workflow SAMPLE_MAFS {
     EXTRACT_TAR_GZ(params.vep_tarball)
 
     // Run vcf2maf on each vcf file
-    VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, EXTRACT_TAR_GZ.out)
+    VCF2MAF(SYNAPSE_GET.out, ref_fasta_pair, EXTRACT_TAR_GZ.out)
 
     sample_mafs_ch = VCF2MAF.out
         .map { meta, maf -> [ maf, meta.sample_parent_id ] }

From 64d548c8d0d474012d04d46b4301f55049b3db78 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 08:09:27 -0700
Subject: [PATCH 25/45] Enable CSV value stripping

---
 main.nf | 2
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9bc2f6e..3d49b17 100644 --- a/main.nf +++ b/main.nf @@ -216,7 +216,7 @@ workflow { // Parse input CSV sample sheet input_vcfs_ch = Channel .fromPath ( params.input ) - .splitCsv ( header:true, sep:',' ) + .splitCsv ( header:true, strip:true ) .map { create_vcf_channel(it) } // Process individual sample VCF files From 13e7bd166cc886da3d49e9232fb2ad1aaae0d43b Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 08:12:35 -0700 Subject: [PATCH 26/45] Add missing closure for `memory` --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 3d49b17..267afb1 100644 --- a/main.nf +++ b/main.nf @@ -54,7 +54,8 @@ process VCF2MAF { container "sagebionetworks/vcf2maf:107.2" cpus 6 - memory 8.GB * task.attempt + memory { 8.GB * task.attempt } + maxRetries 3 afterScript "rm -f intermediate*" From c0c927ada884e0f6779c0a015321ae50bd1082aa Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 09:29:30 -0700 Subject: [PATCH 27/45] Fix quotes Spaces in inputs are automatically escaped, but not in file attributes (like `baseName`). So, I removed the quotes from the former and kept them for the latter. --- main.nf | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index 267afb1..30b2a29 100644 --- a/main.nf +++ b/main.nf @@ -20,7 +20,7 @@ process SYNAPSE_GET { script: """ - synapse get '${synapse_id}' + synapse get ${synapse_id} """ } @@ -39,8 +39,8 @@ process EXTRACT_TAR_GZ { script: """ - mkdir -p 'vep_data/' - tar -zxf '${vep_tarball}' -C 'vep_data/' + mkdir -p vep_data/ + tar -zxf ${vep_tarball} -C vep_data/ """ } @@ -74,20 +74,20 @@ process VCF2MAF { vep_forks = task.cpus + 2 """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + zcat ${input_vcf} | head -n 10000 > intermediate.vcf else - cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + cat ${input_vcf} | head -n 10000 > intermediate.vcf fi vcf2maf.pl \ - --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ - --ref-fasta '${reference_fasta}' --vep-data '${vep_data}/' \ - --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ - --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ - --tumor-id '${meta.biospecimen_id}' --vep-forks '${vep_forks}' \ - --species '${params.species}' - - grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' + --input-vcf intermediate.vcf --output-maf intermediate.maf.raw \ + --ref-fasta ${reference_fasta} --vep-data ${vep_data}/ \ + --ncbi-build ${params.ncbi_build} --max-subpop-af ${params.max_subpop_af} \ + --vep-path ${vep_path} --maf-center ${params.maf_center} \ + --tumor-id '${meta.biospecimen_id}' --vep-forks ${vep_forks} \ + --species ${params.species} + + grep -v '^#' intermediate.maf.raw > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ } @@ -108,7 +108,7 @@ process FILTER_MAF { script: """ - filter_maf.py '${input_maf}' '${input_maf.baseName}.passed.maf' + filter_maf.py ${input_maf} '${input_maf.baseName}.passed.maf' """ } @@ -128,10 +128,9 @@ process MERGE_MAFS { tuple val(meta), path("*.merged.maf") script: + prefix = "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}" """ - merge_mafs.py \ - -o '${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf' \ - -i 
'${input_mafs.join(',')}'
+    merge_mafs.py -d './' -o '${prefix}.merged.maf'
     """
 }

 process SYNAPSE_STORE {

     script:
     """
-    synapse store --parentId '${parent_id}' '${input}'
+    synapse store --parentId ${parent_id} ${input}
     """

 }

From 20ba11a73daa1cc8a3aacee4376fe69c9dfbbdf1 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:08:02 -0700
Subject: [PATCH 28/45] Create detailed README

---
 README.md | 240 +++++++++++++++++++++++++++++------------------------
 1 file changed, 129 insertions(+), 111 deletions(-)

diff --git a/README.md b/README.md
index 735fe4c..4812877 100644
--- a/README.md
+++ b/README.md
@@ -1,163 +1,181 @@
-# nf-vcf2maf
+# nf-vcf2maf: Annotate Variants (VCF) and Generate MAF Files

## Purpose

The purpose of this Nextflow workflow is to annotate variants in VCF files using the Ensembl Variant Effect Predictor (VEP) and convert the annotated VCF files into the [Mutation Annotation Format](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) (MAF). Unlike VEP-annotated VCF files, MAF files are generally more useful for downstream applications given their tabular nature. For example, you can easily load them into R (_e.g._ with the [`maftools`](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) package) and/or derive the input files for [cBioPortal](https://www.cbioportal.org/visualize).

**Important:** Please read the [limitations](#known-limitations) listed below.

This repository leverages a [fork of vcf2maf](https://github.com/Sage-Bionetworks-Workflows/vcf2maf) and a [custom container image](https://github.com/Sage-Bionetworks-Workflows/vcf2maf-docker).

## Quickstart

1. Prepare a CSV samplesheet according to the [format](#samplesheet) described below.
From 20ba11a73daa1cc8a3aacee4376fe69c9dfbbdf1 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:08:02 -0700
Subject: [PATCH 28/45] Create detailed README

---
 README.md | 240 +++++++++++++++++++++++++++++-------------------
 1 file changed, 129 insertions(+), 111 deletions(-)

diff --git a/README.md b/README.md
index 735fe4c..4812877 100644
--- a/README.md
+++ b/README.md
@@ -1,163 +1,181 @@
-# nf-vcf2maf
+# nf-vcf2maf: Annotate Variants (VCF) and Generate MAF Files
-Annotate VCF files using VEP and generate MAF files using vcf2maf
+## Purpose
-## Setup
+The purpose of this Nextflow workflow is to annotate variants in VCF files using the Ensembl Variant Effect Predictor (VEP) and convert the annotated VCF file into the [Mutation Annotation Format](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) (MAF). Unlike VEP-annotated VCF files, MAF files are generally more useful for downstream applications given their tabular nature. For example, you can easily load them into R (_e.g._ with the [`maftools`](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) package) and/or derive the input files for [cBioPortal](https://www.cbioportal.org/visualize).
-```console
-# Install tmux for long-running commands
-sudo yum install -y tmux
+**Important:** Please read the [limitations](#known-limitations) listed below.
-# Download Ensembl VEP cache (expands to 22G)
-mkdir -p $HOME/ref/ $HOME/.vep/
-rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz $HOME/ref/
-tar -zvxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
+This repository leverages a [fork of vcf2maf](https://github.com/Sage-Bionetworks-Workflows/vcf2maf) and a [custom container image](https://github.com/Sage-Bionetworks-Workflows/vcf2maf-docker).
-# Download reference genome FASTA file
-mkdir -p $HOME/ref/fasta/
-aws --no-sign-request s3 sync s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/ $HOME/ref/fasta/
+## Quickstart
-# Get example VCF file for testing purposes
-mkdir -p $HOME/data
-echo "Log into AWS using credentials from JumpCloud"
-aws s3 cp s3://ntap-add5-project-tower-bucket/outputs/sarek_syn29793238/VariantCalling/2-001_Plexiform_Neurofibroma/DeepVariant/2-001_Plexiform_Neurofibroma.vcf.gz ~/data/test.vcf.gz
-gunzip ~/data/test.vcf.gz
-head -n 4000 ~/data/test.vcf ~/data/test-small.vcf
+1. Prepare a CSV samplesheet according to the [format](#samplesheet) described below.
-# Install and setup Nextflow
-sudo yum install -y java
-(cd .local/bin && wget -qO- https://get.nextflow.io | bash)
-echo 'export NXF_ENABLE_SECRETS=true' >> ~/.bashrc
-source ~/.bashrc
-nextflow secrets put -n SYNAPSE_AUTH_TOKEN -v ""
+   **Example:** Stored locally as `./samplesheet.csv`
-# Stage reference files in memory
-mkdir -p /dev/shm/vep/ /dev/shm/fasta/
-sudo mount -o remount,size=30G /dev/shm # Increase /dev/shm size
-rsync -avhP $HOME/.vep/ /dev/shm/vep/
-rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/
-```
+   ```csv
+   synapse_id ,sample_parent_id ,merged_parent_id ,study_id ,variant_class ,variant_caller ,is_releasable
+   syn87654301 ,syn87654311 ,syn87654321 ,study_x ,germline ,deepvariant ,true
+   syn87654302 ,syn87654311 ,syn87654321 ,study_x ,germline ,deepvariant ,false
+   syn87654303 ,syn87654311 ,syn87654321 ,study_x ,germline ,deepvariant ,true
+   syn87654304 ,syn87654312 ,syn87654321 ,study_x ,germline ,mutect2 ,false
+   syn87654305 ,syn87654312 ,syn87654321 ,study_x ,germline ,mutect2 ,false
+   syn87654306 ,syn87654312 ,syn87654321 ,study_x ,germline ,mutect2 ,false
+   syn87654307 ,syn87654313 ,syn87654322 ,study_y ,germline ,deepvariant ,true
+   syn36245848 ,syn87654313 ,syn87654322 ,study_y ,germline ,deepvariant ,true
+   ```
-## Benchmarks
+2. Create a Nextflow secret called `SYNAPSE_AUTH_TOKEN` with a Synapse personal access token ([docs](#authentication)).
-### Ensembl VEP cache
+3. Prepare your parameters file. For more details, check out the [Parameters](#parameters) section. Only the `input` parameter is required.
-I want to see what's the fastest way of downloading the VEP cache. Based on the tests below, here are some options:
+   **Example:** Stored locally as `./params.yml`
-- Download tarball from Ensembl and extract: 16m34s (10m23s + 6m11s)
-- Download tarball from S3 and extract: 9m25s (3m14 + 6m11s)
-- Download extracted tarball from S3: 4m5
+   ```yaml
+   input: "./samplesheet.csv"
+   maf_center: "Sage Bionetworks"
+   max_subpop_af: 0.0005
+   ```
-#### Using rsync with Ensembl's servers
+4. Launch the workflow using the [Nextflow CLI](https://nextflow.io/docs/latest/cli.html#run), the [Tower CLI](https://help.tower.nf/latest/cli/), or the [Tower web UI](https://help.tower.nf/latest/launch/launchpad/).
-```console
-$ time rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./
+   **Example:** Launched using the Nextflow CLI with Docker enabled
-real 10m23.648s
-user 0m50.824s
-sys 0m44.770s
-```
+   ```console
+   nextflow run sage-bionetworks-workflows/nf-vcf2maf -params-file ./params.yml -profile docker
+   ```
-#### Using S3 with our public bucket
+5. Explore the MAF files uploaded to Synapse (using the parent IDs listed in the samplesheet).
-```console
-$ time aws --no-sign-request s3 cp s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./s3-homo_sapiens_vep_107_GRCh38.tar.gz
+## Authentication
-real 3m14.885s
-user 3m31.085s
-sys 4m1.697s
-```
+This workflow takes care of transferring files to and from Synapse. Hence, it requires a secret with a personal access token for authentication. To configure Nextflow with such a token, follow these steps:
-#### Extracting the tarball
+1. Generate a personal access token (PAT) on Synapse using [this dashboard](https://www.synapse.org/#!PersonalAccessTokens:). Make sure to enable the `view`, `download`, and `modify` scopes since this workflow both downloads and uploads to Synapse.
+2. Create a secret called `SYNAPSE_AUTH_TOKEN` containing a Synapse personal access token using the [Nextflow CLI](https://nextflow.io/docs/latest/secrets.html) or [Nextflow Tower](https://help.tower.nf/latest/secrets/overview/).
+3. (Tower only) When launching the workflow, include the `SYNAPSE_AUTH_TOKEN` as a pipeline secret from either your user or workspace secrets.
-```console
-$ time tar -zxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
+## Parameters
-real 6m11.087s
-user 2m59.190s
-sys 0m40.660s
-```
+Check out the [Quickstart](#quickstart) section for example parameter values. You are encouraged to read the [limitations](#known-limitations) listed below because some parameters have not been tested with non-default values.
-#### Downloading the extracted tarball
+- **`input`**: (Required) A CSV samplesheet that lists the VCF files that should be processed. See below for the [samplesheet format](#samplesheet).
+- **`max_subpop_af`**: Threshold used by vcf2maf for labeling variants with the `common_variant` filter. Specifically, the `common_variant` filter is applied to variants with an allele frequency of at least `max_subpop_af` in one or more gnomAD sub-populations ([source](https://github.com/mskcc/vcf2maf/blob/5ed414428046e71833f454d4b64da6c30362a89b/docs/vep_maf_readme.txt#L116-L120)). This filter is useful for removing false-positive somatic variants. The merged MAF files lack these common variants. Default: `0.0005`.
+- **`maf_center`**: Value used in the `Center` MAF column. Default: `"Sage Bionetworks"`.
+- **`reference_fasta`**: Reference genome FASTA file used in variant calling. Default: `"s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta"`.
+- **`reference_fasta_fai`**: Reference genome FASTA index (FAI) file. This shouldn't be needed in most cases since the workflow will automatically pick up the `.fai` file alongside the `.fasta` file. Default: `"${reference_fasta}.fai"`.
+- **`vep_tarball`**: A tarball (ideally compressed) of the VEP cache. Default: `"s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz"`.
+- **`ncbi_build`**: The NCBI genome build. Passed to `--assembly` in VEP ([source](http://Jul2022.archive.ensembl.org/info/docs/tools/vep/script/vep_options.html)). Default: `"GRCh38"`.
+- **`species`**: The species identifier. Passed to `--species` in VEP ([source](http://Jul2022.archive.ensembl.org/info/docs/tools/vep/script/vep_options.html)). Default: `"homo_sapiens"`.
+## Inputs
-```console
-time aws --no-sign-request s3 sync s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/ ./s3-107_GRCh38/
+### Samplesheet
-real 4m5.307s
-user 5m13.736s
-sys 3m24.302s
-```
+The input samplesheet should be in comma-separated values (CSV) format and contain the following columns:
-### Nextflow workflow runs
+1. **`synapse_id`**: Synapse ID of the VCF file
-#### Original (all reference files in `/dev/shm`)
+2. **`biospecimen_id`**: Biospecimen/sample identifier
+   - This value will be used to populate the `Tumor_Sample_Barcode` MAF column
+   - **Important:** This value needs to uniquely identify samples within each merged MAF file. See [below](#maf-files) for information on how MAF files are merged.
+3. **`sample_parent_id`**: Synapse ID of the folder where the individual sample MAF file will be uploaded
+   - Suggestion: The folder that contains the VCF file
+4. **`merged_parent_id`**: The Synapse ID of the folder where the merged MAF file will be uploaded
+   - Suggestion: The root folder containing the VCF files
+   - **Important:** This value should be consistent across VCF files that are expected to be merged. Otherwise, you will end up with artificially split merged MAF files. See [below](#maf-files) for information on how MAF files are merged.
+5. **`study_id`**: Study identifier
+   - Suggestion: The Synapse ID of the project representing the study if you don’t have shorthand study IDs
+6. **`variant_class`**: Whether the VCF file contains somatic or germline mutations
+   - **Valid values:** `somatic` or `germline`
+7. **`variant_caller`**: Name of the variant caller
+8. **`is_releasable`**: Whether the VCF file should be included in the merged MAF file
+   - **Valid values:** `true` or `false`
+## Outputs
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv
+### MAF Files
-real 3m43.240s
-user 0m29.439s
-sys 0m1.415s
-```
+- Individual sample MAF files
+  - Unfiltered (_i.e._ includes all variants regardless of their FILTER status, including those that weren’t deemed high-confidence by the variant caller)
+  - File naming: `${biospecimen_id}-${variant_class}-${variant_caller}.maf`
-#### VEP in shm and FASTA in S3
+- Merged study MAF files (one for every combination of `study_id`, `variant_class` and `variant_caller`)
+  - Filtered (_i.e._ restricted to “releasable” samples and variants where `FILTER == 'PASS'`, which excludes those flagged `common_variant` due to `any(gnomAD_*_AF) >= 0.0005`)
+  - File naming: `${study_id}-${variant_class}-${variant_caller}.merged.maf`
-Note that I needed to set `aws.client.anonymous = true` in the Nextflow config.
+## Known Limitations
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta
+- This workflow has only been tested with the following parameters:
+  - `vep_tarball`: Ensembl VEP 107
+  - `species`: `homo_sapiens`
+  - `ncbi_build`: `GRCh38`
+  - `reference_fasta`: GATK FASTA file
-real 4m9.529s
-user 0m56.803s
-sys 0m5.996s
-```
+## Benchmarks
-#### VEP in non-shm and FASTA in S3
+### Setup
-I'm curious about the difference in performance between SHM and non-SHM.
+The benchmarks below were performed on an EC2 instance with the following setup:
+```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/.vep/homo_sapiens/107_GRCh38/
+# Install tmux for long-running commands
+sudo yum install -y tmux
-real 3m43.073s
-user 0m27.599s
-sys 0m1.449s
-```
+# Install and setup Nextflow
+sudo yum install -y java
+(cd .local/bin && wget -qO- https://get.nextflow.io | bash)
+echo 'export NXF_ENABLE_SECRETS=true' >> ~/.bashrc
+source ~/.bashrc
+nextflow secrets put -n SYNAPSE_AUTH_TOKEN -v ""
+mkdir -p $HOME/.nextflow/
+echo 'aws.client.anonymous = true' >> $HOME/.nextflow/config
-#### VEP folder and FASTA in S3
+# Download and extract Ensembl VEP cache
+mkdir -p $HOME/ref/ $HOME/.vep/
+rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz $HOME/ref/
+tar -zvxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
-I had to kill this job because there was barely any CPU being used.
-It seemed stuck on staging the `107_GRCh38/` folder. Maybe Nextflow is more efficient with staging files over folders (compared to the AWS CLI).
+# Download reference genome FASTA file
+mkdir -p $HOME/ref/fasta/
+aws --no-sign-request s3 sync s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/ $HOME/ref/fasta/
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/
+# Stage reference files in memory
+mkdir -p /dev/shm/vep/ /dev/shm/fasta/
+sudo mount -o remount,size=25G /dev/shm # Increase /dev/shm size
+rsync -avhP $HOME/.vep/ /dev/shm/vep/
+rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/
+```
-^C
-real 17m7.522s
-user 2m28.423s
-sys 0m38.765s
-```
+### Preparing the Ensembl VEP cache
-#### VEP tarball in non-shm and FASTA in S3
+#### Outside of Nextflow
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/ref/homo_sapiens_vep_107_GRCh38.tar.gz
+To determine the most efficient way of preparing the VEP cache for vcf2maf, I tried different permutations of downloading the tarball or extracted folder from Ensembl or S3. Here are the individual results:
-real 8m39.798s
-user 0m29.715s
-sys 0m1.455s
-```
+- **Download tarball using rsync from Ensembl:** 10 min 23 sec
+- **Download tarball using AWS CLI from S3:** 3 min 14 sec
+- **Extract tarball using `tar` locally:** 6 min 11 sec
+- **Download extracted folder using AWS CLI from S3:** 4 min 5 sec
+Based on the above results, here are some estimated runtimes:
-#### VEP tarball and FASTA in S3
+- **Download tarball from Ensembl and extract locally:** 16 min 34 sec
+- **Download tarball from S3 and extract locally:** 9 min 25 sec
+- **Download extracted folder from S3:** 4 min 5 sec
+Based on the above estimates, downloading the extracted folder from S3 seems like the most efficient method, followed by downloading the tarball from S3 and extracting locally.
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz
+#### Within Nextflow
+After benchmarking different methods for downloading the VEP cache, I performed various tests within a Nextflow run. Note that SHM refers to files being in shared memory (_i.e._ `/dev/shm`).
-real 8m38.703s
-user 2m47.076s
-sys 0m50.287s
-```
+- **Baseline (all reference files in SHM):** 3 min 43 sec
+- **FASTA in S3 and VEP folder in SHM:** 4 min 9 sec
+- **FASTA in S3 and VEP folder in non-SHM:** 3 min 43 sec
+- **FASTA in S3 and VEP folder in S3:** Over 17 min 7 sec[^1]
+- **FASTA in S3 and VEP tarball in non-SHM:** 8 min 39 sec
+- **FASTA and VEP tarball in S3:** 8 min 38 sec
+The above results showed that downloading the tarball from S3 was the most efficient of the portable methods. While ~10 minutes is a long time to spend on preparing reference files, it's trivial compared to the actual runtimes of vcf2maf, which can reach 4-5 hours. The benefit is portability, including the ability to run this workflow on Tower.
+[^1]: While this was expected to be the most efficient method of downloading the VEP cache, I had to kill the job because it was taking so long. Perhaps the AWS Java SDK isn't as efficient as the AWS CLI for downloading a folder in S3 recursively.
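[Editor's note: the whitespace-padded samplesheet in the Quickstart above works because of the `strip:true` option added to `.splitCsv` in an earlier patch, which trims each value before `create_vcf_channel` builds the metadata map. A minimal sketch of that parsing path, with a simplified metadata map (the real function sets more fields):]

```nextflow
// Sketch: strip:true trims the padding, so "syn87654301 " arrives as
// "syn87654301"; the map step mirrors (in simplified form) what
// create_vcf_channel does with each row.
workflow {
    Channel
        .fromPath( 'samplesheet.csv' )        // the Quickstart example file
        .splitCsv( header:true, strip:true )
        .map { row ->
            def meta = [:]                    // simplified metadata map
            meta.synapse_id    = row.synapse_id
            meta.study_id      = row.study_id
            meta.is_releasable = row.is_releasable.toBoolean()
            return meta
        }
        .view()
}
```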
From ec141690f1d24d7454c529d178381176de2149b5 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:09:44 -0700
Subject: [PATCH 29/45] Fix typo in `subset_study_meta()`

---
 main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 30b2a29..6bc0052 100644
--- a/main.nf
+++ b/main.nf
@@ -258,7 +258,6 @@ def subset_study_meta(vcf_meta, maf) {
    study_meta.study_id = vcf_meta.study_id
    study_meta.variant_class = vcf_meta.variant_class
    study_meta.variant_caller = vcf_meta.variant_caller
-    [ study_meta, maf ]
-    return vcf_meta
+    return [ study_meta, maf ]

 }
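[Editor's note: the bug fixed above silently returned the per-sample metadata instead of the `[ study_meta, maf ]` pair. That pair matters because merged MAFs are produced per combination of `study_id`, `variant_class`, and `variant_caller`, so the subset map acts as the grouping key. A self-contained sketch of how the fixed function plugs into grouping — the `groupTuple` wiring and sample values are assumptions, not shown in this patch:]

```nextflow
// Sketch: Groovy maps with equal contents compare equal, so the subset
// metadata works as a grouping key for groupTuple.
def subset_study_meta_sketch(vcf_meta, maf) {
    def study_meta = [:]
    study_meta.merged_parent_id = vcf_meta.merged_parent_id
    study_meta.study_id         = vcf_meta.study_id
    study_meta.variant_class    = vcf_meta.variant_class
    study_meta.variant_caller   = vcf_meta.variant_caller
    return [ study_meta, maf ]
}

workflow {
    Channel
        .of( [ [ merged_parent_id: 'syn1', study_id: 'study_x', variant_class: 'germline', variant_caller: 'deepvariant', biospecimen_id: 's1' ], 'a.maf' ],
             [ [ merged_parent_id: 'syn1', study_id: 'study_x', variant_class: 'germline', variant_caller: 'deepvariant', biospecimen_id: 's2' ], 'b.maf' ] )
        .map { meta, maf -> subset_study_meta_sketch(meta, maf) }
        .groupTuple()   // -> [ study_meta, [a.maf, b.maf] ]
        .view()
}
```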
From c84cbbe180c5c5bde2e24f6701ec50a863c4b0f5 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:11:45 -0700
Subject: [PATCH 30/45] Shorten README title

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4812877..468423f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# nf-vcf2maf: Annotate Variants (VCF) and Generate MAF Files
+# nf-vcf2maf: Annotate VCF Files and Generate MAF Files

 ## Purpose

From ae3c0139f2771ec3a4fe21486261c890373931bf Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:47:19 -0700
Subject: [PATCH 31/45] Revert to comma-sep list of MAF files

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 6bc0052..784c2a7 100644
--- a/main.nf
+++ b/main.nf
@@ -130,7 +130,7 @@ process MERGE_MAFS {
    script:
    prefix = "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}"
    """
-    merge_mafs.py -d './' -o '${prefix}.merged.maf'
+    merge_mafs.py -i ${input_mafs.join(',')} -o '${prefix}.merged.maf'
    """

 }

From 13fd89b7da2f38dbd7dd5326dd2fc4373cb5e37f Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 11:00:49 -0700
Subject: [PATCH 32/45] Add quotes for input MAF files for `merge_mafs.py`

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 784c2a7..900a1be 100644
--- a/main.nf
+++ b/main.nf
@@ -130,7 +130,7 @@ process MERGE_MAFS {
    script:
    prefix = "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}"
    """
-    merge_mafs.py -i ${input_mafs.join(',')} -o '${prefix}.merged.maf'
+    merge_mafs.py -i '${input_mafs.join(',')}' -o '${prefix}.merged.maf'
    """

 }
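[Editor's note: `input_mafs` here is a collection of staged paths, so `input_mafs.join(',')` renders a single comma-separated string, and the quotes added in the second patch make the shell pass that string to `merge_mafs.py` as one argument even if a stray space survives. A toy illustration of what the interpolation produces — file names are hypothetical:]

```nextflow
// Sketch: what ${input_mafs.join(',')} expands to inside a script block.
workflow {
    def input_mafs = [ 'sampleA.maf', 'sampleB.maf', 'sampleC.maf' ]
    def cmd = "merge_mafs.py -i '${input_mafs.join(',')}' -o 'study_x.merged.maf'"
    // Prints: merge_mafs.py -i 'sampleA.maf,sampleB.maf,sampleC.maf' -o 'study_x.merged.maf'
    println cmd
}
```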
From d574ee3bbe76c5c87cf2842fc413d50f3cf646ea Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 11:53:53 -0700
Subject: [PATCH 33/45] Remove `head` filtering

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 900a1be..ffb9ec8 100644
--- a/main.nf
+++ b/main.nf
@@ -74,9 +74,9 @@ process VCF2MAF {
    vep_forks = task.cpus + 2
    """
    if [[ ${input_vcf} == *.gz ]]; then
-        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
+        zcat ${input_vcf} > intermediate.vcf
    else
-        cat ${input_vcf} | head -n 10000 > intermediate.vcf
+        cat ${input_vcf} > intermediate.vcf
    fi

From e802174283299d05dd4f0ca3294f08d5a20d7461 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 12:28:24 -0700
Subject: [PATCH 34/45] Address caching issue due to spaces in file names

---
 README.md | 3 ++-
 main.nf   | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 468423f..b9889b3 100644
--- a/README.md
+++ b/README.md
@@ -73,9 +73,10 @@ Check out the [Quickstart](#quickstart) section for example parameter values. Yo

 ### Samplesheet

-The input samplesheet should be in comma-separated values (CSV) format and contain the following columns:
+The input samplesheet should be in comma-separated values (CSV) format and contain the following columns. **You should avoid using spaces or special characters in any of the columns.** Otherwise, you might run into job caching issues.

 1. **`synapse_id`**: Synapse ID of the VCF file
+   - Make sure that the Synapse account associated with the personal access token has access to all listed VCF files
 2. **`biospecimen_id`**: Biospecimen/sample identifier
    - This value will be used to populate the `Tumor_Sample_Barcode` MAF column
    - **Important:** This value needs to uniquely identify samples within each merged MAF file. See [below](#maf-files) for information on how MAF files are merged.

diff --git a/main.nf b/main.nf
index ffb9ec8..5775fb3 100644
--- a/main.nf
+++ b/main.nf
@@ -21,6 +21,7 @@ process SYNAPSE_GET {
    script:
    """
    synapse get ${synapse_id}
+    for f in *\ *; do mv "\${f}" "\${f// /_}"; done
    """

 }
@@ -74,9 +75,9 @@ process VCF2MAF {
    vep_forks = task.cpus + 2
    """
    if [[ ${input_vcf} == *.gz ]]; then
-        zcat ${input_vcf} > intermediate.vcf
+        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
    else
-        cat ${input_vcf} > intermediate.vcf
+        cat ${input_vcf} | head -n 10000 > intermediate.vcf
    fi

From 191ad6fe9b198c26ed19a54cd343af1abd698408 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 15:17:43 -0700
Subject: [PATCH 35/45] Leave basename intact in sample MAF files

---
 main.nf | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 5775fb3..6b40e52 100644
--- a/main.nf
+++ b/main.nf
@@ -71,8 +71,9 @@ process VCF2MAF {
    // TODO: Remove hard-coded VEP path
    // TODO: Handle VCF genotype columns per variant caller
    script:
-    vep_path = "/root/miniconda3/envs/vep/bin"
+    vep_path  = "/root/miniconda3/envs/vep/bin"
    vep_forks = task.cpus + 2
+    basename  = input_vcf.name.replaceAll(/.gz$/, "").replaceAll(/.vcf$/, "")
    """
    if [[ ${input_vcf} == *.gz ]]; then
        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
@@ -89,7 +90,7 @@ process VCF2MAF {

-    grep -v '^#' intermediate.maf.raw > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf'
+    grep -v '^#' intermediate.maf.raw > '${basename}.maf'
    """

 }

From 57a2076647e85d2fd32ed715b2106fbcd680d14b Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 15:36:01 -0700
Subject: [PATCH 36/45] Fix bug for handling empty glob matches

---
 main.nf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 6b40e52..d9c6f36 100644
--- a/main.nf
+++ b/main.nf
@@ -8,7 +8,7 @@ process SYNAPSE_GET {

    tag "${meta.synapse_id}"

-    container "sagebionetworks/synapsepythonclient:v2.6.0"
+    container "sagebionetworks/synapsepythonclient:v2.7.0"

    secret "SYNAPSE_AUTH_TOKEN"

@@ -21,6 +21,8 @@ process SYNAPSE_GET {
    script:
    """
    synapse get ${synapse_id}
+
+    shopt -s nullglob
    for f in *\ *; do mv "\${f}" "\${f// /_}"; done
    """

 }
From 14325f1ccce5040d7870abeca29e04628ba64103 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 16:28:22 -0700
Subject: [PATCH 37/45] Move away from for-loop in bash

---
 main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index d9c6f36..08f58d9 100644
--- a/main.nf
+++ b/main.nf
@@ -22,8 +22,7 @@ process SYNAPSE_GET {
    """
    synapse get ${synapse_id}

-    shopt -s nullglob
-    for f in *\ *; do mv "\${f}" "\${f// /_}"; done
+    find . -type f -name "* *.xml" -exec bash -c 'mv "$0" "${0// /_}"' {} \;
    """

 }

From 8c369d3b9b6fb80819404de50df7483d8ddb1621 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 16:40:44 -0700
Subject: [PATCH 38/45] Escape dollar signs

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 08f58d9..3235552 100644
--- a/main.nf
+++ b/main.nf
@@ -22,7 +22,7 @@ process SYNAPSE_GET {
    """
    synapse get ${synapse_id}

-    find . -type f -name "* *.xml" -exec bash -c 'mv "$0" "${0// /_}"' {} \;
+    find . -type f -name "* *" -exec bash -c 'mv "\$0" "\${0// /_}"' {} \;
    """

 }

From 285c064fe48d8664c1b1b6df0a4c261ce7a8616b Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:00:45 -0700
Subject: [PATCH 39/45] Revert synapseclient version upgrade

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 3235552..f85887e 100644
--- a/main.nf
+++ b/main.nf
@@ -8,7 +8,7 @@ process SYNAPSE_GET {

    tag "${meta.synapse_id}"

-    container "sagebionetworks/synapsepythonclient:v2.7.0"
+    container "sagebionetworks/synapsepythonclient:v2.6.0"

    secret "SYNAPSE_AUTH_TOKEN"

From 0cb0240ec803dfd8aa9517857ebdb9902c90ba3a Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:04:54 -0700
Subject: [PATCH 40/45] Try `shell:` block instead of `script:`

---
 main.nf | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index f85887e..d709a37 100644
--- a/main.nf
+++ b/main.nf
@@ -18,12 +18,13 @@ process SYNAPSE_GET {
    output:
    tuple val(meta), path('*')

-    script:
-    """
-    synapse get ${synapse_id}
+    shell:
+    '''
+    synapse get !{synapse_id}

-    find . -type f -name "* *" -exec bash -c 'mv "\$0" "\${0// /_}"' {} \;
-    """
+    shopt -s nullglob
+    for f in *\ *; do mv "${f}" "${f// /_}"; done
+    '''

 }

From fde57c9286b836fcff2e3006268b946d14de9519 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:14:55 -0700
Subject: [PATCH 41/45] Try escaping backslash

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index d709a37..43d0a65 100644
--- a/main.nf
+++ b/main.nf
@@ -23,7 +23,7 @@ process SYNAPSE_GET {
    synapse get !{synapse_id}

    shopt -s nullglob
-    for f in *\ *; do mv "${f}" "${f// /_}"; done
+    for f in *\\ *; do mv "${f}" "${f// /_}"; done
    '''

 }

From deaa7f90aa37f3de59ae4725b62137b15994ca89 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:32:22 -0700
Subject: [PATCH 42/45] Try `script:` with backslash escape

---
 main.nf | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 43d0a65..16ec9d0 100644
--- a/main.nf
+++ b/main.nf
@@ -18,13 +18,13 @@ process SYNAPSE_GET {
    output:
    tuple val(meta), path('*')

-    shell:
-    '''
-    synapse get !{synapse_id}
+    script:
+    """
+    synapse get ${synapse_id}

    shopt -s nullglob
-    for f in *\\ *; do mv "${f}" "${f// /_}"; done
-    '''
+    for f in *\\ *; do mv "\${f}" "\${f// /_}"; done
+    """

 }
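[Editor's note on patches 37–42: inside a `script:` block the triple-double-quoted body is a Groovy GString, so a literal bash `$` must be written `\$` and a literal backslash `\\`; a `shell:` block with `'''` sidesteps the `$` collision (Nextflow variables use `!{}` instead), but Groovy still interprets backslash escapes, which is why both variants needed the doubled backslash. A hedged, standalone sketch of the final `script:` form — the process and the `touch` line are illustrative only:]

```nextflow
// Sketch: escaping inside a script block. In the Groovy string, `\\` yields a
// literal backslash and `\$` a literal dollar sign, so bash receives:
//   for f in *\ *; do mv "${f}" "${f// /_}"; done
process RENAME_SPACES {
    output:
    stdout

    script:
    """
    touch 'a file.txt'
    shopt -s nullglob
    for f in *\\ *; do mv "\${f}" "\${f// /_}"; done
    ls
    """
}

workflow {
    RENAME_SPACES() | view
}
```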
From e4b43e018d6dfd083b343856a52ba3ec4bcf6f9a Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:48:52 -0700
Subject: [PATCH 43/45] Give enough memory for the `merge_mafs` job

---
 main.nf | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/main.nf b/main.nf
index 16ec9d0..9b581c6 100644
--- a/main.nf
+++ b/main.nf
@@ -58,6 +58,8 @@ process VCF2MAF {
    cpus 6
    memory { 8.GB * task.attempt }
+
+    errorStrategy = 'retry'
    maxRetries 3

    afterScript "rm -f intermediate*"
@@ -125,6 +127,11 @@ process MERGE_MAFS {

    container "python:3.10.4"

+    memory { 16.GB * task.attempt }
+
+    errorStrategy = 'retry'
+    maxRetries 3
+
    input:
    tuple val(meta), path(input_mafs)

    output:

From 3f093d958283d583b2cd94de9056da7235c5e678 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:49:11 -0700
Subject: [PATCH 44/45] Trim trailing whitespace

---
 main.nf | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/main.nf b/main.nf
index 9b581c6..6e91a7b 100644
--- a/main.nf
+++ b/main.nf
@@ -64,12 +64,12 @@ process VCF2MAF {

    afterScript "rm -f intermediate*"

-    input: 
+    input:
    tuple val(meta), path(input_vcf)
    tuple path(reference_fasta), path(reference_fasta_fai)
    path vep_data

-    output: 
+    output:
    tuple val(meta), path("*.maf")

@@ -189,7 +189,7 @@ workflow SAMPLE_MAFS {
    sample_mafs_ch = VCF2MAF.out
        .map { meta, maf -> [ maf, meta.sample_parent_id ] }
    SYNAPSE_STORE(sample_mafs_ch)
-    
+
    emit:
    VCF2MAF.out

@@ -241,7 +241,7 @@ workflow {

 // Function to get list of [ meta, vcf ]
 def create_vcf_channel(LinkedHashMap row) {
-    
+
    // Create metadata element
    def meta = [:]
    meta.synapse_id = row.synapse_id

@@ -262,7 +262,7 @@

 // Function to get list of [ study_meta, maf ]
 def subset_study_meta(vcf_meta, maf) {
-    
+
    // Subset metadata element
    def study_meta = [:]
    study_meta.merged_parent_id = vcf_meta.merged_parent_id

From cbe08e084e3790d7bc54f56f85e312eadc08bb0d Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:51:16 -0700
Subject: [PATCH 45/45] Remove `head` commands

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 6e91a7b..8cf1611 100644
--- a/main.nf
+++ b/main.nf
@@ -80,9 +80,9 @@ process VCF2MAF {
    basename = input_vcf.name.replaceAll(/.gz$/, "").replaceAll(/.vcf$/, "")
    """
    if [[ ${input_vcf} == *.gz ]]; then
-        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
+        zcat ${input_vcf} > intermediate.vcf
    else
-        cat ${input_vcf} | head -n 10000 > intermediate.vcf
+        cat ${input_vcf} > intermediate.vcf
    fi

    vcf2maf.pl \