From fe4d92623850dc48aaf1ec024b41a7b3d981f7a5 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Mon, 12 Sep 2022 23:14:59 +0000
Subject: [PATCH 01/45] Create first working version with vcf2maf

---
 .gitignore      |  3 +++
 README.md       | 34 ++++++++++++++++++++++++++++++++++
 main.nf         | 35 +++++++++++++++++++++++++++++++++++
 nextflow.config |  1 +
 4 files changed, 73 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 main.nf
 create mode 100644 nextflow.config

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dcd7af8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+# Nextflow files
+.nextflow*
+work/
diff --git a/README.md b/README.md
index 663c567..ed52637 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,36 @@
 # nf-vcf2maf
+
 Annotate VCF files using VEP and generate MAF files using vcf2maf
+
+## Setup
+
+```console
+# Install tmux for long-running commands
+sudo yum install -y tmux
+
+# Download Ensembl VEP cache (expands to 22G)
+mkdir -p $HOME/ref/ $HOME/.vep/
+rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz $HOME/ref/
+tar -zvxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
+
+# Download reference genome FASTA file
+mkdir -p $HOME/ref/fasta/
+aws --no-sign-request s3 sync s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/ $HOME/ref/fasta/
+
+# Get example VCF file for testing purposes
+mkdir -p $HOME/data
+echo "Log into AWS using credentials from JumpCloud"
+aws s3 cp s3://ntap-add5-project-tower-bucket/outputs/sarek_syn29793238/VariantCalling/2-001_Plexiform_Neurofibroma/DeepVariant/2-001_Plexiform_Neurofibroma.vcf.gz ~/data/test.vcf.gz
+gunzip ~/data/test.vcf.gz
+head -n 4000 ~/data/test.vcf > ~/data/test-small.vcf
+
+# Install Nextflow
+sudo yum install -y java
+(cd .local/bin && wget -qO- https://get.nextflow.io | bash)
+
+# Stage reference files in memory
+mkdir -p /dev/shm/vep/ /dev/shm/fasta/
+sudo mount -o remount,size=30G /dev/shm  # Increase /dev/shm size
+rsync -avhP $HOME/.vep/ /dev/shm/vep/
+rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/
+```
diff --git a/main.nf b/main.nf
new file mode 100644
index 0000000..f58e631
--- /dev/null
+++ b/main.nf
@@ -0,0 +1,35 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+params.input_vcf = "$HOME/data/test-small.vcf"
+params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta"
+params.vep_data = "/dev/shm/vep/"
+
+process VCF2MAF {
+
+    container 'nfosi/vcf2maf:latest'
+
+    input:
+    path input_vcf
+    path reference_fasta
+    path vep_data
+
+    output:
+    path 'output.maf'
+
+    script:
+    """
+    perl /nf-osi-vcf2maf-*/vcf2maf.pl \
+        --input-vcf ${input_vcf} --output-maf output.maf --ref-fasta ${reference_fasta} \
+        --vep-data ${vep_data} --ncbi-build GRCh38 --max-subpop-af 0.0005 \
+        --vep-path /root/miniconda3/envs/vcf2maf/bin --maf-center "Sage Bionetworks"
+    """
+
+}
+
+workflow {
+    input_vcf_ch = Channel.fromPath(params.input_vcf)
+    VCF2MAF(input_vcf_ch, params.reference_fasta, params.vep_data)
+    VCF2MAF.out.view()
+}
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..d3af3ea
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1 @@
+docker.enabled = true

From 8d180e84ad167b9e130dd84197acd76986531c16 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Tue, 13 Sep 2022 03:52:30 +0000
Subject: [PATCH 02/45] Switch to `gnomad-genomes` container

---
 main.nf | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index
f58e631..aeeedd6 100644 --- a/main.nf +++ b/main.nf @@ -2,13 +2,18 @@ nextflow.enable.dsl=2 -params.input_vcf = "$HOME/data/test-small.vcf" +params.input_vcf = "$HOME/data/test-small*.vcf" params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" params.vep_data = "/dev/shm/vep/" +params.maf_center = "Sage Bionetworks" +params.max_subpop_af = 0.0005 +params.ncbi_build = "GRCh38" + + process VCF2MAF { - container 'nfosi/vcf2maf:latest' + container "sagebionetworks/vcf2maf:gnomad-genomes" input: path input_vcf @@ -19,17 +24,19 @@ process VCF2MAF { path 'output.maf' script: + vep_path = "/root/miniconda3/envs/vep/bin" """ - perl /nf-osi-vcf2maf-*/vcf2maf.pl \ + vcf2maf.pl \ --input-vcf ${input_vcf} --output-maf output.maf --ref-fasta ${reference_fasta} \ - --vep-data ${vep_data} --ncbi-build GRCh38 --max-subpop-af 0.0005 \ - --vep-path /root/miniconda3/envs/vcf2maf/bin --maf-center "Sage Bionetworks" + --vep-data ${vep_data} --ncbi-build ${params.ncbi_build} --max-subpop-af ${params.max_subpop_af} \ + --vep-path ${vep_path} --maf-center ${params.maf_center} """ } + workflow { - input_vcf_ch = Channel.fromPath(params.input_vcf) + input_vcf_ch = Channel.fromPath(params.input_vcf, checkIfExists: true) VCF2MAF(input_vcf_ch, params.reference_fasta, params.vep_data) VCF2MAF.out.view() } From b9c342a6bb0b8cd953fcd3be1787cd2eac0a8c58 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 05:37:20 +0000 Subject: [PATCH 03/45] Add `MERGE_MAFS` process --- main.nf | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/main.nf b/main.nf index aeeedd6..324f1a2 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 -params.input_vcf = "$HOME/data/test-small*.vcf" +params.input = "$HOME/data/example.csv" params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" params.vep_data = "/dev/shm/vep/" @@ -16,27 +16,88 @@ process VCF2MAF { container "sagebionetworks/vcf2maf:gnomad-genomes" input: - path input_vcf + tuple val(meta), path(input_vcf) path reference_fasta path vep_data output: - path 'output.maf' + tuple val(meta), path("${input_vcf}.maf") script: vep_path = "/root/miniconda3/envs/vep/bin" """ vcf2maf.pl \ - --input-vcf ${input_vcf} --output-maf output.maf --ref-fasta ${reference_fasta} \ - --vep-data ${vep_data} --ncbi-build ${params.ncbi_build} --max-subpop-af ${params.max_subpop_af} \ - --vep-path ${vep_path} --maf-center ${params.maf_center} + --input-vcf '${input_vcf}' --output-maf ${input_vcf}.maf --ref-fasta '${reference_fasta}' \ + --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ + --vep-path '${vep_path}' --maf-center '${params.maf_center}' + """ + +} + + +process MERGE_MAFS { + + container "python:3.10.4" + + input: + tuple val(meta), path(input_mafs) + + output: + tuple val(meta), path("${meta.study_id}.maf") + + script: + script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" + """ + wget ${script_url} + python3 merge_mafs.py -o ${meta.study_id}.maf -i ${input_mafs.join(',')} """ } workflow { - input_vcf_ch = Channel.fromPath(params.input_vcf, checkIfExists: true) - VCF2MAF(input_vcf_ch, params.reference_fasta, params.vep_data) - VCF2MAF.out.view() + + input_ch = Channel + .fromPath ( params.input ) + .splitCsv ( header:true, sep:',' ) + .map { create_vcf_channel(it) } + + VCF2MAF(input_ch, params.reference_fasta, params.vep_data) + + merged_inputs_ch = VCF2MAF.out 
+ .filter { meta, maf -> meta.is_releasable } + .map { + vcf_meta, maf -> + def study_meta = [:] + study_meta.merged_parent_id = vcf_meta.merged_parent_id + study_meta.study_id = vcf_meta.study_id + study_meta.variant_class = vcf_meta.variant_class + study_meta.variant_caller = vcf_meta.variant_caller + [ study_meta, maf ] + } + .groupTuple( by: 0 ) + + MERGE_MAFS(merged_inputs_ch) + + MERGE_MAFS.out.view() + +} + + +// Function to get list of [ meta, vcf ] +def create_vcf_channel(LinkedHashMap row) { + + // Create metadata element + def meta = [:] + meta.sample_parent_id = row.sample_parent_id + meta.merged_parent_id = row.merged_parent_id + meta.study_id = row.study_id + meta.variant_class = row.variant_class + meta.variant_caller = row.variant_caller + meta.is_releasable = row.is_releasable.toBoolean() + + // Combine with VCF file element + def vcf_meta = [meta, file(row.vcf_file)] + + return vcf_meta } From b81a9926eeaa745ed3b82fc90e85032ae83fe7fe Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:02:14 +0000 Subject: [PATCH 04/45] Add `FILTER_MAF` process --- main.nf | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 324f1a2..b432708 100644 --- a/main.nf +++ b/main.nf @@ -55,6 +55,37 @@ process MERGE_MAFS { } +process FILTER_MAF { + + container "python:3.10.4" + + input: + tuple val(meta), path(input_maf) + + output: + tuple val(meta), path("${input_maf}.filt.maf") + + script: + """ + #!/usr/bin/env python3 + + import csv + + with ( + open('${input_maf}', newline='') as infile, + open('${input_maf}.filt.maf', "w", newline='') as outfile + ): + reader = csv.DictReader(infile, delimiter='\t') + writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') + for row in reader: + print(row) + if row['FILTER'] == 'PASS': + writer.writerow(row) + """ + +} + + workflow { input_ch = Channel @@ -79,7 +110,9 @@ workflow { MERGE_MAFS(merged_inputs_ch) - MERGE_MAFS.out.view() + FILTER_MAF(MERGE_MAFS.out) + + FILTER_MAF.out.view() } From afbcb7ffe76adfc7ac666db7ebeff300f49559e0 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:02:40 +0000 Subject: [PATCH 05/45] Remove unnecessary print statement --- main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.nf b/main.nf index b432708..2f3b393 100644 --- a/main.nf +++ b/main.nf @@ -78,7 +78,6 @@ process FILTER_MAF { reader = csv.DictReader(infile, delimiter='\t') writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') for row in reader: - print(row) if row['FILTER'] == 'PASS': writer.writerow(row) """ From 5209817bc1d0d7f07b143a200e9ee1c6309fb614 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:29:07 +0000 Subject: [PATCH 06/45] Add `SYNAPSE_STORE` process --- main.nf | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 2f3b393..cc9b423 100644 --- a/main.nf +++ b/main.nf @@ -85,6 +85,23 @@ process FILTER_MAF { } +process SYNAPSE_STORE { + + container "sagebionetworks/synapsepythonclient:v2.6.0" + + secret "SYNAPSE_AUTH_TOKEN" + + input: + tuple path(input), val(parent_id) + + script: + """ + synapse store --parentId '${parent_id}' '${input}' + """ + +} + + workflow { input_ch = Channel @@ -111,7 +128,10 @@ workflow { FILTER_MAF(MERGE_MAFS.out) - FILTER_MAF.out.view() + merged_mafs_ch = MERGE_MAFS.out + .map { meta, maf -> [ maf, meta.merged_parent_id ] } + + SYNAPSE_STORE(merged_mafs_ch) } From 61ac39b6a85208ef77693d2ff1c8bd61d6afc819 
Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 07:50:29 +0000 Subject: [PATCH 07/45] Add `SYNAPSE_GET` and `SYNAPSE_STORE` processes --- main.nf | 105 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/main.nf b/main.nf index cc9b423..0528d2d 100644 --- a/main.nf +++ b/main.nf @@ -11,6 +11,26 @@ params.max_subpop_af = 0.0005 params.ncbi_build = "GRCh38" +process SYNAPSE_GET { + + container "sagebionetworks/synapsepythonclient:v2.6.0" + + secret "SYNAPSE_AUTH_TOKEN" + + input: + tuple val(meta), val(synapse_id) + + output: + tuple val(meta), path('*') + + script: + """ + synapse get '${synapse_id}' + """ + +} + + process VCF2MAF { container "sagebionetworks/vcf2maf:gnomad-genomes" @@ -102,36 +122,73 @@ process SYNAPSE_STORE { } +workflow SAMPLE_MAFS { + + take: + sample_vcfs + + main: + SYNAPSE_GET(sample_vcfs) + + VCF2MAF(SYNAPSE_GET.out, params.reference_fasta, params.vep_data) + + sample_mafs_ch = VCF2MAF.out + .map { meta, maf -> [ maf, meta.sample_parent_id ] } + + SYNAPSE_STORE(sample_mafs_ch) + + emit: + VCF2MAF.out + +} + + +workflow MERGED_MAFS { + + take: + sample_mafs + + main: + merged_inputs_ch = sample_mafs + .filter { meta, maf -> meta.is_releasable } + .map { + vcf_meta, maf -> + def study_meta = [:] + study_meta.merged_parent_id = vcf_meta.merged_parent_id + study_meta.study_id = vcf_meta.study_id + study_meta.variant_class = vcf_meta.variant_class + study_meta.variant_caller = vcf_meta.variant_caller + [ study_meta, maf ] + } + .groupTuple( by: 0 ) + + MERGE_MAFS(merged_inputs_ch) + + FILTER_MAF(MERGE_MAFS.out) + + merged_mafs_ch = MERGE_MAFS.out + .map { meta, maf -> [ maf, meta.merged_parent_id ] } + + filtered_mafs_ch = FILTER_MAF.out + .map { meta, maf -> [ maf, meta.merged_parent_id ] } + + both_mafs_ch = merged_mafs_ch.mix(filtered_mafs_ch) + + SYNAPSE_STORE(both_mafs_ch) + +} + + workflow { - input_ch = Channel + input_vcfs_ch = Channel .fromPath ( params.input ) .splitCsv ( header:true, sep:',' ) .map { create_vcf_channel(it) } - VCF2MAF(input_ch, params.reference_fasta, params.vep_data) - - merged_inputs_ch = VCF2MAF.out - .filter { meta, maf -> meta.is_releasable } - .map { - vcf_meta, maf -> - def study_meta = [:] - study_meta.merged_parent_id = vcf_meta.merged_parent_id - study_meta.study_id = vcf_meta.study_id - study_meta.variant_class = vcf_meta.variant_class - study_meta.variant_caller = vcf_meta.variant_caller - [ study_meta, maf ] - } - .groupTuple( by: 0 ) - - MERGE_MAFS(merged_inputs_ch) - - FILTER_MAF(MERGE_MAFS.out) - - merged_mafs_ch = MERGE_MAFS.out - .map { meta, maf -> [ maf, meta.merged_parent_id ] } + SAMPLE_MAFS(input_vcfs_ch) - SYNAPSE_STORE(merged_mafs_ch) + MERGED_MAFS(SAMPLE_MAFS.out) } @@ -149,7 +206,7 @@ def create_vcf_channel(LinkedHashMap row) { meta.is_releasable = row.is_releasable.toBoolean() // Combine with VCF file element - def vcf_meta = [meta, file(row.vcf_file)] + def vcf_meta = [meta, row.synapse_id] return vcf_meta } From e81c2afca85de593562522cf46da5e36d4ebbbec Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 08:02:08 +0000 Subject: [PATCH 08/45] Improve output file naming --- main.nf | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 0528d2d..0a081c7 100644 --- a/main.nf +++ b/main.nf @@ -41,13 +41,13 @@ process VCF2MAF { path vep_data output: - tuple val(meta), path("${input_vcf}.maf") + tuple val(meta), path("*.maf") script: vep_path = 
"/root/miniconda3/envs/vep/bin" """ vcf2maf.pl \ - --input-vcf '${input_vcf}' --output-maf ${input_vcf}.maf --ref-fasta '${reference_fasta}' \ + --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf --ref-fasta '${reference_fasta}' \ --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' """ @@ -63,13 +63,15 @@ process MERGE_MAFS { tuple val(meta), path(input_mafs) output: - tuple val(meta), path("${meta.study_id}.maf") + tuple val(meta), path("*.merged.maf") script: script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" """ wget ${script_url} - python3 merge_mafs.py -o ${meta.study_id}.maf -i ${input_mafs.join(',')} + python3 merge_mafs.py \ + -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ + -i ${input_mafs.join(',')} """ } @@ -83,7 +85,7 @@ process FILTER_MAF { tuple val(meta), path(input_maf) output: - tuple val(meta), path("${input_maf}.filt.maf") + tuple val(meta), path("*.passed.maf") script: """ @@ -93,7 +95,7 @@ process FILTER_MAF { with ( open('${input_maf}', newline='') as infile, - open('${input_maf}.filt.maf', "w", newline='') as outfile + open('${input_maf.baseName}.passed.maf', "w", newline='') as outfile ): reader = csv.DictReader(infile, delimiter='\t') writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') From b55a6b29713ebfd847945d4de73b481f0ad98e8f Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 08:07:12 +0000 Subject: [PATCH 09/45] Add TODOs for later --- main.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.nf b/main.nf index 0a081c7..4841964 100644 --- a/main.nf +++ b/main.nf @@ -31,6 +31,7 @@ process SYNAPSE_GET { } +// TODO: Look at other vcf2maf options (like the sample names) process VCF2MAF { container "sagebionetworks/vcf2maf:gnomad-genomes" @@ -55,6 +56,8 @@ process VCF2MAF { } +// TODO: Sanity check output +// TODO: Incorporate script inside container (or copy script to this repo) process MERGE_MAFS { container "python:3.10.4" @@ -77,6 +80,7 @@ process MERGE_MAFS { } +// TODO: Store Python script in a separate file process FILTER_MAF { container "python:3.10.4" @@ -124,6 +128,7 @@ process SYNAPSE_STORE { } +// TODO: Add comments workflow SAMPLE_MAFS { take: @@ -145,6 +150,7 @@ workflow SAMPLE_MAFS { } +// TODO: Add comments workflow MERGED_MAFS { take: From e128c3009154665214357a3c7030b3241d2b1c86 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Tue, 13 Sep 2022 21:22:02 +0000 Subject: [PATCH 10/45] Update Docker container tag to official release --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 4841964..0bfeb5d 100644 --- a/main.nf +++ b/main.nf @@ -34,7 +34,7 @@ process SYNAPSE_GET { // TODO: Look at other vcf2maf options (like the sample names) process VCF2MAF { - container "sagebionetworks/vcf2maf:gnomad-genomes" + container "sagebionetworks/vcf2maf:107.1" input: tuple val(meta), path(input_vcf) From 8a735bba40297d8ed23102ecc44213e9c4b42647 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Wed, 14 Sep 2022 00:27:10 +0000 Subject: [PATCH 11/45] Filter individual MAF files before merging --- main.nf | 65 +++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/main.nf b/main.nf index 0bfeb5d..8eb476a 100644 --- a/main.nf +++ b/main.nf @@ -48,33 +48,10 @@ process VCF2MAF { vep_path = 
"/root/miniconda3/envs/vep/bin" """ vcf2maf.pl \ - --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf --ref-fasta '${reference_fasta}' \ + --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf.raw --ref-fasta '${reference_fasta}' \ --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' - """ - -} - - -// TODO: Sanity check output -// TODO: Incorporate script inside container (or copy script to this repo) -process MERGE_MAFS { - - container "python:3.10.4" - - input: - tuple val(meta), path(input_mafs) - - output: - tuple val(meta), path("*.merged.maf") - - script: - script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" - """ - wget ${script_url} - python3 merge_mafs.py \ - -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ - -i ${input_mafs.join(',')} + grep -v '^#' ${input_vcf.baseName}.maf.raw > ${input_vcf.baseName}.maf """ } @@ -103,6 +80,7 @@ process FILTER_MAF { ): reader = csv.DictReader(infile, delimiter='\t') writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') + writer.writeheader() for row in reader: if row['FILTER'] == 'PASS': writer.writerow(row) @@ -111,6 +89,30 @@ process FILTER_MAF { } +// TODO: Sanity check output +// TODO: Incorporate script inside container (or copy script to this repo) +process MERGE_MAFS { + + container "python:3.10.4" + + input: + tuple val(meta), path(input_mafs) + + output: + tuple val(meta), path("*.merged.maf") + + script: + script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" + """ + wget ${script_url} + python3 merge_mafs.py \ + -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ + -i ${input_mafs.join(',')} + """ +} + + + process SYNAPSE_STORE { container "sagebionetworks/synapsepythonclient:v2.6.0" @@ -143,9 +145,11 @@ workflow SAMPLE_MAFS { .map { meta, maf -> [ maf, meta.sample_parent_id ] } SYNAPSE_STORE(sample_mafs_ch) + + FILTER_MAF(VCF2MAF.out) emit: - VCF2MAF.out + FILTER_MAF.out } @@ -172,17 +176,10 @@ workflow MERGED_MAFS { MERGE_MAFS(merged_inputs_ch) - FILTER_MAF(MERGE_MAFS.out) - merged_mafs_ch = MERGE_MAFS.out .map { meta, maf -> [ maf, meta.merged_parent_id ] } - filtered_mafs_ch = FILTER_MAF.out - .map { meta, maf -> [ maf, meta.merged_parent_id ] } - - both_mafs_ch = merged_mafs_ch.mix(filtered_mafs_ch) - - SYNAPSE_STORE(both_mafs_ch) + SYNAPSE_STORE(merged_mafs_ch) } From 4e6b7d15d673e87e9dc9ceac397ca5c68d3f8e70 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 15:57:08 +0000 Subject: [PATCH 12/45] Add process tags and handle gzipped VCF files --- bin/filter_maf.py | 18 ++++++ bin/merge_mafs.py | 153 ++++++++++++++++++++++++++++++++++++++++++++++ main.nf | 69 ++++++++++++--------- 3 files changed, 212 insertions(+), 28 deletions(-) create mode 100755 bin/filter_maf.py create mode 100755 bin/merge_mafs.py diff --git a/bin/filter_maf.py b/bin/filter_maf.py new file mode 100755 index 0000000..d675584 --- /dev/null +++ b/bin/filter_maf.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import csv +import sys + +input_maf = sys.argv[1] +output_maf = sys.argv[2] + +with ( + open(input_maf, newline='') as infile, + open(output_maf, "w", newline='') as outfile +): + reader = csv.DictReader(infile, delimiter='\t') + writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') + writer.writeheader() + for row in reader: + 
if row['FILTER'] == 'PASS': + writer.writerow(row) diff --git a/bin/merge_mafs.py b/bin/merge_mafs.py new file mode 100755 index 0000000..cff5948 --- /dev/null +++ b/bin/merge_mafs.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +# Copied from the genome-nexus/annotation-tools GitHub repository: +# https://github.com/genome-nexus/annotation-tools/blob/2e315735/merge_mafs.py + +# Copyright (c) 2020 Memorial Sloan Kettering Cancer Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import sys +import argparse + +# Script for merging MAFs +# Creates one aggregate header taken from column names acorss MAFs +# Writes out rows while mapping to header +# NAs and equivalents replaced with '' + +TUMOR_SAMPLE_BARCODE_COLUMN = "Tumor_Sample_Barcode" +HUGO_SYMBOL_COLUMN = "Hugo_Symbol" +ENTREZ_GENE_ID_COLUMN = "Entrez_Gene_Id" + +NULL_OR_MISSING_VALUES = ["", "NA", "N/A", None] + +def process_datum(val): + """ Cleans up datum. """ + try: + vfixed = val.strip() + except AttributeError: + vfixed = "" + if vfixed in NULL_OR_MISSING_VALUES: + return "" + return vfixed + +def get_file_header(data_file): + """ + Returns header from MAF + Assumes file is tab-delimited + Assumes header is first non-commented line + """ + header = [] + with open(data_file, "r") as header_source: + for line in header_source: + if not line.startswith("#"): + header = list(map(str.strip, line.rstrip().split("\t"))) + break + if not header: + print("Could not extract header from mutation data file: %s - Exiting..." % (filename)) + sys.exit(2) + return header + +def merge_headers(data_filenames): + """ + Generates a merged header from all input files. + * Also ensures that "Hugo_Symbol" and "Entrez_Gene_Id" + are the first 2 columns in the merged MAF header. + """ + merged_header = [HUGO_SYMBOL_COLUMN, ENTREZ_GENE_ID_COLUMN] + for fname in data_filenames: + columns_to_add = [column for column in get_file_header(fname) if column not in merged_header] + merged_header.extend(columns_to_add) + return merged_header + +def merge_input_mafs(input_mafs, output_maf_filename): + """ Generates a merged MAF given a list of input MAFs. 
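+    Rows from every input are remapped onto the merged header built by
+    merge_headers() (the union of all input headers, with Hugo_Symbol and
+    Entrez_Gene_Id first); columns missing from a given MAF are left blank.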
""" + merged_header = merge_headers(input_mafs) + + rows_to_write = [] + for input_maf in input_mafs: + print('Loading data from ' + input_maf) + header_processed = False + file_header = get_file_header(input_maf) + with open(input_maf, "r") as maf: + for line in maf: + if line.startswith("#"): + continue + if not header_processed: + header_processed = True + continue + # map row values to current header columns + mapped_row = dict(zip(file_header, list(map(lambda x: process_datum(x), line.split("\t"))))) + sample_id = mapped_row[TUMOR_SAMPLE_BARCODE_COLUMN] + + # full record for the merged MAF (if mapped_row does not contain a column in merged_header, it is blank) + normalized_row = list(map(lambda x: mapped_row.get(x, ""), merged_header)) + rows_to_write.append("\t".join(normalized_row)) + + with open(output_maf_filename, "w") as output_maf: + output_maf.write("\t".join(merged_header)) + output_maf.write("\n") + output_maf.write("\n".join(rows_to_write)) + output_maf.write("\n") + print('Merged MAF written to: %s' % (output_maf_filename)) + +def verify_input_mafs(input_mafs): + for maf in input_mafs: + if not os.path.isfile(maf): + print("Could not find MAF by filename: %s" % (maf)) + exit(1) + +def usage(parser): + parser.print_help() + sys.exit(2) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument("-i", "--input-mafs-list", action = "store", dest = "input_mafs_list", help = "comma-delimited list of MAFs to merge") + parser.add_argument("-d", "--input-mafs-directory", action = "store", dest = "input_mafs_directory", help = "directory containing all MAFs to merge") + parser.add_argument("-o", "--output-maf", action = "store", dest = "output_maf", help = "output filename for merged MAF [REQUIRED]") + + args = parser.parse_args() + input_mafs_list = args.input_mafs_list + input_mafs_directory = args.input_mafs_directory + output_maf = args.output_maf + + if not output_maf: + print("Missing required argument: -o | --output-maf") + usage(parser) + + if (input_mafs_list and input_mafs_directory) or (not input_mafs_list and not input_mafs_directory): + print("Please choose only one of the following options when running script: --input-mafs-list | --input-mafs-directory") + usage(parser) + + if input_mafs_list: + input_mafs = list(map(str.strip, input_mafs_list.split(","))) + else: + input_mafs = [] + for maf in os.listdir(input_mafs_directory): + full_maf_path = os.path.join(input_mafs_directory, maf) + if os.path.isfile(full_maf_path): + input_mafs.append(full_maf_path) + else: + print("Skipping non-file element in directory... 
'%s'" % (full_maf_path)) + + verify_input_mafs(input_mafs) + merge_input_mafs(input_mafs, output_maf) diff --git a/main.nf b/main.nf index 8eb476a..b2a2483 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,7 @@ nextflow.enable.dsl=2 params.input = "$HOME/data/example.csv" params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" +params.reference_fasta_fai = "${params.reference_fasta}.fai" params.vep_data = "/dev/shm/vep/" params.maf_center = "Sage Bionetworks" @@ -13,6 +14,8 @@ params.ncbi_build = "GRCh38" process SYNAPSE_GET { + tag "${meta.synapse_id}" + container "sagebionetworks/synapsepythonclient:v2.6.0" secret "SYNAPSE_AUTH_TOKEN" @@ -31,14 +34,21 @@ process SYNAPSE_GET { } -// TODO: Look at other vcf2maf options (like the sample names) +// TODO: Handle VCF genotype columns per variant caller +// TODO: Add appropriate memory allocation process VCF2MAF { - container "sagebionetworks/vcf2maf:107.1" + tag "${meta.synapse_id}" + + container "sagebionetworks/vcf2maf:107.1-debug2" + + cpus 8 + + afterScript "rm -f intermediate*" input: tuple val(meta), path(input_vcf) - path reference_fasta + tuple path(reference_fasta), path(reference_fasta_fai) path vep_data output: @@ -47,19 +57,29 @@ process VCF2MAF { script: vep_path = "/root/miniconda3/envs/vep/bin" """ + if [[ ${input_vcf} == *.gz ]]; then + zcat '${input_vcf}' > 'intermediate.vcf' + else + mv '${input_vcf}' 'intermediate.vcf' + fi + vcf2maf.pl \ - --input-vcf '${input_vcf}' --output-maf ${input_vcf.baseName}.maf.raw --ref-fasta '${reference_fasta}' \ - --vep-data '${vep_data}' --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ - --vep-path '${vep_path}' --maf-center '${params.maf_center}' - grep -v '^#' ${input_vcf.baseName}.maf.raw > ${input_vcf.baseName}.maf + --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ + --ref-fasta '${reference_fasta}' --vep-data '${vep_data}' \ + --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ + --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ + --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' + + grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ } -// TODO: Store Python script in a separate file process FILTER_MAF { + tag "${input_maf.name}" + container "python:3.10.4" input: @@ -70,29 +90,17 @@ process FILTER_MAF { script: """ - #!/usr/bin/env python3 - - import csv - - with ( - open('${input_maf}', newline='') as infile, - open('${input_maf.baseName}.passed.maf', "w", newline='') as outfile - ): - reader = csv.DictReader(infile, delimiter='\t') - writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t') - writer.writeheader() - for row in reader: - if row['FILTER'] == 'PASS': - writer.writerow(row) + filter_maf.py ${input_maf} ${input_maf.baseName}.passed.maf """ } // TODO: Sanity check output -// TODO: Incorporate script inside container (or copy script to this repo) process MERGE_MAFS { + tag "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}" + container "python:3.10.4" input: @@ -102,19 +110,18 @@ process MERGE_MAFS { tuple val(meta), path("*.merged.maf") script: - script_url = "https://raw.githubusercontent.com/genome-nexus/annotation-tools/master/merge_mafs.py" """ - wget ${script_url} - python3 merge_mafs.py \ + merge_mafs.py \ -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ -i ${input_mafs.join(',')} """ } - process SYNAPSE_STORE { + tag 
"${parent_id}/${input.name}" + container "sagebionetworks/synapsepythonclient:v2.6.0" secret "SYNAPSE_AUTH_TOKEN" @@ -137,9 +144,13 @@ workflow SAMPLE_MAFS { sample_vcfs main: + reference_fasta_pair = [ + params.reference_fasta, params.reference_fasta_fai + ] + SYNAPSE_GET(sample_vcfs) - VCF2MAF(SYNAPSE_GET.out, params.reference_fasta, params.vep_data) + VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, params.vep_data) sample_mafs_ch = VCF2MAF.out .map { meta, maf -> [ maf, meta.sample_parent_id ] } @@ -203,6 +214,8 @@ def create_vcf_channel(LinkedHashMap row) { // Create metadata element def meta = [:] + meta.synapse_id = row.synapse_id + meta.biospecimen_id = row.biospecimen_id meta.sample_parent_id = row.sample_parent_id meta.merged_parent_id = row.merged_parent_id meta.study_id = row.study_id From e32b68c2c5c2fc66b82af8ad9e7c6ff838a3c70e Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 16:18:26 +0000 Subject: [PATCH 13/45] Add steps for Nextflow secrets --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed52637..8991cb4 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,12 @@ aws s3 cp s3://ntap-add5-project-tower-bucket/outputs/sarek_syn29793238/VariantC gunzip ~/data/test.vcf.gz head -n 4000 ~/data/test.vcf ~/data/test-small.vcf -# Install Nextflow +# Install and setup Nextflow sudo yum install -y java (cd .local/bin && wget -qO- https://get.nextflow.io | bash) +echo 'export NXF_ENABLE_SECRETS=true' >> ~/.bashrc +source ~/.bashrc +nextflow secrets put -n SYNAPSE_AUTH_TOKEN -v "" # Stage reference files in memory mkdir -p /dev/shm/vep/ /dev/shm/fasta/ From 1203985d2e6ed60a42e5502a5a31537ceddefcf2 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:30:43 +0000 Subject: [PATCH 14/45] Switch to downloading the VEP cache tarball --- main.nf | 26 +++++++++++--------------- nextflow.config | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/main.nf b/main.nf index b2a2483..0e3726d 100644 --- a/main.nf +++ b/main.nf @@ -2,15 +2,6 @@ nextflow.enable.dsl=2 -params.input = "$HOME/data/example.csv" -params.reference_fasta = "/dev/shm/fasta/Homo_sapiens_assembly38.fasta" -params.reference_fasta_fai = "${params.reference_fasta}.fai" -params.vep_data = "/dev/shm/vep/" - -params.maf_center = "Sage Bionetworks" -params.max_subpop_af = 0.0005 -params.ncbi_build = "GRCh38" - process SYNAPSE_GET { @@ -35,14 +26,15 @@ process SYNAPSE_GET { // TODO: Handle VCF genotype columns per variant caller -// TODO: Add appropriate memory allocation +// TODO: Improve handling of vep_path process VCF2MAF { tag "${meta.synapse_id}" - container "sagebionetworks/vcf2maf:107.1-debug2" + container "sagebionetworks/vcf2maf:107.1" cpus 8 + memory '16.GB' afterScript "rm -f intermediate*" @@ -58,17 +50,21 @@ process VCF2MAF { vep_path = "/root/miniconda3/envs/vep/bin" """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' > 'intermediate.vcf' + zcat '${input_vcf}' | head 10000 > 'intermediate.vcf' else - mv '${input_vcf}' 'intermediate.vcf' + cat '${input_vcf}' | head 10000 > 'intermediate.vcf' fi + mkdir -p 'vep_data/' + tar -zxf '${vep_data}' -C 'vep_data/' + vcf2maf.pl \ --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ - --ref-fasta '${reference_fasta}' --vep-data '${vep_data}' \ + --ref-fasta '${reference_fasta}' --vep-data 'vep_data/' \ --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' 
--maf-center '${params.maf_center}' \ - --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' + --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' \ + --species ${params.species} grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ diff --git a/nextflow.config b/nextflow.config index d3af3ea..bd4251c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1 +1,33 @@ -docker.enabled = true +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + nf-core/rnaseq Nextflow config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Default config options for all compute environments +---------------------------------------------------------------------------------------- +*/ + +// Input parameters +params.input = null +params.reference_fasta = "s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" +params.reference_fasta_fai = "${params.reference_fasta}.fai" +params.vep_data = "s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz" + +// vcf2maf parameters +params.maf_center = "Sage Bionetworks" +params.max_subpop_af = 0.0005 +params.ncbi_build = "GRCh38" +params.species = "homo_sapiens" + +// Profiles +profiles { + + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + +} From 2617be492328f4a5724f6761132b52fa97bb5ed6 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:30:53 +0000 Subject: [PATCH 15/45] Add performance benchmarks --- README.md | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/README.md b/README.md index 8991cb4..735fe4c 100644 --- a/README.md +++ b/README.md @@ -37,3 +37,127 @@ sudo mount -o remount,size=30G /dev/shm # Increase /dev/shm size rsync -avhP $HOME/.vep/ /dev/shm/vep/ rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/ ``` + +## Benchmarks + +### Ensembl VEP cache + +I want to see what's the fastest way of downloading the VEP cache. 
Based on the tests below, here are some options:

- Download tarball from Ensembl and extract: 16m34s (10m23s + 6m11s)
- Download tarball from S3 and extract: 9m25s (3m14s + 6m11s)
- Download extracted tarball from S3: 4m5s

#### Using rsync with Ensembl's servers

```console
$ time rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./

real 10m23.648s
user 0m50.824s
sys 0m44.770s
```

#### Using S3 with our public bucket

```console
$ time aws --no-sign-request s3 cp s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./s3-homo_sapiens_vep_107_GRCh38.tar.gz

real 3m14.885s
user 3m31.085s
sys 4m1.697s
```

#### Extracting the tarball

```console
$ time tar -zxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/

real 6m11.087s
user 2m59.190s
sys 0m40.660s
```

#### Downloading the extracted tarball

```console
$ time aws --no-sign-request s3 sync s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/ ./s3-107_GRCh38/

real 4m5.307s
user 5m13.736s
sys 3m24.302s
```

### Nextflow workflow runs

#### Original (all reference files in `/dev/shm`)

```console
$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv

real 3m43.240s
user 0m29.439s
sys 0m1.415s
```

#### VEP in shm and FASTA in S3

Note that I needed to set `aws.client.anonymous = true` in the Nextflow config.

```console
$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta

real 4m9.529s
user 0m56.803s
sys 0m5.996s
```

#### VEP in non-shm and FASTA in S3

I'm curious about the difference in performance between SHM and non-SHM.

```console
$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/.vep/homo_sapiens/107_GRCh38/

real 3m43.073s
user 0m27.599s
sys 0m1.449s
```

#### VEP folder and FASTA in S3

I had to kill this job because there was barely any CPU being used. It seemed stuck on staging the `107_GRCh38/` folder. Nextflow seems to stage individual files much more efficiently than whole folders, whereas the AWS CLI syncs the same folder in about four minutes.
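As an aside, the anonymous S3 access mentioned above should boil down to a single config line; this is only a sketch, assuming no other `aws` client options are needed:

```groovy
// nextflow.config: read public buckets (e.g., sage-igenomes) without AWS credentials
aws.client.anonymous = true
```

With that in place, the `s3://sage-igenomes` paths used in these runs can be staged without credentials.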
+ +```console +$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/ + +^C + +real 17m7.522s +user 2m28.423s +sys 0m38.765s +``` + +#### VEP tarball in non-shm and FASTA in S3 + + +```console +$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/ref/homo_sapiens_vep_107_GRCh38.tar.gz + +real 8m39.798s +user 0m29.715s +sys 0m1.455s +``` + + +#### VEP tarball and FASTA in S3 + + +```console +$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz + +real 8m38.703s +user 2m47.076s +sys 0m50.287s +``` + From 06c92886e3b5fb39d7e1512e94db235b1ae77848 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 00:09:38 +0000 Subject: [PATCH 16/45] Split off tarball extraction --- main.nf | 32 +++++++++++++++++++++++++------- nextflow.config | 2 +- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 0e3726d..999d4ac 100644 --- a/main.nf +++ b/main.nf @@ -25,6 +25,25 @@ process SYNAPSE_GET { } +process EXTRACT_TAR_GZ { + + container "sagebionetworks/vcf2maf:107.1" + + input: + path vep_tarball + + output: + path "vep_data" + + script: + """ + mkdir -p 'vep_data/' + tar -zxf '${vep_tarball}' -C 'vep_data/' + """ + +} + + // TODO: Handle VCF genotype columns per variant caller // TODO: Improve handling of vep_path process VCF2MAF { @@ -50,17 +69,14 @@ process VCF2MAF { vep_path = "/root/miniconda3/envs/vep/bin" """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' | head 10000 > 'intermediate.vcf' + zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' else - cat '${input_vcf}' | head 10000 > 'intermediate.vcf' + cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' fi - mkdir -p 'vep_data/' - tar -zxf '${vep_data}' -C 'vep_data/' - vcf2maf.pl \ --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ - --ref-fasta '${reference_fasta}' --vep-data 'vep_data/' \ + --ref-fasta '${reference_fasta}' --vep-data '${vep_data}/' \ --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ --tumor-id '${meta.biospecimen_id}' --vep-forks '${task.cpus}' \ @@ -146,7 +162,9 @@ workflow SAMPLE_MAFS { SYNAPSE_GET(sample_vcfs) - VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, params.vep_data) + EXTRACT_TAR_GZ(params.vep_tarball) + + VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, EXTRACT_TAR_GZ.out) sample_mafs_ch = VCF2MAF.out .map { meta, maf -> [ maf, meta.sample_parent_id ] } diff --git a/nextflow.config b/nextflow.config index bd4251c..1b7503e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,7 @@ params.input = null params.reference_fasta = "s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" params.reference_fasta_fai = "${params.reference_fasta}.fai" -params.vep_data = "s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz" +params.vep_tarball = "s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz" // vcf2maf 
parameters params.maf_center = "Sage Bionetworks" From 9fd3071ee9dc7d59566fca3df2a3269368f940b9 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 17:37:01 -0700 Subject: [PATCH 17/45] Add missing dependency for Tower --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 999d4ac..0c9f16a 100644 --- a/main.nf +++ b/main.nf @@ -27,7 +27,7 @@ process SYNAPSE_GET { process EXTRACT_TAR_GZ { - container "sagebionetworks/vcf2maf:107.1" + container "sagebionetworks/vcf2maf:107.2" input: path vep_tarball From b2efbb7c96c5a1f6ff4914cba9d168d3a5fdfb7e Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 20:43:33 -0700 Subject: [PATCH 18/45] Add missing dependency for Tower (take 2) --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0c9f16a..6f64dc7 100644 --- a/main.nf +++ b/main.nf @@ -50,7 +50,7 @@ process VCF2MAF { tag "${meta.synapse_id}" - container "sagebionetworks/vcf2maf:107.1" + container "sagebionetworks/vcf2maf:107.2" cpus 8 memory '16.GB' From 238d41e69172391d99bf7a5058096be0d21dd168 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:07:06 -0700 Subject: [PATCH 19/45] Quote all files in commands --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 6f64dc7..7596e4f 100644 --- a/main.nf +++ b/main.nf @@ -102,7 +102,7 @@ process FILTER_MAF { script: """ - filter_maf.py ${input_maf} ${input_maf.baseName}.passed.maf + filter_maf.py '${input_maf}' '${input_maf.baseName}.passed.maf' """ } @@ -124,8 +124,8 @@ process MERGE_MAFS { script: """ merge_mafs.py \ - -o ${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf \ - -i ${input_mafs.join(',')} + -o '${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf' \ + -i '${input_mafs.join(',')}' """ } From 44332dfc1f5034852e2d3f22beb5ed9044ad51c8 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 15 Sep 2022 22:54:49 -0700 Subject: [PATCH 20/45] Remove `head` command --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 7596e4f..f9fbda2 100644 --- a/main.nf +++ b/main.nf @@ -69,9 +69,9 @@ process VCF2MAF { vep_path = "/root/miniconda3/envs/vep/bin" """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + zcat '${input_vcf}' > 'intermediate.vcf' else - cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + cat '${input_vcf}' > 'intermediate.vcf' fi vcf2maf.pl \ From 580cce223f9b948a0a4c76c2fece3a04b3764ef7 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 06:35:29 -0700 Subject: [PATCH 21/45] Optimize vcf2maf resources --- main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index f9fbda2..d23987a 100644 --- a/main.nf +++ b/main.nf @@ -52,8 +52,8 @@ process VCF2MAF { container "sagebionetworks/vcf2maf:107.2" - cpus 8 - memory '16.GB' + cpus 6 + memory 8.GB * task.attempt afterScript "rm -f intermediate*" @@ -67,6 +67,7 @@ process VCF2MAF { script: vep_path = "/root/miniconda3/envs/vep/bin" + vep_forks = task.cpus + 2 """ if [[ ${input_vcf} == *.gz ]]; then zcat '${input_vcf}' > 'intermediate.vcf' @@ -79,7 +80,7 @@ process VCF2MAF { --ref-fasta '${reference_fasta}' --vep-data '${vep_data}/' \ --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ - --tumor-id 
'${meta.biospecimen_id}' --vep-forks '${task.cpus}' \ + --tumor-id '${meta.biospecimen_id}' --vep-forks '${vep_forks}' \ --species ${params.species} grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' From 8b3700a279dc6c77f64779811308bdacff39c27a Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 06:40:28 -0700 Subject: [PATCH 22/45] Add missing command quotes --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index d23987a..19fadda 100644 --- a/main.nf +++ b/main.nf @@ -81,7 +81,7 @@ process VCF2MAF { --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ --tumor-id '${meta.biospecimen_id}' --vep-forks '${vep_forks}' \ - --species ${params.species} + --species '${params.species}' grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ From 878b519057f8269e90d5aa38d4762a580c78148c Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 07:04:55 -0700 Subject: [PATCH 23/45] Polish workflow and add comments --- main.nf | 77 ++++++++++++++++++++++++++++++------------------- nextflow.config | 8 ----- 2 files changed, 48 insertions(+), 37 deletions(-) diff --git a/main.nf b/main.nf index 19fadda..85db611 100644 --- a/main.nf +++ b/main.nf @@ -3,6 +3,7 @@ nextflow.enable.dsl=2 +// Process for downloading files from Synapse process SYNAPSE_GET { tag "${meta.synapse_id}" @@ -25,6 +26,7 @@ process SYNAPSE_GET { } +// Process for decompressing and extracting the VEP cache tarball process EXTRACT_TAR_GZ { container "sagebionetworks/vcf2maf:107.2" @@ -44,8 +46,7 @@ process EXTRACT_TAR_GZ { } -// TODO: Handle VCF genotype columns per variant caller -// TODO: Improve handling of vep_path +// Process for annotating VCF file and converting to MAF process VCF2MAF { tag "${meta.synapse_id}" @@ -65,14 +66,16 @@ process VCF2MAF { output: tuple val(meta), path("*.maf") + // TODO: Remove hard-coded VEP path + // TODO: Handle VCF genotype columns per variant caller script: vep_path = "/root/miniconda3/envs/vep/bin" vep_forks = task.cpus + 2 """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' > 'intermediate.vcf' + zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' else - cat '${input_vcf}' > 'intermediate.vcf' + cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' fi vcf2maf.pl \ @@ -89,6 +92,7 @@ process VCF2MAF { } +// Process for filtering MAF files for passed variants process FILTER_MAF { tag "${input_maf.name}" @@ -109,7 +113,7 @@ process FILTER_MAF { } -// TODO: Sanity check output +// Process for merging study MAF files process MERGE_MAFS { tag "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}" @@ -131,6 +135,7 @@ process MERGE_MAFS { } +// Process for uploading files to Synapse process SYNAPSE_STORE { tag "${parent_id}/${input.name}" @@ -150,76 +155,75 @@ process SYNAPSE_STORE { } -// TODO: Add comments +// Workflow for generating sample-level MAF files workflow SAMPLE_MAFS { take: sample_vcfs main: - reference_fasta_pair = [ - params.reference_fasta, params.reference_fasta_fai - ] + // Pair up FASTA and FAI reference files + ref_fasta_pair = [params.reference_fasta, params.reference_fasta_fai] + // Download VCF files from Synapse SYNAPSE_GET(sample_vcfs) + // Decompress and extract VEP cache tarball EXTRACT_TAR_GZ(params.vep_tarball) + // Run vcf2maf on each vcf file VCF2MAF(SYNAPSE_GET.out, 
reference_fasta_pair, EXTRACT_TAR_GZ.out)
 
+    // Upload MAF files to Synapse
     sample_mafs_ch = VCF2MAF.out
         .map { meta, maf -> [ maf, meta.sample_parent_id ] }
-
     SYNAPSE_STORE(sample_mafs_ch)
-
-    FILTER_MAF(VCF2MAF.out)
 
     emit:
-    FILTER_MAF.out
+    VCF2MAF.out
 
 }
 
 
-// TODO: Add comments
-workflow MERGED_MAFS {
+// Workflow for generating study-level MAF files
+workflow STUDY_MAFS {
 
     take:
     sample_mafs
 
     main:
-    merged_inputs_ch = sample_mafs
+    // Filter MAF files for passed variants
+    FILTER_MAF(sample_mafs)
+
+    // Group MAF files by study and merge
+    mafs_by_study_ch = FILTER_MAF.out
         .filter { meta, maf -> meta.is_releasable }
-        .map {
-            vcf_meta, maf ->
-            def study_meta = [:]
-            study_meta.merged_parent_id = vcf_meta.merged_parent_id
-            study_meta.study_id = vcf_meta.study_id
-            study_meta.variant_class = vcf_meta.variant_class
-            study_meta.variant_caller = vcf_meta.variant_caller
-            [ study_meta, maf ]
-        }
+        .map { meta, maf -> subset_study_meta(meta, maf) }
         .groupTuple( by: 0 )
-
-    MERGE_MAFS(merged_inputs_ch)
+    MERGE_MAFS(mafs_by_study_ch)
 
+    // Upload study MAF files to Synapse
     merged_mafs_ch = MERGE_MAFS.out
         .map { meta, maf -> [ maf, meta.merged_parent_id ] }
-
     SYNAPSE_STORE(merged_mafs_ch)
 
 }
 
 
+// Entrypoint workflow
 workflow {
 
+    // Parse input CSV sample sheet
     input_vcfs_ch = Channel
         .fromPath ( params.input )
         .splitCsv ( header:true, sep:',' )
         .map { create_vcf_channel(it) }
 
+    // Process individual sample VCF files
     SAMPLE_MAFS(input_vcfs_ch)
 
-    MERGED_MAFS(SAMPLE_MAFS.out)
+    // Filter and merge MAF files by study
+    STUDY_MAFS(SAMPLE_MAFS.out)
 
 }
+
+
+// Function to get list of [ study_meta, maf ]
+def subset_study_meta(vcf_meta, maf) {
+
+    // Subset metadata element
+    def study_meta = [:]
+    study_meta.merged_parent_id = vcf_meta.merged_parent_id
+    study_meta.study_id = vcf_meta.study_id
+    study_meta.variant_class = vcf_meta.variant_class
+    study_meta.variant_caller = vcf_meta.variant_caller
+
+    return [ study_meta, maf ]
+}
diff --git a/nextflow.config b/nextflow.config
index 1b7503e..6a39361 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,11 +1,3 @@
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    nf-core/rnaseq Nextflow config file
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Default config options for all compute environments
-----------------------------------------------------------------------------------------
-*/
-
 // Input parameters
 params.input = null

From 13fcb6b8dfff73b83e566cec98fba3691dd00326 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 07:50:22 -0700
Subject: [PATCH 24/45] Fix variable name mismatch

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 85db611..9bc2f6e 100644
--- a/main.nf
+++ b/main.nf
@@ -172,7 +172,7 @@ workflow SAMPLE_MAFS {
     EXTRACT_TAR_GZ(params.vep_tarball)
 
     // Run vcf2maf on each vcf file
-    VCF2MAF(SYNAPSE_GET.out, reference_fasta_pair, EXTRACT_TAR_GZ.out)
+    VCF2MAF(SYNAPSE_GET.out, ref_fasta_pair, EXTRACT_TAR_GZ.out)
 
     sample_mafs_ch = VCF2MAF.out
         .map { meta, maf -> [ maf, meta.sample_parent_id ] }

From 64d548c8d0d474012d04d46b4301f55049b3db78 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 08:09:27 -0700
Subject: [PATCH 25/45] Enable CSV value stripping

---
 main.nf | 2
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9bc2f6e..3d49b17 100644 --- a/main.nf +++ b/main.nf @@ -216,7 +216,7 @@ workflow { // Parse input CSV sample sheet input_vcfs_ch = Channel .fromPath ( params.input ) - .splitCsv ( header:true, sep:',' ) + .splitCsv ( header:true, strip:true ) .map { create_vcf_channel(it) } // Process individual sample VCF files From 13e7bd166cc886da3d49e9232fb2ad1aaae0d43b Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 08:12:35 -0700 Subject: [PATCH 26/45] Add missing closure for `memory` --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 3d49b17..267afb1 100644 --- a/main.nf +++ b/main.nf @@ -54,7 +54,8 @@ process VCF2MAF { container "sagebionetworks/vcf2maf:107.2" cpus 6 - memory 8.GB * task.attempt + memory { 8.GB * task.attempt } + maxRetries 3 afterScript "rm -f intermediate*" From c0c927ada884e0f6779c0a015321ae50bd1082aa Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Fri, 16 Sep 2022 09:29:30 -0700 Subject: [PATCH 27/45] Fix quotes Spaces in inputs are automatically escaped, but not in file attributes (like `baseName`). So, I removed the quotes from the former and kept them for the latter. --- main.nf | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index 267afb1..30b2a29 100644 --- a/main.nf +++ b/main.nf @@ -20,7 +20,7 @@ process SYNAPSE_GET { script: """ - synapse get '${synapse_id}' + synapse get ${synapse_id} """ } @@ -39,8 +39,8 @@ process EXTRACT_TAR_GZ { script: """ - mkdir -p 'vep_data/' - tar -zxf '${vep_tarball}' -C 'vep_data/' + mkdir -p vep_data/ + tar -zxf ${vep_tarball} -C vep_data/ """ } @@ -74,20 +74,20 @@ process VCF2MAF { vep_forks = task.cpus + 2 """ if [[ ${input_vcf} == *.gz ]]; then - zcat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + zcat ${input_vcf} | head -n 10000 > intermediate.vcf else - cat '${input_vcf}' | head -n 10000 > 'intermediate.vcf' + cat ${input_vcf} | head -n 10000 > intermediate.vcf fi vcf2maf.pl \ - --input-vcf 'intermediate.vcf' --output-maf 'intermediate.maf.raw' \ - --ref-fasta '${reference_fasta}' --vep-data '${vep_data}/' \ - --ncbi-build '${params.ncbi_build}' --max-subpop-af '${params.max_subpop_af}' \ - --vep-path '${vep_path}' --maf-center '${params.maf_center}' \ - --tumor-id '${meta.biospecimen_id}' --vep-forks '${vep_forks}' \ - --species '${params.species}' - - grep -v '^#' 'intermediate.maf.raw' > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' + --input-vcf intermediate.vcf --output-maf intermediate.maf.raw \ + --ref-fasta ${reference_fasta} --vep-data ${vep_data}/ \ + --ncbi-build ${params.ncbi_build} --max-subpop-af ${params.max_subpop_af} \ + --vep-path ${vep_path} --maf-center ${params.maf_center} \ + --tumor-id '${meta.biospecimen_id}' --vep-forks ${vep_forks} \ + --species ${params.species} + + grep -v '^#' intermediate.maf.raw > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf' """ } @@ -108,7 +108,7 @@ process FILTER_MAF { script: """ - filter_maf.py '${input_maf}' '${input_maf.baseName}.passed.maf' + filter_maf.py ${input_maf} '${input_maf.baseName}.passed.maf' """ } @@ -128,10 +128,9 @@ process MERGE_MAFS { tuple val(meta), path("*.merged.maf") script: + prefix = "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}" """ - merge_mafs.py \ - -o '${meta.study_id}-${meta.variant_class}-${meta.variant_caller}.merged.maf' \ - -i 
'${input_mafs.join(',')}'
+    merge_mafs.py -d './' -o '${prefix}.merged.maf'
     """
 }

 process SYNAPSE_STORE {

     script:
     """
-    synapse store --parentId '${parent_id}' '${input}'
+    synapse store --parentId ${parent_id} ${input}
     """

 }

From 20ba11a73daa1cc8a3aacee4376fe69c9dfbbdf1 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:08:02 -0700
Subject: [PATCH 28/45] Create detailed README

---
 README.md | 240 +++++++++++++++++++++++++++++------------------------
 1 file changed, 129 insertions(+), 111 deletions(-)

diff --git a/README.md b/README.md
index 735fe4c..4812877 100644
--- a/README.md
+++ b/README.md
@@ -1,163 +1,181 @@
-# nf-vcf2maf
+# nf-vcf2maf: Annotate Variants (VCF) and Generate MAF Files

## Purpose

The purpose of this Nextflow workflow is to annotate variants in VCF files using the Ensembl Variant Effect Predictor (VEP) and convert the annotated VCF files into the [Mutation Annotation Format](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) (MAF). Unlike VEP-annotated VCF files, MAF files are generally more useful for downstream applications given their tabular nature. For example, you can easily load them into R (_e.g._ with the [`maftools`](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) package) and/or derive the input files for [cBioPortal](https://www.cbioportal.org/visualize).

**Important:** Please read the [limitations](#known-limitations) listed below.

This repository leverages a [fork of vcf2maf](https://github.com/Sage-Bionetworks-Workflows/vcf2maf) and a [custom container image](https://github.com/Sage-Bionetworks-Workflows/vcf2maf-docker).

## Quickstart

1. Prepare a CSV samplesheet according to the [format](#samplesheet) described below.
From 20ba11a73daa1cc8a3aacee4376fe69c9dfbbdf1 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:08:02 -0700
Subject: [PATCH 28/45] Create detailed README

---
 README.md | 240 +++++++++++++++++++++++++++++-------------------
 1 file changed, 129 insertions(+), 111 deletions(-)

diff --git a/README.md b/README.md
index 735fe4c..4812877 100644
--- a/README.md
+++ b/README.md
@@ -1,163 +1,181 @@
-# nf-vcf2maf
+# nf-vcf2maf: Annotate Variants (VCF) and Generate MAF Files
-Annotate VCF files using VEP and generate MAF files using vcf2maf
+## Purpose
-## Setup
+The purpose of this Nextflow workflow is to annotate variants in VCF files using the Ensembl Variant Effect Predictor (VEP) and convert the annotated VCF file into the [Mutation Annotation Format](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) (MAF). Unlike VEP-annotated VCF files, MAF files are generally more useful for downstream applications given their tabular nature. For example, you can easily load them into R (_e.g._ with the [`maftools`](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) package) and/or derive the input files for [cBioPortal](https://www.cbioportal.org/visualize).
-```console
-# Install tmux for long-running commands
-sudo yum install -y tmux
+**Important:** Please read the [limitations](#known-limitations) listed below.
-# Download Ensembl VEP cache (expands to 22G)
-mkdir -p $HOME/ref/ $HOME/.vep/
-rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz $HOME/ref/
-tar -zvxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
+This repository leverages a [fork of vcf2maf](https://github.com/Sage-Bionetworks-Workflows/vcf2maf) and a [custom container image](https://github.com/Sage-Bionetworks-Workflows/vcf2maf-docker).
-# Download reference genome FASTA file
-mkdir -p $HOME/ref/fasta/
-aws --no-sign-request s3 sync s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/ $HOME/ref/fasta/
+## Quickstart
-# Get example VCF file for testing purposes
-mkdir -p $HOME/data
-echo "Log into AWS using credentials from JumpCloud"
-aws s3 cp s3://ntap-add5-project-tower-bucket/outputs/sarek_syn29793238/VariantCalling/2-001_Plexiform_Neurofibroma/DeepVariant/2-001_Plexiform_Neurofibroma.vcf.gz ~/data/test.vcf.gz
-gunzip ~/data/test.vcf.gz
-head -n 4000 ~/data/test.vcf ~/data/test-small.vcf
+1. Prepare a CSV samplesheet according to the [format](#samplesheet) described below.
-# Install and setup Nextflow
-sudo yum install -y java
-(cd .local/bin && wget -qO- https://get.nextflow.io | bash)
-echo 'export NXF_ENABLE_SECRETS=true' >> ~/.bashrc
-source ~/.bashrc
-nextflow secrets put -n SYNAPSE_AUTH_TOKEN -v ""
+   **Example:** Stored locally as `./samplesheet.csv`
-# Stage reference files in memory
-mkdir -p /dev/shm/vep/ /dev/shm/fasta/
-sudo mount -o remount,size=30G /dev/shm # Increase /dev/shm size
-rsync -avhP $HOME/.vep/ /dev/shm/vep/
-rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/
-```
+   ```csv
+   synapse_id ,sample_parent_id ,merged_parent_id ,study_id ,variant_class ,variant_caller ,is_releasable
+   syn87654301 ,syn87654311 ,syn87654321 ,study_x ,germline ,deepvariant ,true
+   syn87654302 ,syn87654311 ,syn87654321 ,study_x ,germline ,deepvariant ,false
+   syn87654303 ,syn87654311 ,syn87654321 ,study_x ,germline ,deepvariant ,true
+   syn87654304 ,syn87654312 ,syn87654321 ,study_x ,germline ,mutect2 ,false
+   syn87654305 ,syn87654312 ,syn87654321 ,study_x ,germline ,mutect2 ,false
+   syn87654306 ,syn87654312 ,syn87654321 ,study_x ,germline ,mutect2 ,false
+   syn87654307 ,syn87654313 ,syn87654322 ,study_y ,germline ,deepvariant ,true
+   syn36245848 ,syn87654313 ,syn87654322 ,study_y ,germline ,deepvariant ,true
+   ```
-## Benchmarks
+2. Create a Nextflow secret called `SYNAPSE_AUTH_TOKEN` with a Synapse personal access token ([docs](#authentication)).
-### Ensembl VEP cache
+3. Prepare your parameters file. For more details, check out the [Parameters](#parameters) section. Only the `input` parameter is required.
-I want to see what's the fastest way of downloading the VEP cache. Based on the tests below, here are some options:
+   **Example:** Stored locally as `./params.yml`
-- Download tarball from Ensembl and extract: 16m34s (10m23s + 6m11s)
-- Download tarball from S3 and extract: 9m25s (3m14 + 6m11s)
-- Download extracted tarball from S3: 4m5
+   ```yaml
+   input: "./samplesheet.csv"
+   maf_center: "Sage Bionetworks"
+   max_subpop_af: 0.0005
+   ```
-#### Using rsync with Ensembl's servers
+4. Launch the workflow using the [Nextflow CLI](https://nextflow.io/docs/latest/cli.html#run), the [Tower CLI](https://help.tower.nf/latest/cli/), or the [Tower web UI](https://help.tower.nf/latest/launch/launchpad/).
-```console
-$ time rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./
+   **Example:** Launched using the Nextflow CLI with Docker enabled
-real 10m23.648s
-user 0m50.824s
-sys 0m44.770s
-```
+   ```console
+   nextflow run sage-bionetworks-workflows/nf-vcf2maf -params-file ./params.yml -profile docker
+   ```
-#### Using S3 with our public bucket
+5. Explore the MAF files uploaded to Synapse (using the parent IDs listed in the samplesheet).
-```console
-$ time aws --no-sign-request s3 cp s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz ./s3-homo_sapiens_vep_107_GRCh38.tar.gz
+## Authentication
-real 3m14.885s
-user 3m31.085s
-sys 4m1.697s
-```
+This workflow takes care of transferring files to and from Synapse. Hence, it requires a secret with a personal access token for authentication. To configure Nextflow with such a token, follow these steps:
-#### Extracting the tarball
+1. Generate a personal access token (PAT) on Synapse using [this dashboard](https://www.synapse.org/#!PersonalAccessTokens:). Make sure to enable the `view`, `download`, and `modify` scopes since this workflow both downloads and uploads to Synapse.
+2. Create a secret called `SYNAPSE_AUTH_TOKEN` containing a Synapse personal access token using the [Nextflow CLI](https://nextflow.io/docs/latest/secrets.html) or [Nextflow Tower](https://help.tower.nf/latest/secrets/overview/).
+3. (Tower only) When launching the workflow, include the `SYNAPSE_AUTH_TOKEN` as a pipeline secret from either your user or workspace secrets.
-```console
-$ time tar -zxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
+## Parameters
-real 6m11.087s
-user 2m59.190s
-sys 0m40.660s
-```
+Check out the [Quickstart](#quickstart) section for example parameter values. You are encouraged to read the [limitations](#known-limitations) listed below because some parameters have not been tested with non-default values.
-#### Downloading the extracted tarball
+- **`input`**: (Required) A CSV samplesheet that lists the VCF files that should be processed. See below for the [samplesheet format](#samplesheet).
+- **`max_subpop_af`**: Threshold used by vcf2maf for labeling variants with the `common_variant` filter. Specifically, the `common_variant` filter is applied to variants with an allele frequency of at least `max_subpop_af` in one or more gnomAD sub-populations ([source](https://github.com/mskcc/vcf2maf/blob/5ed414428046e71833f454d4b64da6c30362a89b/docs/vep_maf_readme.txt#L116-L120)). This filter is useful for removing false-positive somatic variants. The merged MAF files lack these common variants. Default: `0.0005`.
+- **`maf_center`**: Value used in the `Center` MAF column. Default: `"Sage Bionetworks"`.
+- **`reference_fasta`**: Reference genome FASTA file used in variant calling. Default: `"s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta"`.
+- **`reference_fasta_fai`**: Reference genome FASTA index (FAI) file. This shouldn't be needed in most cases since the workflow will automatically pick up the `.fai` file alongside the `.fasta` file. Default: `"${reference_fasta}.fai"`.
+- **`vep_tarball`**: A tarball (ideally compressed) of the VEP cache. Default: `"s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz"`.
+- **`ncbi_build`**: The NCBI genome build. Passed to `--assembly` in VEP ([source](http://Jul2022.archive.ensembl.org/info/docs/tools/vep/script/vep_options.html)). Default: `"GRCh38"`.
+- **`species`**: The species identifier. Passed to `--species` in VEP ([source](http://Jul2022.archive.ensembl.org/info/docs/tools/vep/script/vep_options.html)). Default: `"homo_sapiens"`.
+## Inputs
-```console
-time aws --no-sign-request s3 sync s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/ ./s3-107_GRCh38/
+### Samplesheet
-real 4m5.307s
-user 5m13.736s
-sys 3m24.302s
-```
+The input samplesheet should be in comma-separated values (CSV) format and contain the following columns:
-### Nextflow workflow runs
+1. **`synapse_id`**: Synapse ID of the VCF file
-#### Original (all reference files in `/dev/shm`)
+2. **`biospecimen_id`**: Biospecimen/sample identifier
+   - This value will be used to populate the `Tumor_Sample_Barcode` MAF column
+   - **Important:** This value needs to uniquely identify samples within each merged MAF file. See [below](#maf-files) for information on how MAF files are merged.
+3. **`sample_parent_id`**: Synapse ID of the folder where the individual sample MAF file will be uploaded
+   - Suggestion: The folder that contains the VCF file
+4. **`merged_parent_id`**: The Synapse ID of the folder where the merged MAF file will be uploaded
+   - Suggestion: The root folder containing the VCF files
+   - **Important:** This value should be consistent across VCF files that are expected to be merged. Otherwise, you will end up with artificially split merged MAF files. See [below](#maf-files) for information on how MAF files are merged.
+5. **`study_id`**: Study identifier
+   - Suggestion: The Synapse ID of the project representing the study if you don’t have shorthand study IDs
+6. **`variant_class`**: Whether the VCF file contains somatic or germline mutations
+   - **Valid values:** `somatic` or `germline`
+7. **`variant_caller`**: Name of the variant caller
+8. **`is_releasable`**: Whether the VCF file should be included in the merged MAF file
+   - **Valid values:** `true` or `false`
+## Outputs
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv
+### MAF Files
-real 3m43.240s
-user 0m29.439s
-sys 0m1.415s
-```
+- Individual sample MAF files
+  - Unfiltered (_i.e._ includes all variants regardless of their FILTER status, including those that weren’t deemed high-confidence by the variant caller)
+  - File naming: `${biospecimen_id}-${variant_class}-${variant_caller}.maf`
-#### VEP in shm and FASTA in S3
+- Merged study MAF files (one for every combination of `study_id`, `variant_class` and `variant_caller`)
+  - Filtered (_i.e._ restricted to “releasable” samples and variants where `FILTER == 'PASS'`, which excludes those flagged `common_variant` due to `any(gnomAD_*_AF) >= 0.0005`)
+  - File naming: `${study_id}-${variant_class}-${variant_caller}.merged.maf`
-Note that I needed to set `aws.client.anonymous = true` in the Nextflow config.
+## Known Limitations
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta
+- This workflow has only been tested with the following parameters:
+  - `vep_tarball`: Ensembl VEP 107
+  - `species`: `homo_sapiens`
+  - `ncbi_build`: `GRCh38`
+  - `reference_fasta`: GATK FASTA file
-real 4m9.529s
-user 0m56.803s
-sys 0m5.996s
-```
+## Benchmarks
-#### VEP in non-shm and FASTA in S3
+### Setup
-I'm curious about the difference in performance between SHM and non-SHM.
+The benchmarks below were performed on an EC2 instance with the following setup:
+```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/.vep/homo_sapiens/107_GRCh38/
+# Install tmux for long-running commands
+sudo yum install -y tmux
-real 3m43.073s
-user 0m27.599s
-sys 0m1.449s
-```
+# Install and setup Nextflow
+sudo yum install -y java
+(cd .local/bin && wget -qO- https://get.nextflow.io | bash)
+echo 'export NXF_ENABLE_SECRETS=true' >> ~/.bashrc
+source ~/.bashrc
+nextflow secrets put -n SYNAPSE_AUTH_TOKEN -v ""
+mkdir -p $HOME/.nextflow/
+echo 'aws.client.anonymous = true' >> $HOME/.nextflow/config
-#### VEP folder and FASTA in S3
+# Download and extract Ensembl VEP cache
+mkdir -p $HOME/ref/ $HOME/.vep/
+rsync -avr --progress rsync://ftp.ensembl.org/ensembl/pub/release-107/variation/indexed_vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz $HOME/ref/
+tar -zvxf $HOME/ref/homo_sapiens_vep_107_GRCh38.tar.gz -C $HOME/.vep/
-I had to kill this job because there was barely any CPU being used.
-It seemed stuck on staging the `107_GRCh38/` folder. Maybe Nextflow is more efficient with staging files over folders (compared to the AWS CLI).
+# Download reference genome FASTA file
+mkdir -p $HOME/ref/fasta/
+aws --no-sign-request s3 sync s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/ $HOME/ref/fasta/
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens/107_GRCh38/
+# Stage reference files in memory
+mkdir -p /dev/shm/vep/ /dev/shm/fasta/
+sudo mount -o remount,size=25G /dev/shm # Increase /dev/shm size
+rsync -avhP $HOME/.vep/ /dev/shm/vep/
+rsync -avhP $HOME/ref/fasta/ /dev/shm/fasta/
+```
-^C
-real 17m7.522s
-user 2m28.423s
-sys 0m38.765s
-```
+### Preparing the Ensembl VEP cache
-#### VEP tarball in non-shm and FASTA in S3
+#### Outside of Nextflow
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data ~/ref/homo_sapiens_vep_107_GRCh38.tar.gz
+To determine the most efficient way of preparing the VEP cache for vcf2maf, I tried different permutations of downloading the tarball or extracted folder from Ensembl or S3. Here are the individual results:
-real 8m39.798s
-user 0m29.715s
-sys 0m1.455s
-```
+- **Download tarball using rsync from Ensembl:** 10 min 23 sec
+- **Download tarball using AWS CLI from S3:** 3 min 14 sec
+- **Extract tarball using `tar` locally:** 6 min 11 sec
+- **Download extracted folder using AWS CLI from S3:** 4 min 5 sec
+Based on the above results, here are some estimated runtimes:
-#### VEP tarball and FASTA in S3
+- **Download tarball from Ensembl and extract locally:** 16 min 34 sec
+- **Download tarball from S3 and extract locally:** 9 min 25 sec
+- **Download extracted folder from S3:** 4 min 5 sec
+Based on the above estimates, downloading the extracted folder from S3 seems like the most efficient method, followed by downloading the tarball from S3 and extracting locally.
-```console
-$ time nextflow run ~/nf-vcf2maf/main.nf --input ~/data/test.csv --reference_fasta s3://sage-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta --vep_data s3://sage-igenomes/vep_cache/homo_sapiens_vep_107_GRCh38.tar.gz
+#### Within Nextflow
+After benchmarking different methods for downloading the VEP cache, I performed various tests within a Nextflow run. Note that SHM refers to files being in shared memory (_i.e._ `/dev/shm`).
-real 8m38.703s
-user 2m47.076s
-sys 0m50.287s
-```
+- **Baseline (all reference files in SHM):** 3 min 43 sec
+- **FASTA in S3 and VEP folder in SHM:** 4 min 9 sec
+- **FASTA in S3 and VEP folder in non-SHM:** 3 min 43 sec
+- **FASTA in S3 and VEP folder in S3:** Over 17 min 7 sec[^1]
+- **FASTA in S3 and VEP tarball in non-SHM:** 8 min 39 sec
+- **FASTA and VEP tarball in S3:** 8 min 38 sec
+The above results showed that downloading the tarball from S3 was the most efficient of the portable methods. While ~10 minutes is a long time to spend on preparing reference files, it's trivial compared to the actual runtimes of vcf2maf, which can reach 4-5 hours. The benefit is portability, including the ability to run this workflow on Tower.
+[^1]: While this was expected to be the most efficient method of downloading the VEP cache, I had to kill the job because it was taking so long. Perhaps the AWS Java SDK isn't as efficient as the AWS CLI for downloading a folder in S3 recursively.
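[Editor's note: the whitespace-padded samplesheet in the Quickstart above works because of the `strip:true` option added to `.splitCsv` in an earlier patch, which trims each value before `create_vcf_channel` builds the metadata map. A minimal sketch of that parsing path, with a simplified metadata map (the real function sets more fields):]

```nextflow
// Sketch: strip:true trims the padding, so "syn87654301 " arrives as
// "syn87654301"; the map step mirrors (in simplified form) what
// create_vcf_channel does with each row.
workflow {
    Channel
        .fromPath( 'samplesheet.csv' )        // the Quickstart example file
        .splitCsv( header:true, strip:true )
        .map { row ->
            def meta = [:]                    // simplified metadata map
            meta.synapse_id    = row.synapse_id
            meta.study_id      = row.study_id
            meta.is_releasable = row.is_releasable.toBoolean()
            return meta
        }
        .view()
}
```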
From ec141690f1d24d7454c529d178381176de2149b5 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:09:44 -0700
Subject: [PATCH 29/45] Fix typo in `subset_study_meta()`

---
 main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 30b2a29..6bc0052 100644
--- a/main.nf
+++ b/main.nf
@@ -258,7 +258,6 @@ def subset_study_meta(vcf_meta, maf) {
    study_meta.study_id = vcf_meta.study_id
    study_meta.variant_class = vcf_meta.variant_class
    study_meta.variant_caller = vcf_meta.variant_caller
-    [ study_meta, maf ]
-    return vcf_meta
+    return [ study_meta, maf ]

 }
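[Editor's note: the bug fixed above silently returned the per-sample metadata instead of the `[ study_meta, maf ]` pair. That pair matters because merged MAFs are produced per combination of `study_id`, `variant_class`, and `variant_caller`, so the subset map acts as the grouping key. A self-contained sketch of how the fixed function plugs into grouping — the `groupTuple` wiring and sample values are assumptions, not shown in this patch:]

```nextflow
// Sketch: Groovy maps with equal contents compare equal, so the subset
// metadata works as a grouping key for groupTuple.
def subset_study_meta_sketch(vcf_meta, maf) {
    def study_meta = [:]
    study_meta.merged_parent_id = vcf_meta.merged_parent_id
    study_meta.study_id         = vcf_meta.study_id
    study_meta.variant_class    = vcf_meta.variant_class
    study_meta.variant_caller   = vcf_meta.variant_caller
    return [ study_meta, maf ]
}

workflow {
    Channel
        .of( [ [ merged_parent_id: 'syn1', study_id: 'study_x', variant_class: 'germline', variant_caller: 'deepvariant', biospecimen_id: 's1' ], 'a.maf' ],
             [ [ merged_parent_id: 'syn1', study_id: 'study_x', variant_class: 'germline', variant_caller: 'deepvariant', biospecimen_id: 's2' ], 'b.maf' ] )
        .map { meta, maf -> subset_study_meta_sketch(meta, maf) }
        .groupTuple()   // -> [ study_meta, [a.maf, b.maf] ]
        .view()
}
```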
From c84cbbe180c5c5bde2e24f6701ec50a863c4b0f5 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:11:45 -0700
Subject: [PATCH 30/45] Shorten README title

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4812877..468423f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# nf-vcf2maf: Annotate Variants (VCF) and Generate MAF Files
+# nf-vcf2maf: Annotate VCF Files and Generate MAF Files

 ## Purpose

From ae3c0139f2771ec3a4fe21486261c890373931bf Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 10:47:19 -0700
Subject: [PATCH 31/45] Revert to comma-sep list of MAF files

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 6bc0052..784c2a7 100644
--- a/main.nf
+++ b/main.nf
@@ -130,7 +130,7 @@ process MERGE_MAFS {
    script:
    prefix = "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}"
    """
-    merge_mafs.py -d './' -o '${prefix}.merged.maf'
+    merge_mafs.py -i ${input_mafs.join(',')} -o '${prefix}.merged.maf'
    """

 }

From 13fd89b7da2f38dbd7dd5326dd2fc4373cb5e37f Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 11:00:49 -0700
Subject: [PATCH 32/45] Add quotes for input MAF files for `merge_mafs.py`

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 784c2a7..900a1be 100644
--- a/main.nf
+++ b/main.nf
@@ -130,7 +130,7 @@ process MERGE_MAFS {
    script:
    prefix = "${meta.study_id}-${meta.variant_class}-${meta.variant_caller}"
    """
-    merge_mafs.py -i ${input_mafs.join(',')} -o '${prefix}.merged.maf'
+    merge_mafs.py -i '${input_mafs.join(',')}' -o '${prefix}.merged.maf'
    """

 }
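[Editor's note: `input_mafs` here is a collection of staged paths, so `input_mafs.join(',')` renders a single comma-separated string, and the quotes added in the second patch make the shell pass that string to `merge_mafs.py` as one argument even if a stray space survives. A toy illustration of what the interpolation produces — file names are hypothetical:]

```nextflow
// Sketch: what ${input_mafs.join(',')} expands to inside a script block.
workflow {
    def input_mafs = [ 'sampleA.maf', 'sampleB.maf', 'sampleC.maf' ]
    def cmd = "merge_mafs.py -i '${input_mafs.join(',')}' -o 'study_x.merged.maf'"
    // Prints: merge_mafs.py -i 'sampleA.maf,sampleB.maf,sampleC.maf' -o 'study_x.merged.maf'
    println cmd
}
```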
From d574ee3bbe76c5c87cf2842fc413d50f3cf646ea Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 11:53:53 -0700
Subject: [PATCH 33/45] Remove `head` filtering

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 900a1be..ffb9ec8 100644
--- a/main.nf
+++ b/main.nf
@@ -74,9 +74,9 @@ process VCF2MAF {
    vep_forks = task.cpus + 2
    """
    if [[ ${input_vcf} == *.gz ]]; then
-        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
+        zcat ${input_vcf} > intermediate.vcf
    else
-        cat ${input_vcf} | head -n 10000 > intermediate.vcf
+        cat ${input_vcf} > intermediate.vcf
    fi

From e802174283299d05dd4f0ca3294f08d5a20d7461 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 12:28:24 -0700
Subject: [PATCH 34/45] Address caching issue due to spaces in file names

---
 README.md | 3 ++-
 main.nf   | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 468423f..b9889b3 100644
--- a/README.md
+++ b/README.md
@@ -73,9 +73,10 @@ Check out the [Quickstart](#quickstart) section for example parameter values. Yo

 ### Samplesheet

-The input samplesheet should be in comma-separated values (CSV) format and contain the following columns:
+The input samplesheet should be in comma-separated values (CSV) format and contain the following columns. **You should avoid using spaces or special characters in any of the columns.** Otherwise, you might run into job caching issues.

 1. **`synapse_id`**: Synapse ID of the VCF file
+   - Make sure that the Synapse account associated with the personal access token has access to all listed VCF files
 2. **`biospecimen_id`**: Biospecimen/sample identifier
    - This value will be used to populate the `Tumor_Sample_Barcode` MAF column
    - **Important:** This value needs to uniquely identify samples within each merged MAF file. See [below](#maf-files) for information on how MAF files are merged.

diff --git a/main.nf b/main.nf
index ffb9ec8..5775fb3 100644
--- a/main.nf
+++ b/main.nf
@@ -21,6 +21,7 @@ process SYNAPSE_GET {
    script:
    """
    synapse get ${synapse_id}
+    for f in *\ *; do mv "\${f}" "\${f// /_}"; done
    """

 }
@@ -74,9 +75,9 @@ process VCF2MAF {
    vep_forks = task.cpus + 2
    """
    if [[ ${input_vcf} == *.gz ]]; then
-        zcat ${input_vcf} > intermediate.vcf
+        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
    else
-        cat ${input_vcf} > intermediate.vcf
+        cat ${input_vcf} | head -n 10000 > intermediate.vcf
    fi

From 191ad6fe9b198c26ed19a54cd343af1abd698408 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 15:17:43 -0700
Subject: [PATCH 35/45] Leave basename intact in sample MAF files

---
 main.nf | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 5775fb3..6b40e52 100644
--- a/main.nf
+++ b/main.nf
@@ -71,8 +71,9 @@ process VCF2MAF {
    // TODO: Remove hard-coded VEP path
    // TODO: Handle VCF genotype columns per variant caller
    script:
-    vep_path = "/root/miniconda3/envs/vep/bin"
+    vep_path  = "/root/miniconda3/envs/vep/bin"
    vep_forks = task.cpus + 2
+    basename  = input_vcf.name.replaceAll(/.gz$/, "").replaceAll(/.vcf$/, "")
    """
    if [[ ${input_vcf} == *.gz ]]; then
        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
@@ -89,7 +90,7 @@ process VCF2MAF {

-    grep -v '^#' intermediate.maf.raw > '${meta.biospecimen_id}-${meta.variant_class}-${meta.variant_caller}.maf'
+    grep -v '^#' intermediate.maf.raw > '${basename}.maf'
    """

 }

From 57a2076647e85d2fd32ed715b2106fbcd680d14b Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 15:36:01 -0700
Subject: [PATCH 36/45] Fix bug for handling empty glob matches

---
 main.nf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 6b40e52..d9c6f36 100644
--- a/main.nf
+++ b/main.nf
@@ -8,7 +8,7 @@ process SYNAPSE_GET {

    tag "${meta.synapse_id}"

-    container "sagebionetworks/synapsepythonclient:v2.6.0"
+    container "sagebionetworks/synapsepythonclient:v2.7.0"

    secret "SYNAPSE_AUTH_TOKEN"

@@ -21,6 +21,8 @@ process SYNAPSE_GET {
    script:
    """
    synapse get ${synapse_id}
+
+    shopt -s nullglob
    for f in *\ *; do mv "\${f}" "\${f// /_}"; done
    """

 }
From 14325f1ccce5040d7870abeca29e04628ba64103 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 16:28:22 -0700
Subject: [PATCH 37/45] Move away from for-loop in bash

---
 main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index d9c6f36..08f58d9 100644
--- a/main.nf
+++ b/main.nf
@@ -22,8 +22,7 @@ process SYNAPSE_GET {
    """
    synapse get ${synapse_id}

-    shopt -s nullglob
-    for f in *\ *; do mv "\${f}" "\${f// /_}"; done
+    find . -type f -name "* *.xml" -exec bash -c 'mv "$0" "${0// /_}"' {} \;
    """

 }

From 8c369d3b9b6fb80819404de50df7483d8ddb1621 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 16:40:44 -0700
Subject: [PATCH 38/45] Escape dollar signs

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 08f58d9..3235552 100644
--- a/main.nf
+++ b/main.nf
@@ -22,7 +22,7 @@ process SYNAPSE_GET {
    """
    synapse get ${synapse_id}

-    find . -type f -name "* *.xml" -exec bash -c 'mv "$0" "${0// /_}"' {} \;
+    find . -type f -name "* *" -exec bash -c 'mv "\$0" "\${0// /_}"' {} \;
    """

 }

From 285c064fe48d8664c1b1b6df0a4c261ce7a8616b Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:00:45 -0700
Subject: [PATCH 39/45] Revert synapseclient version upgrade

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 3235552..f85887e 100644
--- a/main.nf
+++ b/main.nf
@@ -8,7 +8,7 @@ process SYNAPSE_GET {

    tag "${meta.synapse_id}"

-    container "sagebionetworks/synapsepythonclient:v2.7.0"
+    container "sagebionetworks/synapsepythonclient:v2.6.0"

    secret "SYNAPSE_AUTH_TOKEN"

From 0cb0240ec803dfd8aa9517857ebdb9902c90ba3a Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:04:54 -0700
Subject: [PATCH 40/45] Try `shell:` block instead of `script:`

---
 main.nf | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index f85887e..d709a37 100644
--- a/main.nf
+++ b/main.nf
@@ -18,12 +18,13 @@ process SYNAPSE_GET {
    output:
    tuple val(meta), path('*')

-    script:
-    """
-    synapse get ${synapse_id}
+    shell:
+    '''
+    synapse get !{synapse_id}

-    find . -type f -name "* *" -exec bash -c 'mv "\$0" "\${0// /_}"' {} \;
-    """
+    shopt -s nullglob
+    for f in *\ *; do mv "${f}" "${f// /_}"; done
+    '''

 }

From fde57c9286b836fcff2e3006268b946d14de9519 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:14:55 -0700
Subject: [PATCH 41/45] Try escaping backslash

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index d709a37..43d0a65 100644
--- a/main.nf
+++ b/main.nf
@@ -23,7 +23,7 @@ process SYNAPSE_GET {
    synapse get !{synapse_id}

    shopt -s nullglob
-    for f in *\ *; do mv "${f}" "${f// /_}"; done
+    for f in *\\ *; do mv "${f}" "${f// /_}"; done
    '''

 }

From deaa7f90aa37f3de59ae4725b62137b15994ca89 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:32:22 -0700
Subject: [PATCH 42/45] Try `script:` with backslash escape

---
 main.nf | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 43d0a65..16ec9d0 100644
--- a/main.nf
+++ b/main.nf
@@ -18,13 +18,13 @@ process SYNAPSE_GET {
    output:
    tuple val(meta), path('*')

-    shell:
-    '''
-    synapse get !{synapse_id}
+    script:
+    """
+    synapse get ${synapse_id}

    shopt -s nullglob
-    for f in *\\ *; do mv "${f}" "${f// /_}"; done
-    '''
+    for f in *\\ *; do mv "\${f}" "\${f// /_}"; done
+    """

 }
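[Editor's note on patches 37–42: inside a `script:` block the triple-double-quoted body is a Groovy GString, so a literal bash `$` must be written `\$` and a literal backslash `\\`; a `shell:` block with `'''` sidesteps the `$` collision (Nextflow variables use `!{}` instead), but Groovy still interprets backslash escapes, which is why both variants needed the doubled backslash. A hedged, standalone sketch of the final `script:` form — the process and the `touch` line are illustrative only:]

```nextflow
// Sketch: escaping inside a script block. In the Groovy string, `\\` yields a
// literal backslash and `\$` a literal dollar sign, so bash receives:
//   for f in *\ *; do mv "${f}" "${f// /_}"; done
process RENAME_SPACES {
    output:
    stdout

    script:
    """
    touch 'a file.txt'
    shopt -s nullglob
    for f in *\\ *; do mv "\${f}" "\${f// /_}"; done
    ls
    """
}

workflow {
    RENAME_SPACES() | view
}
```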
From e4b43e018d6dfd083b343856a52ba3ec4bcf6f9a Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:48:52 -0700
Subject: [PATCH 43/45] Give enough memory for the `merge_mafs` job

---
 main.nf | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/main.nf b/main.nf
index 16ec9d0..9b581c6 100644
--- a/main.nf
+++ b/main.nf
@@ -58,6 +58,8 @@ process VCF2MAF {
    cpus 6
    memory { 8.GB * task.attempt }
+
+    errorStrategy = 'retry'
    maxRetries 3

    afterScript "rm -f intermediate*"
@@ -125,6 +127,11 @@ process MERGE_MAFS {

    container "python:3.10.4"

+    memory { 16.GB * task.attempt }
+
+    errorStrategy = 'retry'
+    maxRetries 3
+
    input:
    tuple val(meta), path(input_mafs)

    output:

From 3f093d958283d583b2cd94de9056da7235c5e678 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:49:11 -0700
Subject: [PATCH 44/45] Trim trailing whitespace

---
 main.nf | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/main.nf b/main.nf
index 9b581c6..6e91a7b 100644
--- a/main.nf
+++ b/main.nf
@@ -64,12 +64,12 @@ process VCF2MAF {

    afterScript "rm -f intermediate*"

-    input: 
+    input:
    tuple val(meta), path(input_vcf)
    tuple path(reference_fasta), path(reference_fasta_fai)
    path vep_data

-    output: 
+    output:
    tuple val(meta), path("*.maf")

@@ -189,7 +189,7 @@ workflow SAMPLE_MAFS {
    sample_mafs_ch = VCF2MAF.out
        .map { meta, maf -> [ maf, meta.sample_parent_id ] }
    SYNAPSE_STORE(sample_mafs_ch)
-    
+
    emit:
    VCF2MAF.out

@@ -241,7 +241,7 @@ workflow {

 // Function to get list of [ meta, vcf ]
 def create_vcf_channel(LinkedHashMap row) {
-    
+
    // Create metadata element
    def meta = [:]
    meta.synapse_id = row.synapse_id

@@ -262,7 +262,7 @@

 // Function to get list of [ study_meta, maf ]
 def subset_study_meta(vcf_meta, maf) {
-    
+
    // Subset metadata element
    def study_meta = [:]
    study_meta.merged_parent_id = vcf_meta.merged_parent_id

From cbe08e084e3790d7bc54f56f85e312eadc08bb0d Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Fri, 16 Sep 2022 17:51:16 -0700
Subject: [PATCH 45/45] Remove `head` commands

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 6e91a7b..8cf1611 100644
--- a/main.nf
+++ b/main.nf
@@ -80,9 +80,9 @@ process VCF2MAF {
    basename = input_vcf.name.replaceAll(/.gz$/, "").replaceAll(/.vcf$/, "")
    """
    if [[ ${input_vcf} == *.gz ]]; then
-        zcat ${input_vcf} | head -n 10000 > intermediate.vcf
+        zcat ${input_vcf} > intermediate.vcf
    else
-        cat ${input_vcf} | head -n 10000 > intermediate.vcf
+        cat ${input_vcf} > intermediate.vcf
    fi

    vcf2maf.pl \