Merge pull request #27 from ghga-de/dev

Dev
ghga-de · Dec 13, 2023 · c3bbcf0 · c3bbcf0
2 parents cd6082d + 84fbfdf
commit c3bbcf0
Show file tree

Hide file tree

Showing 25 changed files with 265 additions and 248 deletions.
diff --git a/README.md b/README.md
@@ -34,6 +34,8 @@ The pipeline has 6 main steps: Indel calling using platypus, basic annotations,
    ANNOVAR ([`Annovar`](https://annovar.openbioinformatics.org/en/latest/))
    : annotate_variation.pl is used to annotate variants. The tool makes classifications for intergenic, intogenic, nonsynoymous SNP, frameshift deletion or large-scale duplication regions.
 
+   ENSEMBL VEP ([`ENSEMBL VEP`](https://www.ensembl.org/info/docs/tools/vep/index.html)): can be used as an alternative to ANNOVAR. Gene annotations will be extracted.
+
    Reliability and confidation annotations: It is an optional ste for mapability, hiseq, selfchain and repeat regions checks for reliability and confidence of those scores.
 
 3. Deep Annotation (--runIndelDeepAnnotation True):
@@ -68,6 +70,20 @@ The pipeline has 6 main steps: Indel calling using platypus, basic annotations,
 annotate_variation.pl -downdb wgEncodeGencodeBasicV19 humandb/ -build hg19
 ```
 
+Gene annotation is also possible with the ENSEMBL VEP tool. For test purposes only, it can be used online; for large analyses, it is recommended to either download the cache file or use the --download_cache flag in the parameters.
+
+Follow the documentation [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#cache)
+
+Example:
+
+Download [cache](https://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/)
+
+```console
+cd $HOME/.vep
+curl -O https://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_GRCh38.tar.gz
+tar xzf homo_sapiens_vep_110_GRCh38.tar.gz
+```
+
 4. Download the pipeline and test it on a minimal dataset with a single command:
 
    ```console
@@ -121,6 +137,8 @@ Note that some form of configuration will be needed so that Nextflow knows how t
 Annotations are optional for the user.
 All VCF and BED files need to be indexed with tabix and should be in the same folder!
 
+The reference set bundle used in the PCAWG study can be downloaded [here](https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/pcawg-dkfz/dkfz-workflow-dependencies_150318_0951.tar.gz). (NOTE: only available for hg19)
+
 **Basic Annotation Files**
 
 - dbSNP INDELs (vcf)
@@ -176,18 +194,22 @@ Please read [usage](https://github.com/ghga-de/nf-platypusindelcalling/blob/main
 
 ## Credits
 
-nf-platypusindelcalling was originally written by Kuebra Narci [email protected].
+nf-platypusindelcalling was originally translated from a Roddy-based pipeline by Kuebra Narci [email protected].
 
 The pipeline is originally written in workflow management language Roddy. [Inspired github page](https://github.com/DKFZ-ODCF/IndelCallingWorkflow)
 
+The indel calling workflow was used in the Pan-Cancer Analysis of Whole Genomes (PCAWG) study and can be cited with the following publication:
+
+Pan-cancer analysis of whole genomes. The ICGC/TCGA Pan-Cancer Analysis of Whole Genomes Consortium. Nature volume 578, pages 82–93 (2020). DOI 10.1038/s41586-020-1969-6
+
 We thank the following people for their extensive assistance in the development of this pipeline:
 
+- Nagarajan Paramasivam (@NagaComBio) [email protected]
+
 **TODO**
 
 <!-- TODO nf-core: If applicable, make list of people who have also contributed -->
 
-Nagarajan Paramasivam @NagaComBio [email protected]
-
 ## Contributions and Support
 
 If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).

diff --git a/assets/test.csv b/assets/test.csv
diff --git a/bin/vcfparser.pyc b/bin/vcfparser.pyc
diff --git a/conf/dkfz_cluster_38.config b/conf/dkfz_cluster_38.config
@@ -43,6 +43,7 @@ params {
 
     // Annovar 
     // Annovar needs to be build locally
+    annotation_tool            = "annovar"
     buildver                   = "hg38"
     dbtype                     = "wgEncodeGencodeCompV39"
     segdupcol                  = "SEGDUP"

diff --git a/conf/test.config b/conf/test.config
@@ -40,10 +40,8 @@ params {
 
     // Annotation with vep
     annotation_tool            = "vep"
-    species                    = "homo_sapiens"
     vep_cache_version          = 110
     vep_genome                 = 'GRCh38'
-    vep_version                = '110'
     vep_cache                  = null
     download_cache             = false  // DO NOT Download annotation cache
 
@@ -102,6 +100,6 @@ process {
    }
    // using vep online is only recommended for test purposes for a minimal set of variants!
     withName: 'ENSEMBLVEP_VEP' {
-        ext.args         ='--per_gene --total_length --database'
+        ext.args         ='--per_gene --total_length'
     }
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -36,6 +36,8 @@ NOTE: this workflow is configured to use both igenomes and refgenie. **genome**
 
 **Annotation Step:**
 
+NOTE: The reference set bundle used in the PCAWG study can be downloaded [here](https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/pcawg-dkfz/dkfz-workflow-dependencies_150318_0951.tar.gz). (only available for hg19)
+
 If --runIndelAnnotation is true, the following files must be defined (with corresponding indexes):
 
 **1. annotate.vcf Options:**

diff --git a/modules/local/annotate_vcf.nf b/modules/local/annotate_vcf.nf
@@ -8,15 +8,15 @@ process ANNOTATE_VCF {
     'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:
-    tuple val(meta)            , file(vcf)     , file(vcf_tbi)
-    tuple file(kgenome)        , file(kgenome_i)
-    tuple file(dbsnpindel)     , file(dbsnpindel_i)
-    tuple file(exac)           , file(exac_i)
-    tuple file(evs)            , file(evs_i)
-    tuple file(localcontrolwgs), file(localcontrolwgs_i)
-    tuple file(localcontrolwes), file(localcontrolwes_i)
-    tuple file(gnomadgenomes)  , file(gnomadgenomes_i)
-    tuple file(gnomadexomes)   , file(gnomadexomes_i)
+    tuple val(meta), path(vcf), path(vcf_tbi)
+    tuple path(kgenome),path(kgenome_i)
+    tuple path(dbsnpindel),path(dbsnpindel_i)
+    tuple path(exac),path(exac_i)
+    tuple path(evs),path(evs_i)
+    tuple path(localcontrolwgs),path(localcontrolwgs_i)
+    tuple path(localcontrolwes),path(localcontrolwes_i)
+    tuple path(gnomadgenomes),path(gnomadgenomes_i)
+    tuple path(gnomadexomes),path(gnomadexomes_i)
     val (chrprefix)
 
     output:
@@ -30,14 +30,14 @@ process ANNOTATE_VCF {
     script:
     def args        = task.ext.args ?: ''
     def prefix      = task.ext.prefix ?: "${meta.id}"
-    def pipe  = [dbsnpindel.baseName !='input' ? " | annotate_vcf.pl -a - -b ${dbsnpindel} --columnName='DBSNP' --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${params.padding} --minOverlapFraction=${params.minoverlapfraction} --maxBorderDistanceSum=${params.maxborderdist} --maxNrOfMatches=${params.maxmatches}" : '',
-                kgenome.baseName !='input' ? " | annotate_vcf.pl -a - -b ${kgenome} --columnName='1K_GENOMES' --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${params.padding} --minOverlapFraction=${params.minoverlapfraction} --maxBorderDistanceSum=${params.maxborderdist} --maxNrOfMatches=${params.maxmatches}" : '',
-                exac.baseName !='input' ? " | annotate_vcf.pl -a - -b ${exac} --columnName='ExAC' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
-                evs.baseName !='input' ? " | annotate_vcf.pl -a - -b ${evs} --columnName='EVS' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
-                localcontrolwgs.baseName !='input' ? " | annotate_vcf.pl -a - -b ${localcontrolwgs} --columnName='LocalControlAF_WGS' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
-                localcontrolwes.baseName !='input' ? " | annotate_vcf.pl -a - -b ${localcontrolwes} --columnName='LocalControlAF_WES' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
-                gnomadgenomes.baseName !='input' ? " | annotate_vcf.pl -a - -b ${gnomadgenomes} --columnName='GNOMAD_GENOMES' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
-                gnomadexomes.baseName !='input' ? " | annotate_vcf.pl -a - -b ${gnomadexomes} --columnName='GNOMAD_EXOMES' --bFileType vcf --reportLevel 4 --reportMatchType" : ''
+    def pipe  = [dbsnpindel ? " | annotate_vcf.pl -a - -b ${dbsnpindel} --columnName='DBSNP' --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${params.padding} --minOverlapFraction=${params.minoverlapfraction} --maxBorderDistanceSum=${params.maxborderdist} --maxNrOfMatches=${params.maxmatches}" : '',
+                kgenome ? " | annotate_vcf.pl -a - -b ${kgenome} --columnName='1K_GENOMES' --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${params.padding} --minOverlapFraction=${params.minoverlapfraction} --maxBorderDistanceSum=${params.maxborderdist} --maxNrOfMatches=${params.maxmatches}" : '',
+                exac ? " | annotate_vcf.pl -a - -b ${exac} --columnName='ExAC' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
+                evs ? " | annotate_vcf.pl -a - -b ${evs} --columnName='EVS' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
+                localcontrolwgs ? " | annotate_vcf.pl -a - -b ${localcontrolwgs} --columnName='LocalControlAF_WGS' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
+                localcontrolwes ? " | annotate_vcf.pl -a - -b ${localcontrolwes} --columnName='LocalControlAF_WES' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
+                gnomadgenomes ? " | annotate_vcf.pl -a - -b ${gnomadgenomes} --columnName='GNOMAD_GENOMES' --bFileType vcf --reportLevel 4 --reportMatchType" : '',
+                gnomadexomes ? " | annotate_vcf.pl -a - -b ${gnomadexomes} --columnName='GNOMAD_EXOMES' --bFileType vcf --reportLevel 4 --reportMatchType" : ''
                 ].join(' ').trim()
 
     """

diff --git a/modules/local/annotation_pipes.nf b/modules/local/annotation_pipes.nf
@@ -8,19 +8,19 @@ process ANNOTATION_PIPES {
     'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:
-    tuple val(meta)           , file(vcf)             , file(vcf_tbi)
-    tuple file(enchangers)    , file(enchangers_i)
-    tuple file(cpgislands)    , file(cpgislands_i)
-    tuple file(tfbscons)      , file(tfbscons_i)
-    tuple file(encode_dnase)  , file(encode_dnase_i)
-    tuple file(mirnas_snornas), file(mirnas_snornas_i)
-    tuple file(cosmic)        , file(cosmic_i)
-    tuple file(mirbase)       , file(mirbase_i)
-    tuple file(mir_targets)   , file(mir_targets_i)
-    tuple file(cgi_mountains) , file(cgi_mountains_i)
-    tuple file(phastconselem) , file(phastconselem_i)
-    tuple file(encode_tfbs)   , file(encode_tfbs_i)
-    tuple file(mirnas_sncrnas), file(mirnas_sncrnas_i)
+    tuple val(meta), path(vcf), path(vcf_tbi)
+    tuple path(enchangers),path(enchangers_i)
+    tuple path(cpgislands),path(cpgislands_i)
+    tuple path(tfbscons),path(tfbscons_i)
+    tuple path(encode_dnase),path(encode_dnase_i)
+    tuple path(mirnas_snornas),path(mirnas_snornas_i)
+    tuple path(cosmic),file(cosmic_i)
+    tuple path(mirbase),path(mirbase_i)
+    tuple path(mir_targets),path(mir_targets_i)
+    tuple path(cgi_mountains),path(cgi_mountains_i)
+    tuple path(phastconselem),path(phastconselem_i)
+    tuple path(encode_tfbs),path(encode_tfbs_i)
+    tuple path(mirnas_sncrnas),path(mirnas_sncrnas_i)
 
     output:
     tuple val(meta), path('*.deepanno.vcf.gz'), path('*.deepanno.vcf.gz.tbi') , emit: vcf
@@ -33,18 +33,18 @@ process ANNOTATION_PIPES {
     def args   = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
-    def pipe  = [enchangers.baseName !='input' ? " | annotate_vcf.pl -a - -b ${enchangers} --bFileType=bed --columnName='Enhancers'" : '',
-                cpgislands.baseName !='input' ? " | annotate_vcf.pl -a - -b ${cpgislands} --bFileType=bed --columnName='CpGislands'" : '',
-                tfbscons.baseName !='input' ? " | annotate_vcf.pl -a - -b ${tfbscons} --bFileType=bed --columnName='TFBScons'" : '',
-                mirnas_snornas.baseName !='input' ? " | annotate_vcf.pl -a - -b ${mirnas_snornas} --bFileType=bed --columnName='miRNAs_snoRNAs'" : '',
-                encode_dnase.baseName !='input' ? " | annotate_vcf.pl -a - -b ${encode_dnase} --bFileType=bed --columnName='ENCODE_DNASE'" : '',
-                mirbase.baseName !='input' ? " | annotate_vcf.pl -a - -b ${mirbase} --bFileType=bed --columnName='miRBase18'" : '',
-                cosmic.baseName !='input' ? " | annotate_vcf.pl -a - -b ${cosmic} --bFileType=bed --columnName='COSMIC' --bAdditionalColumns=7,8,9 --reportLevel=1" : '',
-                mir_targets.baseName !='input' ? " | annotate_vcf.pl -a - -b ${mir_targets} --columnName='miRNAtargets'" : '' ,
-                cgi_mountains.baseName !='input' ? " | annotate_vcf.pl -a - -b ${cgi_mountains} --bFileType=bed --columnName='CgiMountains' --bAdditionalColumns=4" : '',
-                phastconselem.baseName !='input' ? " | annotate_vcf.pl -a - -b ${phastconselem} --bFileType=bed --columnName='phastConsElem20bp' --bAdditionalColumns=4" : '',
-                encode_tfbs.baseName !='input' ? " | annotate_vcf.pl -a - -b ${encode_tfbs} --columnName='ENCODE_TFBS'" : '',
-                mirnas_sncrnas.baseName !='input' ? " | annotate_vcf.pl -a - -b ${mirnas_sncrnas} --bFileType=bed --columnName='miRNAs_sncRNAs'" : ''
+    def pipe  = [enchangers ? " | annotate_vcf.pl -a - -b ${enchangers} --bFileType=bed --columnName='Enhancers'" : '',
+                cpgislands ? " | annotate_vcf.pl -a - -b ${cpgislands} --bFileType=bed --columnName='CpGislands'" : '',
+                tfbscons ? " | annotate_vcf.pl -a - -b ${tfbscons} --bFileType=bed --columnName='TFBScons'" : '',
+                mirnas_snornas ? " | annotate_vcf.pl -a - -b ${mirnas_snornas} --bFileType=bed --columnName='miRNAs_snoRNAs'" : '',
+                encode_dnase ? " | annotate_vcf.pl -a - -b ${encode_dnase} --bFileType=bed --columnName='ENCODE_DNASE'" : '',
+                mirbase ? " | annotate_vcf.pl -a - -b ${mirbase} --bFileType=bed --columnName='miRBase18'" : '',
+                cosmic ? " | annotate_vcf.pl -a - -b ${cosmic} --bFileType=bed --columnName='COSMIC' --bAdditionalColumns=7,8,9 --reportLevel=1" : '',
+                mir_targets ? " | annotate_vcf.pl -a - -b ${mir_targets} --columnName='miRNAtargets'" : '' ,
+                cgi_mountains ? " | annotate_vcf.pl -a - -b ${cgi_mountains} --bFileType=bed --columnName='CgiMountains' --bAdditionalColumns=4" : '',
+                phastconselem ? " | annotate_vcf.pl -a - -b ${phastconselem} --bFileType=bed --columnName='phastConsElem20bp' --bAdditionalColumns=4" : '',
+                encode_tfbs ? " | annotate_vcf.pl -a - -b ${encode_tfbs} --columnName='ENCODE_TFBS'" : '',
+                mirnas_sncrnas ? " | annotate_vcf.pl -a - -b ${mirnas_sncrnas} --bFileType=bed --columnName='miRNAs_sncRNAs'" : ''
                 ].join(' ').trim() 
     """
     zcat < $vcf $pipe > ${prefix}.deepanno.vcf

diff --git a/modules/local/annovar.nf b/modules/local/annovar.nf
@@ -1,6 +1,5 @@
 //# Gene annotation with annovar
 // PROCESS ANNOVAR table_annovar
-// working database is annovar_Feb2016
 
 process ANNOVAR {
     tag "$meta.id"
@@ -11,8 +10,8 @@ process ANNOVAR {
     'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:          
-    tuple val(meta)         , file(ch_vcf),  file(annovar_bed)
-    each file(annovar_table)
+    tuple val(meta)    , path(ch_vcf),  path(annovar_bed)
+    path(annovar_table)
     val(chrprefix)
 
     output:

diff --git a/modules/local/check_if_corrupted.nf b/modules/local/check_if_corrupted.nf
@@ -7,7 +7,7 @@ process CHECK_IF_CORRUPTED {
         'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:
-    tuple val(meta), file(vcf)
+    tuple val(meta), path(vcf)
 
     output:
     tuple val(meta),path("*.raw.vcf.gz"), path("*.raw.vcf.gz.tbi"), emit: vcf

diff --git a/modules/local/confidence_annotation.nf b/modules/local/confidence_annotation.nf
@@ -7,7 +7,7 @@ process CONFIDENCE_ANNOTATION {
     'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:
-    tuple val(meta), val(tumorname), val(controlname), file(vcfgz), file(vcf_tbi)
+    tuple val(meta), val(tumorname), val(controlname), path(vcfgz), path(vcf_tbi)
     val(ref_type)
 
     output:

diff --git a/modules/local/filter_by_crit.nf b/modules/local/filter_by_crit.nf
@@ -8,7 +8,7 @@ process FILTER_BY_CRIT {
     'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:
-    tuple val(meta), file(vcfgz), file(vcf_tbi)
+    tuple val(meta), path(vcfgz), path(vcf_tbi)
 
     output:
     tuple val(meta), path('*Filtered.vcf.gz'),  path('*Filtered.vcf.gz.tbi')   , emit: vcf

diff --git a/modules/local/indel_extraction.nf b/modules/local/indel_extraction.nf
@@ -8,7 +8,7 @@ process INDEL_EXTRACTION {
     'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
 
     input:
-    tuple val(meta), file(ch_vcf), file(ch_vcf_i)
+    tuple val(meta), path(ch_vcf), path(ch_vcf_i)
 
     output:
     tuple val(meta), path('indel_*_somatic_functional_indels_conf_*_to_10.vcf')          , emit: somatic_functional

diff --git a/modules/local/indel_json.nf b/modules/local/indel_json.nf
@@ -10,7 +10,7 @@ process INDEL_JSON {
 
 
     input:
-    tuple val(meta), file(vcf)
+    tuple val(meta), path(vcf)
 
     output:
     path('*.indel.json')                       , emit: json