Merge pull request #22 from ghga-de/dev

Dev
ghga-de · Dec 1, 2023 · cd6082d · cd6082d
2 parents 80ff04c + b53e255
commit cd6082d
Show file tree

Hide file tree

Showing 24 changed files with 586 additions and 94 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -185,6 +185,7 @@ We thank the following people for their extensive assistance in the development
 **TODO**
 
 <!-- TODO nf-core: If applicable, make list of people who have also contributed -->
+
 Nagarajan Paramasivam @NagaComBio [email protected]
 
 ## Contributions and Support
@@ -193,7 +194,6 @@ If you would like to contribute to this pipeline, please see the [contributing g
 
 ## Citations
 
-
 <!-- If you use  nf-platypusindelcalling for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) -->
 
 <!-- TODO nf-core: Add bibliography of tools and data used in your pipeline -->

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -1,54 +1,54 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
-    "$id": "https://raw.githubusercontent.com/ghga-de/nf-platypusindelcalling/master/assets/schema_input.json",
-    "title": "nf-platypusindelcalling pipeline - params.input schema",
-    "description": "Schema for the file provided with params.input",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "sample": {
-                "type": "string",
-                "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
-            },
-            "tumor": {
-                "type": "string",
-                "pattern": "^\\S+\\.bam$",
-                "errorMessage": "BAM file for tumors must be provided'"
-            },
-            "tumor_index": {
-                "type": "string",
-                "pattern": "^\\S+\\.bai$",
-                "errorMessage": "BAI file matching to BAM for tumors must be provided'"
-            },
-            "control": {
-                "errorMessage": "BAM file for as control matching to tumor, if there is",
-                "anyOf": [
-                    {
-                        "type": "string",
-                        "pattern": "^\\S+\\.bam$"
-                    },
-                    {
-                        "type": "string",
-                        "maxLength": 0
-                    }
-                ]
-            },
-            "control_index": {
-                "errorMessage": "BAI file matching to BAM for as control matching to tumor, if there is",
-                "anyOf": [
-                    {
-                        "type": "string",
-                        "pattern": "^\\S+\\.bai$"
-                    },
-                    {
-                        "type": "string",
-                        "maxLength": 0
-                    }
-                ]
-            }
-        },
-        "required": ["sample", "tumor", "tumor_index"]
-    }
+  "$schema": "http://json-schema.org/draft-07/schema",
+  "$id": "https://raw.githubusercontent.com/ghga-de/nf-platypusindelcalling/master/assets/schema_input.json",
+  "title": "nf-platypusindelcalling pipeline - params.input schema",
+  "description": "Schema for the file provided with params.input",
+  "type": "array",
+  "items": {
+    "type": "object",
+    "properties": {
+      "sample": {
+        "type": "string",
+        "pattern": "^\\S+$",
+        "errorMessage": "Sample name must be provided and cannot contain spaces"
+      },
+      "tumor": {
+        "type": "string",
+        "pattern": "^\\S+\\.bam$",
+        "errorMessage": "BAM file for tumors must be provided"
+      },
+      "tumor_index": {
+        "type": "string",
+        "pattern": "^\\S+\\.bai$",
+        "errorMessage": "BAI file matching to BAM for tumors must be provided"
+      },
+      "control": {
+        "errorMessage": "BAM file for the control matching the tumor, if there is one",
+        "anyOf": [
+          {
+            "type": "string",
+            "pattern": "^\\S+\\.bam$"
+          },
+          {
+            "type": "string",
+            "maxLength": 0
+          }
+        ]
+      },
+      "control_index": {
+        "errorMessage": "BAI file matching the BAM file for the control, if there is one",
+        "anyOf": [
+          {
+            "type": "string",
+            "pattern": "^\\S+\\.bai$"
+          },
+          {
+            "type": "string",
+            "maxLength": 0
+          }
+        ]
+      }
+    },
+    "required": ["sample", "tumor", "tumor_index"]
+  }
 }
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -37,7 +37,7 @@ def check_samplesheet(file_in, file_out):
     """
     This function checks that the samplesheet follows the following structure:
     sample,tumor,tumor_index, control, control_index
-    sample_WithControl,tumor1.bam,tumot1.bai, control1.bam, control1.bai
+    sample_WithControl,tumor1.bam,tumor1.bai, control1.bam, control1.bai
     sample_WithoutControl,tumor2.bam,tumor2.bai,,
     For an example see:
     https://github.com/ghga-de/nf-platypusindelcalling/assets/samplesheet.csv

diff --git a/bin/vcfparser.pyc b/bin/vcfparser.pyc
diff --git a/conf/modules.config b/conf/modules.config
@@ -106,7 +106,24 @@ process {
             pattern: "*.{txt}",
             mode: params.publish_dir_mode
         ]
-    }     
+    }
+    withName: 'ENSEMBLVEP_DOWNLOAD' { // settings applied only to the ENSEMBLVEP_DOWNLOAD process
+        ext.args   = { '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' } // extra command-line flags exposed to the process as task.ext.args
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { params.outdir_cache ? "${params.outdir_cache}/": "${params.outdir}/cache/" } // falls back to <outdir>/cache/ when params.outdir_cache is not set (falsy)
+        ]
+    }
+    withName: 'ENSEMBLVEP_VEP' { // settings applied only to the ENSEMBLVEP_VEP process
+        //ext.args         ='--everything --filter_common --per_gene --total_length --offline'
+        publishDir       = [
+            [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/${meta.id}/" }, // one publish folder per meta.id under params.outdir
+                pattern: "*{gz,tbi,html}" // only files whose names end in gz, tbi or html are published
+            ]
+        ]
+    }    
 }
 //
 // Don't publish results for these processes

diff --git a/conf/test.config b/conf/test.config
@@ -34,19 +34,19 @@ params {
     skip_multiqc               = false
     min_confidence_score       = 0
 
-    // Annovar 
-    // Annovar needs to be build locally
-    buildver                   = "hg38"
-    dbtype                     = "wgEncodeGencodeCompV39"
-    segdupcol                  = "SEGDUP"
-    cytobandcol                = "CYTOBAND"
-    geneannocols               = '"ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS"'
-    annovar_path               = "/Users/w620-admin/Desktop/Workflows/Annovar/annovar_Sept2022"
-
 
     // Reference Files //
     genome                     = "GRCh38"
 
+    // Annotation with vep
+    annotation_tool            = "vep"
+    species                    = "homo_sapiens"
+    vep_cache_version          = 110
+    vep_genome                 = 'GRCh38'
+    vep_version                = '110'
+    vep_cache                  = null
+    download_cache             = false  // DO NOT Download annotation cache
+
      // Annotation files
     k_genome                   ="${projectDir}/testdata/annotation_files/kgenomes_snvindels.GRCh38.27022019.sites.test.vcf.gz"
     dbsnp_indel                ="${projectDir}/testdata/annotation_files/dbsnp_v151_GRCh38.INDEL.test.vcf.gz"
@@ -100,4 +100,8 @@ process {
         cpus            = { check_max( 2 * task.attempt, 'cpus' ) }
         memory          = { check_max( 6.GB * task.attempt, 'memory' ) }
    }
+    // Using VEP online (--database) is only recommended for test purposes with a minimal set of variants!
+    withName: 'ENSEMBLVEP_VEP' {
+        ext.args         ='--per_gene --total_length --database'
+    }
 }
diff --git a/modules/nf-core/modules/ensemblvep/Dockerfile b/modules/nf-core/modules/ensemblvep/Dockerfile
@@ -0,0 +1,31 @@
+FROM nfcore/base:1.14
+LABEL \
+    author="Maxime Garcia" \
+    description="VEP image for nf-core pipelines" \
+    maintainer="[email protected]"
+
+# Create the conda environment described by environment.yml, then drop conda's caches
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+
+# Build arguments (override with --build-arg); the conda env is expected to be named nf-core-vep-<VEP_VERSION>
+ARG GENOME=GRCh38
+ARG SPECIES=homo_sapiens
+ARG VEP_CACHE_VERSION=108
+ARG VEP_VERSION=108.2
+
+# Add conda installation dir to PATH (instead of doing 'conda activate'); ENV uses the key=value form
+ENV PATH=/opt/conda/envs/nf-core-vep-${VEP_VERSION}/bin:$PATH
+
+# Download and convert the VEP cache for the selected species/assembly into .vep
+RUN vep_install \
+    -a c \
+    -c .vep \
+    -s ${SPECIES} \
+    -y ${GENOME} \
+    --CACHE_VERSION ${VEP_CACHE_VERSION} \
+    --CONVERT \
+    --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE
+
+# Dump the details of the installed packages to a file for posterity
+RUN conda env export --name nf-core-vep-${VEP_VERSION} > nf-core-vep-${VEP_VERSION}.yml
diff --git a/modules/nf-core/modules/ensemblvep/build.sh b/modules/nf-core/modules/ensemblvep/build.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build and push all containers: one nfcore/vep image per genome listed at the bottom.
+# Usage: build_push GENOME SPECIES VEP_CACHE_VERSION VEP_VERSION
+build_push() {
+    local GENOME=$1
+    local SPECIES=$2
+    local VEP_CACHE_VERSION=$3
+    local VEP_VERSION=$4
+    local IMAGE="nfcore/vep:${VEP_VERSION}.${GENOME}"  # same tag is used for build and push
+    docker build \
+        . \
+        -t "${IMAGE}" \
+        --build-arg "GENOME=${GENOME}" \
+        --build-arg "SPECIES=${SPECIES}" \
+        --build-arg "VEP_CACHE_VERSION=${VEP_CACHE_VERSION}" \
+        --build-arg "VEP_VERSION=${VEP_VERSION}"
+
+    docker push "${IMAGE}"
+}
+
+build_push "CanFam3.1" "canis_lupus_familiaris"   "104" "108.2"
+build_push "GRCh37"    "homo_sapiens"             "108" "108.2"
+build_push "GRCh38"    "homo_sapiens"             "108" "108.2"
+build_push "GRCm38"    "mus_musculus"             "102" "108.2"
+build_push "GRCm39"    "mus_musculus"             "108" "108.2"
+build_push "R64-1-1"   "saccharomyces_cerevisiae" "108" "108.2"
+build_push "UMD3.1"    "bos_taurus"               "94"  "108.2"
+build_push "WBcel235"  "caenorhabditis_elegans"   "108" "108.2"
diff --git a/modules/nf-core/modules/ensemblvep/download/environment.yml b/modules/nf-core/modules/ensemblvep/download/environment.yml
@@ -0,0 +1,7 @@
+name: ensemblvep_download
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::ensembl-vep=110.0
diff --git a/modules/nf-core/modules/ensemblvep/download/main.nf b/modules/nf-core/modules/ensemblvep/download/main.nf
@@ -0,0 +1,45 @@
+process ENSEMBLVEP_DOWNLOAD { // Runs vep_install to fetch a VEP cache into ./vep_cache and records the VEP version
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml" // conda environment file shipped next to this module
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ensembl-vep:110.0--pl5321h2a3209d_0' :
+        'quay.io/biocontainers/ensembl-vep:110.0--pl5321h2a3209d_0' }"
+
+    input:
+    tuple val(meta), path(x) // meta.id is used for the tag; x is staged but not referenced by the script below
+
+    output:
+    path("vep_cache")      , emit: cache // directory given to vep_install via --CACHEDIR
+    path "versions.yml"    , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless ext.when is set to a falsy value
+
+    script:
+    def args = task.ext.args ?: '' // extra vep_install flags from config; empty string when unset
+    """
+    vep_install \\
+        --CACHEDIR vep_cache \\
+        --SPECIES $params.species \\
+        --ASSEMBLY $params.vep_genome \\
+        --CACHE_VERSION $params.vep_cache_version \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub: // creates an empty vep_cache directory; still calls vep --help to fill versions.yml
+    """
+    mkdir vep_cache
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/ensemblvep/download/meta.yml b/modules/nf-core/modules/ensemblvep/download/meta.yml
@@ -0,0 +1,45 @@
+name: ensemblvep_download
+description: Ensembl Variant Effect Predictor (VEP). The cache downloading options are controlled through `task.ext.args`.
+keywords:
+  - annotation
+  - cache
+  - download
+tools:
+  - ensemblvep:
+      description: |
+        VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs
+        or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions.
+      homepage: https://www.ensembl.org/info/docs/tools/vep/index.html
+      documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html
+      licence: ["Apache-2.0"]
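+input: # NOTE(review): main.nf declares only `tuple val(meta), path(x)`; assembly/species/cache_version are read from params there — confirm and align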
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - assembly:
+      type: string
+      description: |
+        Genome assembly
+  - species:
+      type: string
+      description: |
+        Species
+  - cache_version:
+      type: string
+      description: |
+        cache version
+output:
+  - cache:
+      type: file
+      description: cache
+      pattern: "*"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@maxulysse"
+maintainers:
+  - "@maxulysse"
diff --git a/modules/nf-core/modules/ensemblvep/environment.yml b/modules/nf-core/modules/ensemblvep/environment.yml
@@ -0,0 +1,10 @@
+# You can use this file to create a conda environment for this module:
+#   conda env create -f environment.yml
+name: nf-core-vep-108.2
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  - bioconda::ensembl-vep=108.2
diff --git a/modules/nf-core/modules/ensemblvep/filtervep/environment.yml b/modules/nf-core/modules/ensemblvep/filtervep/environment.yml
@@ -0,0 +1,7 @@
+name: ensemblvep_filtervep
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::ensembl-vep=110.0