Skip to content

Commit

Permalink
Merge pull request #22 from ghga-de/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
kubranarci authored Dec 1, 2023
2 parents 80ff04c + b53e255 commit cd6082d
Show file tree
Hide file tree
Showing 24 changed files with 586 additions and 94 deletions.
Binary file added .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ We thank the following people for their extensive assistance in the development
**TODO**

<!-- TODO nf-core: If applicable, make list of people who have also contributed -->

Nagarajan Paramasivam @NagaComBio [email protected]

## Contributions and Support
Expand All @@ -193,7 +194,6 @@ If you would like to contribute to this pipeline, please see the [contributing g

## Citations


<!-- If you use nf-platypusindelcalling for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) -->

<!-- TODO nf-core: Add bibliography of tools and data used in your pipeline -->
Expand Down
104 changes: 52 additions & 52 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
@@ -1,54 +1,54 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/ghga-de/nf-platypusindelcalling/master/assets/schema_input.json",
"title": "nf-platypusindelcalling pipeline - params.input schema",
"description": "Schema for the file provided with params.input",
"type": "array",
"items": {
"type": "object",
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"tumor": {
"type": "string",
"pattern": "^\\S+\\.bam$",
"errorMessage": "BAM file for tumors must be provided'"
},
"tumor_index": {
"type": "string",
"pattern": "^\\S+\\.bai$",
"errorMessage": "BAI file matching to BAM for tumors must be provided'"
},
"control": {
"errorMessage": "BAM file for as control matching to tumor, if there is",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.bam$"
},
{
"type": "string",
"maxLength": 0
}
]
},
"control_index": {
"errorMessage": "BAI file matching to BAM for as control matching to tumor, if there is",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.bai$"
},
{
"type": "string",
"maxLength": 0
}
]
}
},
"required": ["sample", "tumor", "tumor_index"]
}
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/ghga-de/nf-platypusindelcalling/master/assets/schema_input.json",
"title": "nf-platypusindelcalling pipeline - params.input schema",
"description": "Schema for the file provided with params.input",
"type": "array",
"items": {
"type": "object",
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"tumor": {
"type": "string",
"pattern": "^\\S+\\.bam$",
"errorMessage": "BAM file for tumors must be provided"
},
"tumor_index": {
"type": "string",
"pattern": "^\\S+\\.bai$",
"errorMessage": "BAI file matching to BAM for tumors must be provided"
},
"control": {
"errorMessage": "BAM file for the control matching the tumor, if there is one",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.bam$"
},
{
"type": "string",
"maxLength": 0
}
]
},
"control_index": {
"errorMessage": "BAI file matching the control BAM, if there is one",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.bai$"
},
{
"type": "string",
"maxLength": 0
}
]
}
},
"required": ["sample", "tumor", "tumor_index"]
}
}
2 changes: 1 addition & 1 deletion bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:
sample,tumor,tumor_index, control, control_index
sample_WithControl,tumor1.bam,tumot1.bai, control1.bam, control1.bai
sample_WithControl,tumor1.bam,tumor1.bai, control1.bam, control1.bai
sample_WithoutControl,tumor2.bam,tumor2.bai,,
For an example see:
https://github.com/ghga-de/nf-platypusindelcalling/assets/samplesheet.csv
Expand Down
Binary file modified bin/vcfparser.pyc
Binary file not shown.
19 changes: 18 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,24 @@ process {
pattern: "*.{txt}",
mode: params.publish_dir_mode
]
}
}
// Settings for the VEP cache download process.
withName: 'ENSEMBLVEP_DOWNLOAD' {
// Extra command-line flags handed to the process through task.ext.args.
ext.args = { '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' }
publishDir = [
mode: params.publish_dir_mode,
// Publish to params.outdir_cache when it is set, otherwise to "<outdir>/cache/".
path: { params.outdir_cache ? "${params.outdir_cache}/": "${params.outdir}/cache/" }
]
}
// Settings for the VEP annotation process.
withName: 'ENSEMBLVEP_VEP' {
// ext.args is not set here; the line below is a disabled example of VEP options.
//ext.args ='--everything --filter_common --per_gene --total_length --offline'
publishDir = [
[
mode: params.publish_dir_mode,
// Results are published into a per-sample directory keyed by meta.id.
path: { "${params.outdir}/${meta.id}/" },
// Only files whose names end in gz, tbi or html are published.
pattern: "*{gz,tbi,html}"
]
]
}
}
//
// Don't publish results for these processes
Expand Down
22 changes: 13 additions & 9 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@ params {
skip_multiqc = false
min_confidence_score = 0

// Annovar
// Annovar needs to be built locally
buildver = "hg38"
dbtype = "wgEncodeGencodeCompV39"
segdupcol = "SEGDUP"
cytobandcol = "CYTOBAND"
geneannocols = '"ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS"'
annovar_path = "/Users/w620-admin/Desktop/Workflows/Annovar/annovar_Sept2022"


// Reference Files //
genome = "GRCh38"

// Annotation with vep
annotation_tool = "vep"
species = "homo_sapiens"
vep_cache_version = 110
vep_genome = 'GRCh38'
vep_version = '110'
vep_cache = null
download_cache = false // DO NOT Download annotation cache

// Annotation files
k_genome ="${projectDir}/testdata/annotation_files/kgenomes_snvindels.GRCh38.27022019.sites.test.vcf.gz"
dbsnp_indel ="${projectDir}/testdata/annotation_files/dbsnp_v151_GRCh38.INDEL.test.vcf.gz"
Expand Down Expand Up @@ -100,4 +100,8 @@ process {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
}
// using vep online is only recommended for test purposes for a minimal set of variants!
withName: 'ENSEMBLVEP_VEP' {
ext.args ='--per_gene --total_length --database'
}
}
31 changes: 31 additions & 0 deletions modules/nf-core/modules/ensemblvep/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
FROM nfcore/base:1.14
LABEL \
    author="Maxime Garcia" \
    description="VEP image for nf-core pipelines" \
    maintainer="[email protected]"

# Install the conda environment described by environment.yml
COPY environment.yml /
RUN conda env create -f /environment.yml && conda clean -a

# Default build arguments; each can be overridden with `docker build --build-arg`.
ARG GENOME=GRCh38
ARG SPECIES=homo_sapiens
ARG VEP_CACHE_VERSION=108
ARG VEP_VERSION=108.2

# Add conda installation dir to PATH (instead of doing 'conda activate').
# The key=value form of ENV is used; the space-separated `ENV key value`
# form is legacy syntax that Docker discourages.
ENV PATH=/opt/conda/envs/nf-core-vep-${VEP_VERSION}/bin:$PATH

# Download the VEP cache for the selected species/assembly into .vep
RUN vep_install \
    -a c \
    -c .vep \
    -s ${SPECIES} \
    -y ${GENOME} \
    --CACHE_VERSION ${VEP_CACHE_VERSION} \
    --CONVERT \
    --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE

# Dump the details of the installed packages to a file for posterity
RUN conda env export --name nf-core-vep-${VEP_VERSION} > nf-core-vep-${VEP_VERSION}.yml
30 changes: 30 additions & 0 deletions modules/nf-core/modules/ensemblvep/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
set -euo pipefail

# Build and push all containers

# Build the VEP image for one genome/species combination and push it.
# Arguments:
#   $1 - genome assembly; part of the image tag and the GENOME build arg
#   $2 - species name; the SPECIES build arg
#   $3 - VEP cache version; the VEP_CACHE_VERSION build arg
#   $4 - VEP version; part of the image tag and the VEP_VERSION build arg
build_push() {
    # `local` keeps the parameters from leaking into the calling scope.
    local GENOME=$1
    local SPECIES=$2
    local VEP_CACHE_VERSION=$3
    local VEP_VERSION=$4
    # Single source of truth for the tag used by both build and push.
    local image="nfcore/vep:${VEP_VERSION}.${GENOME}"

    # All expansions are quoted so values are never word-split or globbed.
    docker build \
        . \
        -t "${image}" \
        --build-arg "GENOME=${GENOME}" \
        --build-arg "SPECIES=${SPECIES}" \
        --build-arg "VEP_CACHE_VERSION=${VEP_CACHE_VERSION}" \
        --build-arg "VEP_VERSION=${VEP_VERSION}"

    docker push "${image}"
}

# One row per image: "<genome> <species> <cache version> <vep version>".
# Rows are processed in order, one build_push call per row.
vep_targets=(
    "CanFam3.1 canis_lupus_familiaris 104 108.2"
    "GRCh37 homo_sapiens 108 108.2"
    "GRCh38 homo_sapiens 108 108.2"
    "GRCm38 mus_musculus 102 108.2"
    "GRCm39 mus_musculus 108 108.2"
    "R64-1-1 saccharomyces_cerevisiae 108 108.2"
    "UMD3.1 bos_taurus 94 108.2"
    "WBcel235 caenorhabditis_elegans 108 108.2"
)

for vep_target in "${vep_targets[@]}"; do
    # Split the row into its four whitespace-separated fields.
    read -r target_genome target_species target_cache target_vep <<<"${vep_target}"
    build_push "${target_genome}" "${target_species}" "${target_cache}" "${target_vep}"
done
7 changes: 7 additions & 0 deletions modules/nf-core/modules/ensemblvep/download/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: ensemblvep_download
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bioconda::ensembl-vep=110.0
45 changes: 45 additions & 0 deletions modules/nf-core/modules/ensemblvep/download/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Download an Ensembl VEP cache into `vep_cache` with `vep_install`.
// Species, assembly and cache version are read from pipeline params
// (params.species, params.vep_genome, params.vep_cache_version); any extra
// installer flags are supplied through `task.ext.args`.
process ENSEMBLVEP_DOWNLOAD {
    tag "$meta.id"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/ensembl-vep:110.0--pl5321h2a3209d_0' :
        'quay.io/biocontainers/ensembl-vep:110.0--pl5321h2a3209d_0' }"

    input:
    // `x` is declared as an input path but is not referenced by the script below.
    tuple val(meta), path(x)

    output:
    path("vep_cache")  , emit: cache
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Optional extra flags for vep_install; empty string when none are configured.
    def extra_args = task.ext.args ?: ''
    """
    vep_install \\
        --CACHEDIR vep_cache \\
        --SPECIES $params.species \\
        --ASSEMBLY $params.vep_genome \\
        --CACHE_VERSION $params.vep_cache_version \\
        $extra_args
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//')
    END_VERSIONS
    """

    stub:
    // Stub run: create an empty cache directory and still record the tool version.
    """
    mkdir vep_cache
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//')
    END_VERSIONS
    """
}
45 changes: 45 additions & 0 deletions modules/nf-core/modules/ensemblvep/download/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: ensemblvep_download
description: Ensembl Variant Effect Predictor (VEP). The cache downloading options are controlled through `task.ext.args`.
keywords:
- annotation
- cache
- download
tools:
- ensemblvep:
description: |
VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs
or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions.
homepage: https://www.ensembl.org/info/docs/tools/vep/index.html
documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html
licence: ["Apache-2.0"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- assembly:
type: string
description: |
Genome assembly
- species:
type: string
description: |
Species
- cache_version:
type: string
description: |
cache version
output:
- cache:
type: file
description: cache
pattern: "*"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@maxulysse"
maintainers:
- "@maxulysse"
10 changes: 10 additions & 0 deletions modules/nf-core/modules/ensemblvep/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# You can use this file to create a conda environment for this module:
# conda env create -f environment.yml
name: nf-core-vep-108.2
channels:
- conda-forge
- bioconda
- defaults

dependencies:
- bioconda::ensembl-vep=108.2
7 changes: 7 additions & 0 deletions modules/nf-core/modules/ensemblvep/filtervep/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: ensemblvep_filtervep
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bioconda::ensembl-vep=110.0
Loading

0 comments on commit cd6082d

Please sign in to comment.