Implement nf-test for rank variants (#435)

* wip - unstable genmod output
* Add changelog
* review suggestions
* allow empty vep_plugin_files
genomic-medicine-sweden · Oct 24, 2024 · 2caa215 · 2caa215
1 parent 49fa8e7
commit 2caa215
Show file tree

Hide file tree

Showing 12 changed files with 460 additions and 84 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -38,6 +38,7 @@ jobs:
           - "SNV_ANNOTATION"
           - "CALL_SVS"
           - "ANNOTATE_SVS"
+          - "RANK_VARIANTS"
         profile:
           - "docker"
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#419](https://github.com/genomic-medicine-sweden/nallo/pull/419) - Added support for SV filtering using input BED file ([#348](https://github.com/genomic-medicine-sweden/nallo/issues/348))
 - [#430](https://github.com/genomic-medicine-sweden/nallo/pull/430) - Added a GitHub action to build and publish docs to GitHub Pages
 - [#431](https://github.com/genomic-medicine-sweden/nallo/pull/431) - Added files needed to automatically build and publish docs to GitHub Pages
+- [#435](https://github.com/genomic-medicine-sweden/nallo/pull/435) - Added nf-test to rank variants
 
 ### `Changed`
 
@@ -58,6 +59,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#431](https://github.com/genomic-medicine-sweden/nallo/pull/431) - Changed `CITATIONS.md` to `docs/CITATIONS.md`,
 - [#433](https://github.com/genomic-medicine-sweden/nallo/pull/433) - Updated docs and README.
 - [#434](https://github.com/genomic-medicine-sweden/nallo/pull/434) - Updated the SVDB merge module to fix unstable CALL_SVS tests
+- [#435](https://github.com/genomic-medicine-sweden/nallo/pull/435) - Updated and refactored processes and workflows related to variant ranking
 
 ### `Removed`
 

diff --git a/conf/modules/rank_variants.config b/conf/modules/rank_variants.config
@@ -47,8 +47,4 @@ process {
         ext.args = "--temp_dir ./"
     }
 
-    withName: '.*:RANK_VARIANTS_SNV:BCFTOOLS_SORT' {
-        ext.when = false
-    }
-
 }
diff --git a/modules/local/add_most_severe_consequence.nf b/modules/local/add_most_severe_consequence.nf
@@ -1,6 +1,6 @@
 process ADD_MOST_SEVERE_CSQ {
     tag "$meta.id"
-    label 'process_low'
+    label 'process_single'
 
     conda "conda-forge::python=3.8.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -9,20 +9,25 @@ process ADD_MOST_SEVERE_CSQ {
 
     input:
     tuple val(meta), path(vcf)
-    path (variant_consequences)
+    tuple val(meta2), path (variant_consequences)
 
     output:
-    tuple val(meta), path("*.vcf")        , emit: vcf
-    path "versions.yml"                   , emit: versions
+    tuple val(meta), path("*.vcf"), emit: vcf
+    path "versions.yml"           , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    if ("$vcf" == "${prefix}.vcf" ) error "Input and output names are the same, set prefix in module configuration to disambiguate!"
+
     """
-    add_most_severe_consequence.py --file_in ${vcf} --file_out ${prefix}.vcf --variant_csq ${variant_consequences}
+    add_most_severe_consequence.py \\
+        --file_in ${vcf} \\
+        --file_out ${prefix}.vcf \\
+        --variant_csq ${variant_consequences}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/add_most_severe_pli.nf b/modules/local/add_most_severe_pli.nf
@@ -1,6 +1,6 @@
 process ADD_MOST_SEVERE_PLI {
     tag "$meta.id"
-    label 'process_low'
+    label 'process_single'
 
     conda "conda-forge::python=3.8.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -11,17 +11,21 @@ process ADD_MOST_SEVERE_PLI {
     tuple val(meta), path(vcf)
 
     output:
-    tuple val(meta), path("*.vcf")  , emit: vcf
-    path "versions.yml"             , emit: versions
+    tuple val(meta), path("*.vcf"), emit: vcf
+    path "versions.yml"           , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    if ("$vcf" == "${prefix}.vcf" ) error "Input and output names are the same, set prefix in module configuration to disambiguate!"
+
     """
-    add_most_severe_pli.py --file_in ${vcf} --file_out ${prefix}.vcf
+    add_most_severe_pli.py \\
+        --file_in ${vcf} \\
+        --file_out ${prefix}.vcf
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/subworkflows/local/annotate_consequence_pli.nf b/subworkflows/local/annotate_consequence_pli.nf
@@ -9,7 +9,7 @@ include { TABIX_BGZIPTABIX    } from '../../modules/nf-core/tabix/bgziptabix/mai
 workflow ANNOTATE_CSQ_PLI {
     take:
     ch_vcf                  // channel: [mandatory] [ val(meta), path(vcf) ]
-    ch_variant_consequences // channel: [mandatory] [ path(consequences) ]
+    ch_variant_consequences // channel: [mandatory] [ val(meta), path(consequences) ]
 
     main:
     ch_versions = Channel.empty()
@@ -24,7 +24,7 @@ workflow ANNOTATE_CSQ_PLI {
     ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions)
 
     emit:
-    vcf_ann  = TABIX_BGZIPTABIX.out.gz_tbi.map { meta, vcf, tbi -> return [ meta, vcf ] } // channel: [ val(meta), path(vcf) ]
-    tbi_ann  = TABIX_BGZIPTABIX.out.gz_tbi.map { meta, vcf, tbi -> return [ meta, tbi ] } // channel: [ val(meta), path(tbi) ]
-    versions = ch_versions                                                                // channel: [ path(versions.yml) ]
+    vcf      = TABIX_BGZIPTABIX.out.gz_tbi.map { meta, vcf, tbi -> [ meta, vcf ] } // channel: [ val(meta), path(vcf) ]
+    tbi      = TABIX_BGZIPTABIX.out.gz_tbi.map { meta, vcf, tbi -> [ meta, tbi ] } // channel: [ val(meta), path(tbi) ]
+    versions = ch_versions                                                         // channel: [ path(versions.yml) ]
 }
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
@@ -6,8 +6,11 @@ include { UNTAR as UNTAR_VEP_CACHE } from '../../modules/nf-core/untar/main'
 workflow PREPARE_GENOME {
 
     take:
-    fasta_in     // channel: [mandatory] [ val(meta), path(fasta) ]
-    ch_vep_cache // channel: [optional] [ path(cache) ]
+    fasta_in                   // channel: [mandatory] [ val(meta), path(fasta) ]
+    gunzip_fasta               //    bool: should we gunzip fasta
+    ch_vep_cache               // channel: [optional] [ val(meta), path(cache) ]
+    split_vep_files            //    bool: are there vep extra files
+    ch_vep_extra_files_unsplit // channel: [optional] [ val(meta), path(csv) ]
 
     main:
     ch_versions = Channel.empty()
@@ -16,16 +19,15 @@ workflow PREPARE_GENOME {
     fasta_file = fasta_in.map{meta, file -> file}
 
     // Will not catch cases where fasta is bgzipped
-    if ( params.fasta.endsWith('.gz') ) {
-        GUNZIP_FASTA(fasta_in)
+    if ( gunzip_fasta ) {
+        GUNZIP_FASTA ( fasta_in )
             .gunzip
             .collect()
-            .set{ch_fasta}
-
+            .set { ch_fasta }
         ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions.first())
     } else {
         fasta_in
-            .set{ch_fasta}
+            .set { ch_fasta }
     }
 
     SAMTOOLS_FAIDX ( ch_fasta, [[],[]] )
@@ -38,14 +40,33 @@ workflow PREPARE_GENOME {
     ch_versions = ch_versions.mix(UNTAR_VEP_CACHE.out.versions)
 
     UNTAR_VEP_CACHE.out.untar
-        .map { meta, files -> [files] }
+        .map { meta, files -> [ files ] }
         .collect()
         .set { untarred_vep }
 
+    // Read and store paths in the vep_plugin_files file
+    if ( split_vep_files ) {
+        ch_vep_extra_files_unsplit
+            .splitCsv ( header:true )
+            .map { row ->
+                path = file(row.vep_files[0])
+                if(path.exists()) {
+                    return [path]
+                } else {
+                    error("\nVep database file ${path} does not exist.")
+                }
+            }
+            .collect()
+            .set { ch_vep_extra_files }
+    } else {
+        ch_vep_extra_files = Channel.value([])
+    }
+
     emit:
-    mmi           = MINIMAP2_INDEX.out.index.collect() // channel: [ val(meta), path(mmi) ]
-    fai           = SAMTOOLS_FAIDX.out.fai.collect()   // channel: [ val(meta), path(fai) ]
-    fasta         = ch_fasta                           // channel: [ val(meta), path(fasta) ]
-    vep_resources = untarred_vep                       // channel: [ path(cache) ]
-    versions      = ch_versions                        // channel: [ versions.yml ]
+    mmi             = MINIMAP2_INDEX.out.index.collect() // channel: [ val(meta), path(mmi) ]
+    fai             = SAMTOOLS_FAIDX.out.fai.collect()   // channel: [ val(meta), path(fai) ]
+    fasta           = ch_fasta                           // channel: [ val(meta), path(fasta) ]
+    vep_resources   = untarred_vep                       // channel: [ path(cache) ]
+    vep_extra_files = ch_vep_extra_files                 // channel: [ path(files) ]
+    versions        = ch_versions                        // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/rank_variants/main.nf b/subworkflows/local/rank_variants/main.nf
@@ -6,47 +6,44 @@ include { GENMOD_ANNOTATE  } from '../../../modules/nf-core/genmod/annotate/main
 include { GENMOD_MODELS    } from '../../../modules/nf-core/genmod/models/main'
 include { GENMOD_SCORE     } from '../../../modules/nf-core/genmod/score/main'
 include { GENMOD_COMPOUND  } from '../../../modules/nf-core/genmod/compound/main'
-include { BCFTOOLS_SORT    } from '../../../modules/nf-core/bcftools/sort/main'
-include { TABIX_BGZIP      } from '../../../modules/nf-core/tabix/bgzip/main'
-include { TABIX_TABIX      } from '../../../modules/nf-core/tabix/tabix/main'
+include { TABIX_BGZIPTABIX } from '../../../modules/nf-core/tabix/bgziptabix/main'
 
 workflow RANK_VARIANTS {
 
     take:
-        ch_vcf                // channel: [mandatory] [ val(meta), path(vcf) ]
-        ch_pedfile            // channel: [mandatory] [ path(ped) ]
-        ch_reduced_penetrance // channel: [mandatory] [ path(pentrance) ]
-        ch_score_config       // channel: [mandatory] [ path(ini) ]
+    ch_vcf                // channel: [mandatory] [ val(meta), path(vcf) ]
+    ch_pedfile            // channel: [mandatory] [ val(meta), path(ped) ]
+    ch_reduced_penetrance // channel: [mandatory] [ val(meta), path(penetrance) ]
+    ch_score_config       // channel: [mandatory] [ val(meta), path(ini) ]
 
     main:
-        ch_versions = Channel.empty()
+    ch_versions = Channel.empty()
 
-        GENMOD_ANNOTATE(ch_vcf)
+    GENMOD_ANNOTATE ( ch_vcf )
+    ch_versions = ch_versions.mix(GENMOD_ANNOTATE.out.versions)
 
-        GENMOD_MODELS(GENMOD_ANNOTATE.out.vcf, ch_pedfile, ch_reduced_penetrance)
+    GENMOD_MODELS (
+        GENMOD_ANNOTATE.out.vcf,
+        ch_pedfile.map { meta, ped -> ped },
+        ch_reduced_penetrance.map { meta, file -> file }
+    )
+    ch_versions = ch_versions.mix(GENMOD_MODELS.out.versions)
 
-        GENMOD_SCORE(GENMOD_MODELS.out.vcf, ch_pedfile, ch_score_config)
+    GENMOD_SCORE (
+        GENMOD_MODELS.out.vcf,
+        ch_pedfile.map { meta, ped -> ped },
+        ch_score_config.map { meta, file -> file }
+    )
+    ch_versions = ch_versions.mix(GENMOD_SCORE.out.versions)
 
-        GENMOD_COMPOUND(GENMOD_SCORE.out.vcf)
+    GENMOD_COMPOUND ( GENMOD_SCORE.out.vcf )
+    ch_versions = ch_versions.mix(GENMOD_COMPOUND.out.versions)
 
-        BCFTOOLS_SORT(GENMOD_COMPOUND.out.vcf) // SV file needs to be sorted before indexing
-
-        TABIX_BGZIP(GENMOD_COMPOUND.out.vcf) //run only for SNVs
-
-        ch_vcf = TABIX_BGZIP.out.output.mix(BCFTOOLS_SORT.out.vcf)
-
-        TABIX_TABIX (ch_vcf)
-
-        ch_versions = ch_versions.mix(GENMOD_ANNOTATE.out.versions)
-        ch_versions = ch_versions.mix(GENMOD_MODELS.out.versions)
-        ch_versions = ch_versions.mix(GENMOD_SCORE.out.versions)
-        ch_versions = ch_versions.mix(GENMOD_COMPOUND.out.versions)
-        ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions)
-        ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions)
-        ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)
+    TABIX_BGZIPTABIX ( GENMOD_COMPOUND.out.vcf )
+    ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions)
 
     emit:
-        vcf      = ch_vcf              // channel: [ val(meta), path(vcf) ]
-        tbi      = TABIX_TABIX.out.tbi // channel: [ val(meta), path(tbi) ]
-        versions = ch_versions         // channel: [ path(versions.yml) ]
+    vcf      = TABIX_BGZIPTABIX.out.gz_tbi.map { meta, vcf, tbi -> [ meta, vcf ] } // channel: [ val(meta), path(vcf) ]
+    tbi      = TABIX_BGZIPTABIX.out.gz_tbi.map { meta, vcf, tbi -> [ meta, tbi ] } // channel: [ val(meta), path(tbi) ]
+    versions = ch_versions                                                         // channel: [ path(versions.yml) ]
 }