Commit

Merge pull request #26 from N-Hoffmann/stringtie
Help message and bug fixes
N-Hoffmann authored Sep 26, 2024
2 parents 5f2a5df + c49d9e9 commit 0ae3bec
Showing 7 changed files with 267 additions and 10 deletions.
24 changes: 20 additions & 4 deletions README.md
@@ -59,27 +59,41 @@ Required:
--gtf : Path to reference annotation.
Optional:
Profile options:
-profile test : Run annexa on toy dataset.
-profile slurm : Run annexa on slurm executor.
-profile singularity: Run annexa in singularity container.
-profile conda : Run annexa in conda environment.
-profile docker : Run annexa in docker container.
Main options:
--tx_discovery : Specify which transcriptome reconstruction tool to use. Options: "bambu" (default) or "stringtie2".
--filter : Whether to perform the filtering step. false by default.
--withGeneCoverage : Run RSeQC (can be long depending on annotation and bam sizes). false by default.
Bambu options:
--bambu_strand : Run bambu with stranded data. true by default.
--prefilter_ndr : When using ANNEXA with bambu, prefilter before the filtering step. false by default.
--bambu_threshold : bambu NDR threshold below which new transcripts are retained.
Filtering options:
--tfkmers_tokenizer : Path to TransforKmers tokenizer. Required if filter activated.
--tfkmers_model : Path to TransforKmers model. Required if filter activated.
--tfkmers_threshold : TransforKmers prediction threshold below which new transcripts are retained.
--operation : Operation to retain novel transcripts. "union" retains tx validated by either bambu or transforkmers, "intersection" retains tx validated by both.
Performance options:
--maxCpu : Max CPU threads used by ANNEXA. 8 by default.
--maxMemory : Max memory used by ANNEXA. 40GB by default.
Nextflow options:
-resume : Resume task from cached work (useful for recovering from errors when using singularity).
-with-report : Create an HTML execution report with metrics such as resource usage for each workflow process.
```
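A typical invocation combining these options might look like the following. This is a minimal sketch: the profile, input file, and reference paths are placeholders to adapt to your own setup.

```bash
# Hypothetical example run: StringTie2 for transcript discovery,
# executed in a Singularity container. All paths are placeholders.
nextflow run IGDRion/ANNEXA \
    -profile singularity \
    --input samples.txt \
    --gtf /path/to/ref.gtf \
    --fa /path/to/ref.fa \
    --tx_discovery stringtie2 \
    --maxCpu 8 \
    --maxMemory 40GB
```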

> If the filter argument is set to `true`, TransforKmers model and tokenizer paths have to be given. They can be either downloaded from the [TransforKmers official repository](https://github.com/IGDRion/TransforKmers) or trained in advance by yourself on your own data.
@@ -101,3 +115,5 @@ To use them, extract the zip, and point `--tfkmers_model` and `--tfkmers_tokenizer`
The filtered annotation can be the `union` of these 2 tools, _i.e._ all the transcripts validated by one or both of these tools; or the `intersection`, _i.e._ the transcripts validated by both tools (the latter being the default).
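For instance, a filtering run could be launched as below. This is a sketch, assuming the model and tokenizer were downloaded as described above; all paths are placeholders.

```bash
# Hypothetical filtering run: keep the union of transcripts validated
# by bambu and by TransforKmers. Model/tokenizer paths are placeholders.
nextflow run IGDRion/ANNEXA \
    -profile singularity \
    --input samples.txt \
    --gtf /path/to/ref.gtf \
    --fa /path/to/ref.fa \
    --filter \
    --tfkmers_model /path/to/model_dir \
    --tfkmers_tokenizer /path/to/tokenizer_dir \
    --operation union
```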

At the end, the QC steps are performed both on the full and filtered extended annotations.

### Prefiltering
9 changes: 4 additions & 5 deletions bin/validate_gtf.py
@@ -16,11 +16,10 @@
#######################################################

for record in GTF.parse_by_line(args.gtf):
    if "gene_biotype" in record:
        g_biotype = record["gene_biotype"]
    elif "gene_type" in record:
        g_biotype = record["gene_type"]
    if record.feature == "gene" or record.feature == "transcript":
        continue

17 changes: 17 additions & 0 deletions main.nf
@@ -1,6 +1,16 @@
///////////////////////////////////////////////////////////////////////////
// PARSE ARGS
///////////////////////////////////////////////////////////////////////////

// Help Message
include { paramsHelp } from 'plugin/nf-schema'
include { helpHeader } from './modules/header.nf'
if (params.help) {
log.info helpHeader()
log.info paramsHelp("nextflow run IGDRion/ANNEXA -profile {test,docker,singularity,conda,slurm} --input samples.txt --gtf /path/to/ref.gtf --fa /path/to/ref.fa")
exit 0
}

if (params.input) { input = file(params.input, checkIfExists: true) }
else { exit 1, "Input file not specified!" }

@@ -27,6 +37,13 @@ if (params.filter) {
include { logHeader } from './modules/header.nf'
log.info logHeader(params)

if (params.tx_discovery == "bambu"){
log.warn """You are using Bambu as the transcript discovery tool. Please note that Bambu takes into account strandedness by default.
Bambu using strandedness: ${params.bambu_strand}. Please verifiy that it corresponds to your sequencing protocol.
You can change strandedness mode using the --bambu_strand parameter (options: true or false).
"""
}

///////////////////////////////////////////////////////////////////////////
// WORKFLOW
///////////////////////////////////////////////////////////////////////////
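With this change, the help text can be printed without providing any required parameters, e.g. (assuming the nf-schema plugin resolves correctly):

```bash
# Print the ANNEXA banner and the auto-generated parameter help,
# then exit without running the workflow.
nextflow run IGDRion/ANNEXA --help
```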
6 changes: 5 additions & 1 deletion modules/add_class_code.nf
@@ -14,7 +14,11 @@ process ADD_CLASS_CODE {
script:
"""
class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}"
# Remove header created by gtfsort
sed -i 1,3d "class_code.${gtf}"
# Add semicolon at end of tx lines
sed -i '/\\ttranscript\\t/s/\$/;/' "class_code.${gtf}"
"""
}
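The effect of the second sed expression can be checked in isolation; here is a sketch with a hypothetical one-line GTF record:

```bash
# A transcript line gains a trailing semicolon; other features are untouched.
printf 'chr1\tANNEXA\ttranscript\t100\t900\t.\t+\t.\tgene_id "G1"; transcript_id "T1"\n' \
  | sed '/\ttranscript\t/s/$/;/'
```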
19 changes: 19 additions & 0 deletions modules/header.nf
Expand Up @@ -30,3 +30,22 @@ Stranded : ${params.bambu_strand}
-${c_dim}-------------------------------------${c_reset}-
""".stripIndent()
}

def helpHeader() {
// Log colors ANSI codes
c_dim = "\033[2m";
c_green = "\033[0;32m";
c_purple = "\033[0;35m";
c_reset = "\033[0m";

return """-${c_dim}-------------------------------------${c_reset}-
${c_green} ___ _ ___ _________ __ ___
/ | / | / / | / / ____/ |/ // |
/ /| | / |/ / |/ / __/ | // /| |
/ ___ |/ /| / /| / /___ / |/ ___ |
/_/ |_/_/ |_/_/ |_/_____//_/|_/_/ |_|
${c_reset}
-${c_dim}-------------------------------------${c_reset}-
${c_purple}github.com/igdrion/ANNEXA${c_reset}
""".stripIndent()
}
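The escape sequences above are standard ANSI SGR codes; their effect can be previewed in any terminal (a quick illustration, not part of the pipeline):

```bash
# 0;32 = green, 0;35 = purple, 2 = dim, 0 = reset — the codes used above.
printf '\033[0;32mANNEXA\033[0m \033[0;35mgithub.com/igdrion/ANNEXA\033[0m\n'
```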
5 changes: 5 additions & 0 deletions nextflow.config
@@ -1,4 +1,8 @@
// General
plugins {
id 'nf-schema@2.1.1'
}

params {
outdir = "results"
withGeneCoverage = false
@@ -14,6 +18,7 @@ params {
tfkmers_tokenizer = null
bambu_strand = true
tx_discovery = "bambu"
help = false
}

process {
197 changes: 197 additions & 0 deletions nextflow_schema.json
@@ -0,0 +1,197 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/IGDRion/ANNEXA/master/nextflow_schema.json",
"title": "IGDRion/ANNEXA pipeline parameters",
"description": "An all-in-one pipeline to analyze LR-RNAseq data,reconstruct and quantify known and novel genes and isoforms.",
"type": "object",
"defs": {
"required": {
"title": "Required options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
"properties": {
"input": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.(csv|tsv|yaml|json)$",
"description": "Path to file listing paths to bam files.",
"fa_icon": "fas fa-file-csv"
},
"fa": {
"type": "string",
"format": "file-path",
"description": "Path to reference genome.",
"fa_icon": "fas fa-folder-open"
},
"gtf": {
"type": "string",
"format": "directory-path",
"description": "Path to reference annotation.",
"fa_icon": "fas fa-folder-open"
}
}
},
"profile_options": {
"title": "Profile options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define how the pipeline should be executed.",
"properties": {
"profile test": {
"type": "string",
"description": "Run annexa on toy dataset.",
"fa_icon": "fas fa-file-csv"
},
"profile slurm": {
"type": "string",
"description": "Run annexa on slurm executor."
},
"profile singularity": {
"type": "string",
"description": "Run annexa in singularity container."
},
"profile conda": {
"type": "string",
"description": "Run annexa in conda environment."
},
"profile docker": {
"type": "string",
"description": "Run annexa in docker container."
}
}
},
"main_options": {
"title": "Main options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "General ANNEXA parameters.",
"properties": {
"tx_discovery": {
"type": "string",
"description": "Specify which transcriptome reconstruction tool to use.",
"default": "bambu",
"enum": ["bambu", "stringtie2"],
"fa_icon": "fas fa-file-csv"
},
"filter": {
"type": "boolean",
"description": "Perform or not the filtering step.",
"default": "true"
},
"withGeneCoverage": {
"type": "boolean",
"description": "Run RSeQC (can be long depending on annotation and bam sizes).",
"default": "false"
}
}
},
"bzmbu_options": {
"title": "Bambu options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to Bambu.",
"properties": {
"bambu_strand": {
"type": "boolean",
"description": "Run bambu with stranded data",
"default": "true"
},
"prefilter_ndr": {
"type": "boolean",
"description": "When using ANNEXA with bambu, prefilter before the filtering step. false by default.",
"default": "false"
},
"bambu_threshold": {
"type": "integer",
"description": "bambu NDR threshold below which new transcripts are retained.",
"default": "0.2"
}
}
},
"filtering_options": {
"title": "Filtering options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to filtering step.",
"properties": {
"tfkmers_tokenizer": {
"type": "string",
"description": "Path to TransforKmers tokenizer. Required if filter option is activated."
},
"tfkmers_model": {
"type": "string",
"description": "Path to TransforKmers model. Required if filter activated."
},
"tfkmers_threshold": {
"type": "integer",
"description": "TransforKmers prediction threshold below which new transcripts are retained.",
"default": "0.2",
"minimum": "0",
"maximum": "1"
},
"operation": {
"type": "string",
"description": "Operation to retained novel transcripts. 'union' retain tx validated by either bambu or transforkmers, 'intersection' retain tx validated by both.",
"enum": ["union", "intersection"],
"default": "intersection"
}
}
},
"performance_options": {
"title": "Performance options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to performance.",
"properties": {
"maxCpu": {
"type": "integer",
"description": "Max cpu threads used by ANNEXA.",
"default": "8"
},
"maxMemory": {
"type": "integer",
"description": "Max memory (in GB) used by ANNEXA.",
"default": "40"
}
}
},
"nextflow_options": {
"title": "Nextflow options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to Nextflow.",
"properties": {
"resume": {
"description": "Resume task from cached work (useful for recovering from errors when using singularity)."
},
"with-report": {
"description": "Create an HTML execution report with metrics such as resource usage for each workflow process."
}
}
}
},
"allOf": [
{
"$ref": "#/defs/required_options"
},
{
"$ref": "#/defs/profile_options"
},
{
"$ref": "#/defs/main_options"
},
{
"$ref": "#/defs/bambu_options"
},
{
"$ref": "#/defs/filtering_options"
},
{
"$ref": "#/defs/performance_options"
},
{
"$ref": "#/defs/nextflow_options"
}
]
}
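Since nf-schema parses this file at startup, a quick syntax check after editing it can save a failed run. A minimal sketch using only the Python standard library:

```bash
# Fails loudly if nextflow_schema.json is not valid JSON.
python3 -m json.tool nextflow_schema.json > /dev/null && echo "JSON OK"
```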
