Commit

Merge pull request #26 from N-Hoffmann/stringtie
Help message and bug fixes
N-Hoffmann authored Sep 26, 2024
2 parents 5f2a5df + c49d9e9 commit 0ae3bec
Showing 7 changed files with 267 additions and 10 deletions.
24 changes: 20 additions & 4 deletions README.md
@@ -59,27 +59,41 @@ Required:
--gtf : Path to reference annotation.
Optional:
Profile options:
-profile test : Run annexa on toy dataset.
-profile slurm : Run annexa on slurm executor.
-profile singularity: Run annexa in singularity container.
-profile conda : Run annexa in conda environment.
-profile docker : Run annexa in docker container.
Main options:
--tx_discovery : Specify which transcriptome reconstruction tool to use. Options: "bambu" (default) or "stringtie2".
--filter : Whether to perform the filtering step. false by default.
--withGeneCoverage : Run RSeQC (can be long depending on annotation and bam sizes). false by default.
Bambu options:
--bambu_strand : Run bambu with stranded data. true by default.
--prefilter_ndr : When using ANNEXA with bambu, prefilter before the filtering step. false by default.
--bambu_threshold : bambu NDR threshold below which new transcripts are retained.
Filtering options:
--tfkmers_tokenizer : Path to TransforKmers tokenizer. Required if filter activated.
--tfkmers_model : Path to TransforKmers model. Required if filter activated.
--tfkmers_threshold : TransforKmers prediction threshold below which new transcripts are retained.
--operation : Operation to retain novel transcripts. "union" retains tx validated by either bambu or transforkmers, "intersection" retains tx validated by both.
Performance options:
--maxCpu : Max CPU threads used by ANNEXA. 8 by default.
--maxMemory : Max memory used by ANNEXA. 40GB by default.
Nextflow options:
-resume : Resume task from cached work (useful for recovering from errors when using singularity).
-with-report : Create an HTML execution report with metrics such as resource usage for each workflow process.
```
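A typical invocation combining these options might look like the following. This is a minimal sketch: the profile, input file, and reference paths are placeholders to adapt to your own setup.

```bash
# Hypothetical example run: StringTie2 for transcript discovery,
# executed in a Singularity container. All paths are placeholders.
nextflow run IGDRion/ANNEXA \
    -profile singularity \
    --input samples.txt \
    --gtf /path/to/ref.gtf \
    --fa /path/to/ref.fa \
    --tx_discovery stringtie2 \
    --maxCpu 8 \
    --maxMemory 40GB
```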

> If the filter argument is set to `true`, TransforKmers model and tokenizer paths have to be given. They can be either downloaded from the [TransforKmers official repository](https://github.com/IGDRion/TransforKmers) or trained in advance by yourself on your own data.
@@ -101,3 +115,5 @@ To use them, extract the zip, and point `--tfkmers_model` and `--tfkmers_tokenizer`
The filtered annotation can be the `union` of these 2 tools, _i.e._ all the transcripts validated by one or both of these tools; or the `intersection`, _i.e._ the transcripts validated by both tools (the latter being the default).
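For instance, a filtering run could be launched as below. This is a sketch, assuming the model and tokenizer were downloaded as described above; all paths are placeholders.

```bash
# Hypothetical filtering run: keep the union of transcripts validated
# by bambu and by TransforKmers. Model/tokenizer paths are placeholders.
nextflow run IGDRion/ANNEXA \
    -profile singularity \
    --input samples.txt \
    --gtf /path/to/ref.gtf \
    --fa /path/to/ref.fa \
    --filter \
    --tfkmers_model /path/to/model_dir \
    --tfkmers_tokenizer /path/to/tokenizer_dir \
    --operation union
```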

At the end, the QC steps are performed both on the full and filtered extended annotations.

### Prefiltering
9 changes: 4 additions & 5 deletions bin/validate_gtf.py
@@ -16,11 +16,10 @@
#######################################################

for record in GTF.parse_by_line(args.gtf):
    if "gene_biotype" in record:
        g_biotype = record["gene_biotype"]
    elif "gene_type" in record:
        g_biotype = record["gene_type"]
    if record.feature == "gene" or record.feature == "transcript":
        continue

17 changes: 17 additions & 0 deletions main.nf
@@ -1,6 +1,16 @@
///////////////////////////////////////////////////////////////////////////
// PARSE ARGS
///////////////////////////////////////////////////////////////////////////

// Help Message
include { paramsHelp } from 'plugin/nf-schema'
include { helpHeader } from './modules/header.nf'
if (params.help) {
log.info helpHeader()
log.info paramsHelp("nextflow run IGDRion/ANNEXA -profile {test,docker,singularity,conda,slurm} --input samples.txt --gtf /path/to/ref.gtf --fa /path/to/ref.fa")
exit 0
}

if (params.input) { input = file(params.input, checkIfExists: true) }
else { exit 1, "Input file not specified!" }

@@ -27,6 +37,13 @@ if (params.filter) {
include { logHeader } from './modules/header.nf'
log.info logHeader(params)

if (params.tx_discovery == "bambu"){
log.warn """You are using Bambu as the transcript discovery tool. Please note that Bambu takes into account strandedness by default.
Bambu using strandedness: ${params.bambu_strand}. Please verifiy that it corresponds to your sequencing protocol.
You can change strandedness mode using the --bambu_strand parameter (options: true or false).
"""
}

///////////////////////////////////////////////////////////////////////////
// WORKFLOW
///////////////////////////////////////////////////////////////////////////
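With this change, the help text can be printed without providing any required parameters, e.g. (assuming the nf-schema plugin resolves correctly):

```bash
# Print the ANNEXA banner and the auto-generated parameter help,
# then exit without running the workflow.
nextflow run IGDRion/ANNEXA --help
```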
6 changes: 5 additions & 1 deletion modules/add_class_code.nf
@@ -14,7 +14,11 @@ process ADD_CLASS_CODE {
script:
"""
class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}"
# Remove header created by gtfsort
sed -i 1,3d "class_code.${gtf}"
# Add semicolon at end of tx lines
sed -i '/\\ttranscript\\t/s/\$/;/' "class_code.${gtf}"
"""
}
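The effect of the second sed expression can be checked in isolation; here is a sketch with a hypothetical one-line GTF record:

```bash
# A transcript line gains a trailing semicolon; other features are untouched.
printf 'chr1\tANNEXA\ttranscript\t100\t900\t.\t+\t.\tgene_id "G1"; transcript_id "T1"\n' \
  | sed '/\ttranscript\t/s/$/;/'
```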
19 changes: 19 additions & 0 deletions modules/header.nf
Expand Up @@ -30,3 +30,22 @@ Stranded : ${params.bambu_strand}
-${c_dim}-------------------------------------${c_reset}-
""".stripIndent()
}

def helpHeader() {
// Log colors ANSI codes
c_dim = "\033[2m";
c_green = "\033[0;32m";
c_purple = "\033[0;35m";
c_reset = "\033[0m";

return """-${c_dim}-------------------------------------${c_reset}-
${c_green} ___ _ ___ _________ __ ___
/ | / | / / | / / ____/ |/ // |
/ /| | / |/ / |/ / __/ | // /| |
/ ___ |/ /| / /| / /___ / |/ ___ |
/_/ |_/_/ |_/_/ |_/_____//_/|_/_/ |_|
${c_reset}
-${c_dim}-------------------------------------${c_reset}-
${c_purple}github.com/igdrion/ANNEXA${c_reset}
""".stripIndent()
}
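The escape sequences above are standard ANSI SGR codes; their effect can be previewed in any terminal (a quick illustration, not part of the pipeline):

```bash
# 0;32 = green, 0;35 = purple, 2 = dim, 0 = reset — the codes used above.
printf '\033[0;32mANNEXA\033[0m \033[0;35mgithub.com/igdrion/ANNEXA\033[0m\n'
```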
5 changes: 5 additions & 0 deletions nextflow.config
@@ -1,4 +1,8 @@
// General
plugins {
id 'nf-schema@2.1.1'
}

params {
outdir = "results"
withGeneCoverage = false
@@ -14,6 +18,7 @@ params {
tfkmers_tokenizer = null
bambu_strand = true
tx_discovery = "bambu"
help = false
}

process {
197 changes: 197 additions & 0 deletions nextflow_schema.json
@@ -0,0 +1,197 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/IGDRion/ANNEXA/master/nextflow_schema.json",
"title": "IGDRion/ANNEXA pipeline parameters",
"description": "An all-in-one pipeline to analyze LR-RNAseq data,reconstruct and quantify known and novel genes and isoforms.",
"type": "object",
"defs": {
"required": {
"title": "Required options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
"properties": {
"input": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.(csv|tsv|yaml|json)$",
"description": "Path to file listing paths to bam files.",
"fa_icon": "fas fa-file-csv"
},
"fa": {
"type": "string",
"format": "file-path",
"description": "Path to reference genome.",
"fa_icon": "fas fa-folder-open"
},
"gtf": {
"type": "string",
"format": "directory-path",
"description": "Path to reference annotation.",
"fa_icon": "fas fa-folder-open"
}
}
},
"profile_options": {
"title": "Profile options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define how the pipeline should be executed.",
"properties": {
"profile test": {
"type": "string",
"description": "Run annexa on toy dataset.",
"fa_icon": "fas fa-file-csv"
},
"profile slurm": {
"type": "string",
"description": "Run annexa on slurm executor."
},
"profile singularity": {
"type": "string",
"description": "Run annexa in singularity container."
},
"profile conda": {
"type": "string",
"description": "Run annexa in conda environment."
},
"profile docker": {
"type": "string",
"description": "Run annexa in docker container."
}
}
},
"main_options": {
"title": "Main options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "General ANNEXA parameters.",
"properties": {
"tx_discovery": {
"type": "string",
"description": "Specify which transcriptome reconstruction tool to use.",
"default": "bambu",
"enum": ["bambu", "stringtie2"],
"fa_icon": "fas fa-file-csv"
},
"filter": {
"type": "boolean",
"description": "Perform or not the filtering step.",
"default": "true"
},
"withGeneCoverage": {
"type": "boolean",
"description": "Run RSeQC (can be long depending on annotation and bam sizes).",
"default": "false"
}
}
},
"bzmbu_options": {
"title": "Bambu options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to Bambu.",
"properties": {
"bambu_strand": {
"type": "boolean",
"description": "Run bambu with stranded data",
"default": "true"
},
"prefilter_ndr": {
"type": "boolean",
"description": "When using ANNEXA with bambu, prefilter before the filtering step. false by default.",
"default": "false"
},
"bambu_threshold": {
"type": "integer",
"description": "bambu NDR threshold below which new transcripts are retained.",
"default": "0.2"
}
}
},
"filtering_options": {
"title": "Filtering options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to filtering step.",
"properties": {
"tfkmers_tokenizer": {
"type": "string",
"description": "Path to TransforKmers tokenizer. Required if filter option is activated."
},
"tfkmers_model": {
"type": "string",
"description": "Path to TransforKmers model. Required if filter activated."
},
"tfkmers_threshold": {
"type": "integer",
"description": "TransforKmers prediction threshold below which new transcripts are retained.",
"default": "0.2",
"minimum": "0",
"maximum": "1"
},
"operation": {
"type": "string",
"description": "Operation to retained novel transcripts. 'union' retain tx validated by either bambu or transforkmers, 'intersection' retain tx validated by both.",
"enum": ["union", "intersection"],
"default": "intersection"
}
}
},
"performance_options": {
"title": "Performance options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to performance.",
"properties": {
"maxCpu": {
"type": "integer",
"description": "Max cpu threads used by ANNEXA.",
"default": "8"
},
"maxMemory": {
"type": "integer",
"description": "Max memory (in GB) used by ANNEXA.",
"default": "40"
}
}
},
"nextflow_options": {
"title": "Nextflow options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters related to Nextflow.",
"properties": {
"resume": {
"description": "Resume task from cached work (useful for recovering from errors when using singularity)."
},
"with-report": {
"description": "Create an HTML execution report with metrics such as resource usage for each workflow process."
}
}
}
},
"allOf": [
{
"$ref": "#/defs/required_options"
},
{
"$ref": "#/defs/profile_options"
},
{
"$ref": "#/defs/main_options"
},
{
"$ref": "#/defs/bambu_options"
},
{
"$ref": "#/defs/filtering_options"
},
{
"$ref": "#/defs/performance_options"
},
{
"$ref": "#/defs/nextflow_options"
}
]
}
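Since nf-schema parses this file at startup, a quick syntax check after editing it can save a failed run. A minimal sketch using only the Python standard library:

```bash
# Fails loudly if nextflow_schema.json is not valid JSON.
python3 -m json.tool nextflow_schema.json > /dev/null && echo "JSON OK"
```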
