Merge pull request #19 from N-Hoffmann/stringtie

Add stringtie2 support
IGDRion · Jun 4, 2024 · e8d08fe · e8d08fe
2 parents d38d19a + d802405
commit e8d08fe
Show file tree

Hide file tree

Showing 16 changed files with 390 additions and 37 deletions.
diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
@@ -52,12 +52,17 @@ def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, StrandRecord]]:
     return s, strand_dict
 
 
-def filter_count_matrix(file, transcripts, wr):
+def filter_count_matrix(file, transcripts, quant_method, wr):
+    if quant_method == "bambu":
+        tx_prefix = "BambuTx"
+    else:
+        tx_prefix = "MSTRG"
+
     print(next(file), file=wr)
     for line in file:
         line_splitted = line.split("\t")
         if (
-            line_splitted[0].startswith("BambuTx")
+            line_splitted[0].startswith(tx_prefix)
             and line_splitted[0].lower() not in transcripts
         ):
             continue
@@ -84,11 +89,12 @@ def filter_count_matrix(file, transcripts, wr):
         type=argparse.FileType("r"),
         required=True,
     )
+    # Removed required=True for --bambu
     parser.add_argument(
         "--bambu",
         help="Path to NDR generated by ANNEXA/bambu",
         type=argparse.FileType("r"),
-        required=True,
+        required=True
     )
     parser.add_argument(
         "--bambu-threshold",
@@ -116,18 +122,30 @@ def filter_count_matrix(file, transcripts, wr):
         default="intersection",
         choices=["union", "intersection"],
     )
+
+    parser.add_argument(
+        "--tx_discovery",
+        help="Quantification method. Choices: bambu, stringtie2",
+        type=str,
+        choices=["bambu", "stringtie2"],
+        required=True,
+    )
     args = parser.parse_args()
 
     ###################################################
-    filter_bambu, _ = parse_ndr(args.bambu, "bambu", args.bambu_threshold)
+    if args.tx_discovery == "bambu":
+        filter_bambu, _ = parse_ndr(args.bambu, "bambu", args.bambu_threshold)
     filter_tfkmers, strand_dict = parse_ndr(
         args.tfkmers, "tfkmers", args.tfkmers_threshold
     )
 
-    if args.operation == "union":
-        filter = filter_bambu | filter_tfkmers
+    if args.tx_discovery == "stringtie2":
+        filter = filter_tfkmers
     else:
-        filter = filter_bambu & filter_tfkmers
+        if args.operation == "union":
+            filter = filter_bambu | filter_tfkmers
+        else:
+            filter = filter_bambu & filter_tfkmers
 
     with open("unformat.novel.filter.gtf", "w") as wr:
         for record in GTF.parse_by_line(args.gtf):
@@ -143,4 +161,4 @@ def filter_count_matrix(file, transcripts, wr):
                     print(record, file=wr)
 
     with open("counts_transcript.filter.txt", "w") as wr:
-        filter_count_matrix(args.counts_tx, filter, wr)
+        filter_count_matrix(args.counts_tx, filter, args.tx_discovery, wr)
diff --git a/bin/gtf2gtf_cleanall.sh b/bin/gtf2gtf_cleanall.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+
+# Format a gtf file with only 12 fields if more (to be used by overlap)
+
+#!/bin/bash
+
+
+# Format a bed file to an tabulated bed
+
+# USAGE
+
+usage() {
+    echo "#" >&2
+    echo "# Format a gtf 12+ in cleaned gtf 12 to be used by overlap...">&2
+    echo "# USAGE: `basename $0` <file.gtf>">&2
+
+}
+################################################################################
+infile=$1
+
+if [ -p /dev/stdin ];then
+
+    #from STDIN
+    cat /dev/stdin | awk '{for (i=1;i<9;i++) {printf("%s\t",$i) }; for (i=9;i<=NF;i++) {printf("%s ",$i)}; print ""}' 
+
+elif [ $# -eq  1 ];then
+
+    awk '{for (i=1;i<9;i++) {printf("%s\t",$i) }; for (i=9;i<=NF;i++) {printf("%s ",$i)}; print ""}' $infile
+
+else
+    echo "#Error! no argument  file or file empty !"
+    usage;
+    exit 0;
+fi
diff --git a/bin/qc_gtf.py b/bin/qc_gtf.py
@@ -37,10 +37,15 @@ def get_ref_length(file):
     return ref
 
 
-def qc_gtf(gtf, gene_counts, ref):
+def qc_gtf(gtf, gene_counts, ref, tx_discovery):
     ref_start_end = get_ref_length(ref)
     gene_counts = parse_gene_counts(gene_counts)
 
+    if tx_discovery == "bambu":
+        gene_prefix,tx_prefix = "BambuGene","BambuTx"
+    else:
+        gene_prefix,tx_prefix = "MSTRG.","MSTRG."
+
     # CSV headers
     gene_str = f"gene_id,gene_biotype,nb_transcripts,length,ext_5,ext_3,discovery,validate_by,presents_in_sample\n"
     transcript_str = (
@@ -55,15 +60,15 @@ def qc_gtf(gtf, gene_counts, ref):
 
         g_id = gene["gene_id"]
         g_biotype = gene["gene_biotype"]
-        g_status = "novel" if g_id.startswith(('BambuGene','unstranded.Gene')) else "known"
+        g_status = "novel" if g_id.startswith((gene_prefix,'unstranded.Gene')) else "known"
         g_count = gene_counts[g_id]["counts"]  # Counts in all samples
         g_samples = gene_counts[g_id]["validates"]  # Found in x samples
         g_nb_tx = len(gene.transcripts)  # Number of isoforms
 
         # Compute genomic extension with start/end in ref
         ext_5 = 0
         ext_3 = 0
-        if not g_id.startswith(('BambuGene','unstranded.Gene')):
+        if not g_id.startswith((gene_prefix,'unstranded.Gene')):
             if gene.strand == "+":
                 ext_5 = ref_start_end[gene["gene_id"]]["start"] - gene.start
                 ext_3 = gene.end - ref_start_end[gene["gene_id"]]["end"]
@@ -82,7 +87,7 @@ def qc_gtf(gtf, gene_counts, ref):
         for transcript in gene.transcripts:
             tx_id = transcript["transcript_id"]
             tx_biotype = transcript["transcript_biotype"]
-            tx_status = "novel" if tx_id.startswith("BambuTx") else "known"
+            tx_status = "novel" if tx_id.startswith(tx_prefix) else "known"
             tx_nb_exons = len(transcript.exons)
             tx_length = sum([len(exon) for exon in transcript.exons])
 
@@ -111,7 +116,7 @@ def qc_gtf(gtf, gene_counts, ref):
     )
     parser.add_argument(
         "-c_gene",
-        help="Path to bambu counts (genes) file.",
+        help="Path to bambu or stringtie2 counts (genes) file.",
         type=argparse.FileType("r"),
         required=True,
     )
@@ -127,9 +132,16 @@ def qc_gtf(gtf, gene_counts, ref):
         type=str,
         required=True,
     )
+    parser.add_argument(
+        "-tx_discovery",
+        help="Quantification method. Choices: bambu, stringtie2",
+        type=str,
+        choices=["bambu", "stringtie2"],
+        required=True,
+    )
     args = parser.parse_args()
 
-    gene, transcript, exon = qc_gtf(args.gtf, args.c_gene, args.ref)
+    gene, transcript, exon = qc_gtf(args.gtf, args.c_gene, args.ref, args.tx_discovery)
     with open(f"{args.prefix}.gene.stats", "w") as fd:
         fd.write(gene)
     with open(f"{args.prefix}.transcript.stats", "w") as fd:

diff --git a/environment.yml b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
   - conda-forge::r-viridis
 
   - conda-forge::procps-ng
-  - conda-forge::mkl==2024.0
+  - conda-forge::mkl<=2024.0
 
   - pip
   - pip:

diff --git a/main.nf b/main.nf
@@ -10,6 +10,9 @@ else { exit 1, "Reference annotation file not specified!" }
 if (params.fa) { ref_fa = file(params.fa, checkIfExists: true) }
 else { exit 1, "Reference genome file not specified!" }
 
+if (params.tx_discovery != 'bambu' && params.tx_discovery != 'stringtie2') {
+  exit 1, "Please specify a valid quantification method ('bambu' (default) or 'stringtie2')."
+}
 
 if (params.filter) {
   if (params.tfkmers_model) {
@@ -31,7 +34,13 @@ nextflow.enable.dsl=2
 include { VALIDATE_INPUT_GTF             } from './modules/input/validate.nf'
 include { INDEX_BAM                      } from './modules/index_bam.nf'
 include { BAMBU                          } from './modules/bambu/bambu.nf'
-include { BAMBU_SPLIT_RESULTS            } from './modules/bambu/split.nf'
+include { STRINGTIE_ASSEMBLE             } from './modules/stringtie/stringtie_assemble.nf'
+include { STRINGTIE_MERGE                } from './modules/stringtie/stringtie_merge.nf'
+include { STRINGTIE_QUANTIFY             } from './modules/stringtie/stringtie_quant.nf'
+include { GFFCOMPARE                     } from './modules/gffcompare/gffcompare.nf'
+include { SPLIT_EXTENDED_ANNOTATION      } from './modules/split_extended_annotation.nf'
+include { SUBREAD_FEATURECOUNTS          } from './modules/subread/subread_featurecounts.nf'
+include { MERGE_COUNTS                   } from './modules/merge_stringtie_quant.nf'
 include { FEELNC_CODPOT                  } from './modules/feelnc/codpot.nf'
 include { FEELNC_FORMAT                  } from './modules/feelnc/format.nf'
 include { RESTORE_BIOTYPE                } from './modules/restore_biotypes.nf'
@@ -57,35 +66,61 @@ workflow {
   ///////////////////////////////////////////////////////////////////////////
   // NEW TRANSCRIPTS DISCOVERY
   ///////////////////////////////////////////////////////////////////////////
-  BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
-  BAMBU_SPLIT_RESULTS(BAMBU.out.bambu_gtf)
+  if(params.tx_discovery == "bambu") {
+    BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
+    SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf)
+  }
+  else if (params.tx_discovery == "stringtie2") {
+    STRINGTIE_ASSEMBLE(samples, VALIDATE_INPUT_GTF.out)
+    STRINGTIE_MERGE(STRINGTIE_ASSEMBLE.out.stringtie_assemble_gtf.collect(), VALIDATE_INPUT_GTF.out)
+    STRINGTIE_QUANTIFY(samples, STRINGTIE_MERGE.out.stringtie_merged_gtf)
+    GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, STRINGTIE_MERGE.out.stringtie_merged_gtf)
+    SUBREAD_FEATURECOUNTS(samples, GFFCOMPARE.out.stringtie_gtf)
+    MERGE_COUNTS(SUBREAD_FEATURECOUNTS.out.gene_counts.collect(), SUBREAD_FEATURECOUNTS.out.tx_counts.collect())
+    SPLIT_EXTENDED_ANNOTATION(GFFCOMPARE.out.stringtie_gtf)
+  }
 
   ///////////////////////////////////////////////////////////////////////////
   // EXTRACT AND CLASSIFY NEW TRANSCRIPTS, AND PERFORM QC
   ///////////////////////////////////////////////////////////////////////////
-  FEELNC_CODPOT(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU_SPLIT_RESULTS.out.novel_genes)
+  FEELNC_CODPOT(VALIDATE_INPUT_GTF.out, ref_fa, SPLIT_EXTENDED_ANNOTATION.out.novel_genes)
   FEELNC_FORMAT(FEELNC_CODPOT.out.mRNA, FEELNC_CODPOT.out.lncRNA)
-  RESTORE_BIOTYPE(VALIDATE_INPUT_GTF.out, BAMBU_SPLIT_RESULTS.out.novel_isoforms)
+  RESTORE_BIOTYPE(VALIDATE_INPUT_GTF.out, SPLIT_EXTENDED_ANNOTATION.out.novel_isoforms)
   MERGE_NOVEL(FEELNC_FORMAT.out, RESTORE_BIOTYPE.out)
 
+  if(params.tx_discovery == "bambu") {
+    ch_gene_counts = BAMBU.out.gene_counts
+    ch_tx_counts = BAMBU.out.tx_counts
+    ch_ndr = BAMBU.out.ndr
+  }
+  else if (params.tx_discovery == "stringtie2") {
+    ch_gene_counts = MERGE_COUNTS.out.gene_counts
+    ch_tx_counts = MERGE_COUNTS.out.tx_counts
+    ch_ndr = MERGE_COUNTS.out.ndr
+  }
+
   QC_FULL(samples, 
           INDEX_BAM.out, 
           MERGE_NOVEL.out, 
           VALIDATE_INPUT_GTF.out, 
-          BAMBU.out.gene_counts, 
+          ch_gene_counts,
           "full")
 
   ///////////////////////////////////////////////////////////////////////////
   // FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION
   ///////////////////////////////////////////////////////////////////////////
   if(params.filter) {
-    TFKMERS(MERGE_NOVEL.out, ref_fa, BAMBU.out.ndr, 
-            tokenizer, model, BAMBU.out.tx_counts)
+    TFKMERS(MERGE_NOVEL.out, 
+            ref_fa, 
+            ch_ndr, 
+            tokenizer, 
+            model, 
+            ch_tx_counts)
     QC_FILTER(samples,
               INDEX_BAM.out, 
               TFKMERS.out.gtf, 
               VALIDATE_INPUT_GTF.out, 
-              BAMBU.out.gene_counts, 
+              ch_gene_counts, 
               "filter")
   }
 }
diff --git a/modules/gffcompare/gffcompare.nf b/modules/gffcompare/gffcompare.nf
@@ -0,0 +1,44 @@
+process GFFCOMPARE {
+  conda (params.enable_conda ? "bioconda::gffcompare" : null)
+  container "${ workflow.containerEngine == 'singularity' ?
+      'https://depot.galaxyproject.org/singularity/gffcompare:0.12.6--h9f5acd7_0' :
+      'biocontainers/gffcompare:0.12.6--h9f5acd7_0' }"
+  publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: 'extended_annotations.gtf'
+  cpus params.maxCpu
+
+  input:
+    input:
+    path reference_gtf
+    path fasta
+    path merged_gtf
+
+    output:
+    path("*.annotated.gtf"), emit: annotated_gtf
+    path("*.loci")         , emit: loci
+    path("*.stats")        , emit: stats
+    path("*.tracking")     , emit: tracking
+    path("extended_annotations.gtf"), emit: stringtie_gtf
+
+    shell:
+    '''
+    gffcompare \
+    -r !{reference_gtf} \
+    -s !{fasta} \
+    !{merged_gtf}
+
+    #Reformat the output of gffcompare to correctly match novel isoforms to known genes
+
+    awk 'BEGIN{
+        while(getline<"gffcmp.tracking">0){
+            if ($4 !="u" && $4 !="r"){
+                split($3,gn,"|");
+                split($5,tx,"|"); 
+                final["\\""tx[2]"\\";"]="\\""gn[1]"\\";"
+                }
+            }
+    } {
+        if ($12 in final){
+            $10=final[$12]; print $0} else {print $0}
+    }' !{merged_gtf} | gtf2gtf_cleanall.sh  > extended_annotations.gtf
+    '''
+}
diff --git a/modules/header.nf b/modules/header.nf
@@ -14,17 +14,18 @@ ${c_green}    ___    _   ___   _________  __ ___
                                        ${c_reset}
 -${c_dim}-------------------------------------${c_reset}-
 ${c_purple}github.com/igdrion/ANNEXA${c_reset}
-Reference Annotation: ${params.gtf}
-Reference Genome    : ${params.fa}
-Input Samplesheet   : ${params.input}
+Reference Annotation  : ${params.gtf}
+Reference Genome      : ${params.fa}
+Input Samplesheet     : ${params.input}
 ---
-Filtering           : ${params.filter}
-Tfkmers Model       : ${params.tfkmers_model}
-Tfkmers Tokenizer   : ${params.tfkmers_tokenizer}
-Tfkmers Threshold   : ${params.tfkmers_threshold}
-Bambu Threshold     : ${params.bambu_threshold}
-Filtering operation : ${params.operation}
-Stranded            : ${params.bambu_strand}
+Transcript discovery  : ${params.tx_discovery}
+Filtering             : ${params.filter}
+Tfkmers Model         : ${params.tfkmers_model}
+Tfkmers Tokenizer     : ${params.tfkmers_tokenizer}
+Tfkmers Threshold     : ${params.tfkmers_threshold}
+Bambu Threshold       : ${params.bambu_threshold}
+Filtering operation   : ${params.operation}
+Stranded              : ${params.bambu_strand}
 -${c_dim}-------------------------------------${c_reset}-
 """.stripIndent()
 }
diff --git a/modules/merge_stringtie_quant.nf b/modules/merge_stringtie_quant.nf
@@ -0,0 +1,31 @@
+process MERGE_COUNTS {
+  publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: '*.txt'
+
+  input:
+  path gene_counts
+  path tx_counts
+
+  output:
+  path "counts_gene.txt", emit: gene_counts
+  path "counts_transcript.txt", emit: tx_counts
+  path "empty.ndr", emit: ndr
+
+  shell:
+  '''
+  # Merge the individual outputs of featurecount of each .bam into a single file
+
+  paste !{gene_counts} \
+  | awk '{printf("%s ",$1); for (i=2;i<=NF;i+=2){printf("%s ",$i)}print "\\n"}' \
+  | grep -v -e "^$"  \
+  | awk -v OFS='\\t' '{$1=$1}1' > counts_gene.txt
+
+  paste !{tx_counts} \
+  | awk '{printf("\\n%s %s ",$1,$2); for (i=3;i<=NF;i+=3){printf("%s ",$i)}}' \
+  | grep -v -e "^$"  \
+  | awk -v OFS='\\t' '{$1=$1}1' > counts_transcript.txt
+
+  # Create empty NDR file for TFKMERS to have same workflow as bambu in filtering step 
+  # (placeholder, not used)
+  touch empty.ndr
+  '''
+}