Skip to content

Commit

Permalink
Merge pull request #19 from N-Hoffmann/stringtie
Browse files Browse the repository at this point in the history
Add stringtie2 support
  • Loading branch information
N-Hoffmann authored Jun 4, 2024
2 parents d38d19a + d802405 commit e8d08fe
Show file tree
Hide file tree
Showing 16 changed files with 390 additions and 37 deletions.
34 changes: 26 additions & 8 deletions bin/filter_gtf_ndr.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,17 @@ def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, StrandRecord]]:
return s, strand_dict


def filter_count_matrix(file, transcripts, wr):
def filter_count_matrix(file, transcripts, quant_method, wr):
if quant_method == "bambu":
tx_prefix = "BambuTx"
else:
tx_prefix = "MSTRG"

print(next(file), file=wr)
for line in file:
line_splitted = line.split("\t")
if (
line_splitted[0].startswith("BambuTx")
line_splitted[0].startswith(tx_prefix)
and line_splitted[0].lower() not in transcripts
):
continue
Expand All @@ -84,11 +89,12 @@ def filter_count_matrix(file, transcripts, wr):
type=argparse.FileType("r"),
required=True,
)
# Removed required=True for --bambu
parser.add_argument(
"--bambu",
help="Path to NDR generated by ANNEXA/bambu",
type=argparse.FileType("r"),
required=True,
required=True
)
parser.add_argument(
"--bambu-threshold",
Expand Down Expand Up @@ -116,18 +122,30 @@ def filter_count_matrix(file, transcripts, wr):
default="intersection",
choices=["union", "intersection"],
)

parser.add_argument(
"--tx_discovery",
help="Quantification method. Choices: bambu, stringtie2",
type=str,
choices=["bambu", "stringtie2"],
required=True,
)
args = parser.parse_args()

###################################################
filter_bambu, _ = parse_ndr(args.bambu, "bambu", args.bambu_threshold)
if args.tx_discovery == "bambu":
filter_bambu, _ = parse_ndr(args.bambu, "bambu", args.bambu_threshold)
filter_tfkmers, strand_dict = parse_ndr(
args.tfkmers, "tfkmers", args.tfkmers_threshold
)

if args.operation == "union":
filter = filter_bambu | filter_tfkmers
if args.tx_discovery == "stringtie2":
filter = filter_tfkmers
else:
filter = filter_bambu & filter_tfkmers
if args.operation == "union":
filter = filter_bambu | filter_tfkmers
else:
filter = filter_bambu & filter_tfkmers

with open("unformat.novel.filter.gtf", "w") as wr:
for record in GTF.parse_by_line(args.gtf):
Expand All @@ -143,4 +161,4 @@ def filter_count_matrix(file, transcripts, wr):
print(record, file=wr)

with open("counts_transcript.filter.txt", "w") as wr:
filter_count_matrix(args.counts_tx, filter, wr)
filter_count_matrix(args.counts_tx, filter, args.tx_discovery, wr)
35 changes: 35 additions & 0 deletions bin/gtf2gtf_cleanall.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash


# Format a gtf file with only 12 fields if more (to be used by overlap)

#!/bin/bash


# Format a bed file to an tabulated bed

# USAGE

usage() {
echo "#" >&2
echo "# Format a gtf 12+ in cleaned gtf 12 to be used by overlap...">&2
echo "# USAGE: `basename $0` <file.gtf>">&2

}
################################################################################
infile=$1

if [ -p /dev/stdin ];then

#from STDIN
cat /dev/stdin | awk '{for (i=1;i<9;i++) {printf("%s\t",$i) }; for (i=9;i<=NF;i++) {printf("%s ",$i)}; print ""}'

elif [ $# -eq 1 ];then

awk '{for (i=1;i<9;i++) {printf("%s\t",$i) }; for (i=9;i<=NF;i++) {printf("%s ",$i)}; print ""}' $infile

else
echo "#Error! no argument file or file empty !"
usage;
exit 0;
fi
24 changes: 18 additions & 6 deletions bin/qc_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,15 @@ def get_ref_length(file):
return ref


def qc_gtf(gtf, gene_counts, ref):
def qc_gtf(gtf, gene_counts, ref, tx_discovery):
ref_start_end = get_ref_length(ref)
gene_counts = parse_gene_counts(gene_counts)

if tx_discovery == "bambu":
gene_prefix,tx_prefix = "BambuGene","BambuTx"
else:
gene_prefix,tx_prefix = "MSTRG.","MSTRG."

# CSV headers
gene_str = f"gene_id,gene_biotype,nb_transcripts,length,ext_5,ext_3,discovery,validate_by,presents_in_sample\n"
transcript_str = (
Expand All @@ -55,15 +60,15 @@ def qc_gtf(gtf, gene_counts, ref):

g_id = gene["gene_id"]
g_biotype = gene["gene_biotype"]
g_status = "novel" if g_id.startswith(('BambuGene','unstranded.Gene')) else "known"
g_status = "novel" if g_id.startswith((gene_prefix,'unstranded.Gene')) else "known"
g_count = gene_counts[g_id]["counts"] # Counts in all samples
g_samples = gene_counts[g_id]["validates"] # Found in x samples
g_nb_tx = len(gene.transcripts) # Number of isoforms

# Compute genomic extension with start/end in ref
ext_5 = 0
ext_3 = 0
if not g_id.startswith(('BambuGene','unstranded.Gene')):
if not g_id.startswith((gene_prefix,'unstranded.Gene')):
if gene.strand == "+":
ext_5 = ref_start_end[gene["gene_id"]]["start"] - gene.start
ext_3 = gene.end - ref_start_end[gene["gene_id"]]["end"]
Expand All @@ -82,7 +87,7 @@ def qc_gtf(gtf, gene_counts, ref):
for transcript in gene.transcripts:
tx_id = transcript["transcript_id"]
tx_biotype = transcript["transcript_biotype"]
tx_status = "novel" if tx_id.startswith("BambuTx") else "known"
tx_status = "novel" if tx_id.startswith(tx_prefix) else "known"
tx_nb_exons = len(transcript.exons)
tx_length = sum([len(exon) for exon in transcript.exons])

Expand Down Expand Up @@ -111,7 +116,7 @@ def qc_gtf(gtf, gene_counts, ref):
)
parser.add_argument(
"-c_gene",
help="Path to bambu counts (genes) file.",
help="Path to bambu or stringtie2 counts (genes) file.",
type=argparse.FileType("r"),
required=True,
)
Expand All @@ -127,9 +132,16 @@ def qc_gtf(gtf, gene_counts, ref):
type=str,
required=True,
)
parser.add_argument(
"-tx_discovery",
help="Quantification method. Choices: bambu, stringtie2",
type=str,
choices=["bambu", "stringtie2"],
required=True,
)
args = parser.parse_args()

gene, transcript, exon = qc_gtf(args.gtf, args.c_gene, args.ref)
gene, transcript, exon = qc_gtf(args.gtf, args.c_gene, args.ref, args.tx_discovery)
with open(f"{args.prefix}.gene.stats", "w") as fd:
fd.write(gene)
with open(f"{args.prefix}.transcript.stats", "w") as fd:
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dependencies:
- conda-forge::r-viridis

- conda-forge::procps-ng
- conda-forge::mkl==2024.0
- conda-forge::mkl<=2024.0

- pip
- pip:
Expand Down
53 changes: 44 additions & 9 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ else { exit 1, "Reference annotation file not specified!" }
if (params.fa) { ref_fa = file(params.fa, checkIfExists: true) }
else { exit 1, "Reference genome file not specified!" }

if (params.tx_discovery != 'bambu' && params.tx_discovery != 'stringtie2') {
exit 1, "Please specify a valid quantification method ('bambu' (default) or 'stringtie2')."
}

if (params.filter) {
if (params.tfkmers_model) {
Expand All @@ -31,7 +34,13 @@ nextflow.enable.dsl=2
include { VALIDATE_INPUT_GTF } from './modules/input/validate.nf'
include { INDEX_BAM } from './modules/index_bam.nf'
include { BAMBU } from './modules/bambu/bambu.nf'
include { BAMBU_SPLIT_RESULTS } from './modules/bambu/split.nf'
include { STRINGTIE_ASSEMBLE } from './modules/stringtie/stringtie_assemble.nf'
include { STRINGTIE_MERGE } from './modules/stringtie/stringtie_merge.nf'
include { STRINGTIE_QUANTIFY } from './modules/stringtie/stringtie_quant.nf'
include { GFFCOMPARE } from './modules/gffcompare/gffcompare.nf'
include { SPLIT_EXTENDED_ANNOTATION } from './modules/split_extended_annotation.nf'
include { SUBREAD_FEATURECOUNTS } from './modules/subread/subread_featurecounts.nf'
include { MERGE_COUNTS } from './modules/merge_stringtie_quant.nf'
include { FEELNC_CODPOT } from './modules/feelnc/codpot.nf'
include { FEELNC_FORMAT } from './modules/feelnc/format.nf'
include { RESTORE_BIOTYPE } from './modules/restore_biotypes.nf'
Expand All @@ -57,35 +66,61 @@ workflow {
///////////////////////////////////////////////////////////////////////////
// NEW TRANSCRIPTS DISCOVERY
///////////////////////////////////////////////////////////////////////////
BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
BAMBU_SPLIT_RESULTS(BAMBU.out.bambu_gtf)
if(params.tx_discovery == "bambu") {
BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf)
}
else if (params.tx_discovery == "stringtie2") {
STRINGTIE_ASSEMBLE(samples, VALIDATE_INPUT_GTF.out)
STRINGTIE_MERGE(STRINGTIE_ASSEMBLE.out.stringtie_assemble_gtf.collect(), VALIDATE_INPUT_GTF.out)
STRINGTIE_QUANTIFY(samples, STRINGTIE_MERGE.out.stringtie_merged_gtf)
GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, STRINGTIE_MERGE.out.stringtie_merged_gtf)
SUBREAD_FEATURECOUNTS(samples, GFFCOMPARE.out.stringtie_gtf)
MERGE_COUNTS(SUBREAD_FEATURECOUNTS.out.gene_counts.collect(), SUBREAD_FEATURECOUNTS.out.tx_counts.collect())
SPLIT_EXTENDED_ANNOTATION(GFFCOMPARE.out.stringtie_gtf)
}

///////////////////////////////////////////////////////////////////////////
// EXTRACT AND CLASSIFY NEW TRANSCRIPTS, AND PERFORM QC
///////////////////////////////////////////////////////////////////////////
FEELNC_CODPOT(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU_SPLIT_RESULTS.out.novel_genes)
FEELNC_CODPOT(VALIDATE_INPUT_GTF.out, ref_fa, SPLIT_EXTENDED_ANNOTATION.out.novel_genes)
FEELNC_FORMAT(FEELNC_CODPOT.out.mRNA, FEELNC_CODPOT.out.lncRNA)
RESTORE_BIOTYPE(VALIDATE_INPUT_GTF.out, BAMBU_SPLIT_RESULTS.out.novel_isoforms)
RESTORE_BIOTYPE(VALIDATE_INPUT_GTF.out, SPLIT_EXTENDED_ANNOTATION.out.novel_isoforms)
MERGE_NOVEL(FEELNC_FORMAT.out, RESTORE_BIOTYPE.out)

if(params.tx_discovery == "bambu") {
ch_gene_counts = BAMBU.out.gene_counts
ch_tx_counts = BAMBU.out.tx_counts
ch_ndr = BAMBU.out.ndr
}
else if (params.tx_discovery == "stringtie2") {
ch_gene_counts = MERGE_COUNTS.out.gene_counts
ch_tx_counts = MERGE_COUNTS.out.tx_counts
ch_ndr = MERGE_COUNTS.out.ndr
}

QC_FULL(samples,
INDEX_BAM.out,
MERGE_NOVEL.out,
VALIDATE_INPUT_GTF.out,
BAMBU.out.gene_counts,
ch_gene_counts,
"full")

///////////////////////////////////////////////////////////////////////////
// FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION
///////////////////////////////////////////////////////////////////////////
if(params.filter) {
TFKMERS(MERGE_NOVEL.out, ref_fa, BAMBU.out.ndr,
tokenizer, model, BAMBU.out.tx_counts)
TFKMERS(MERGE_NOVEL.out,
ref_fa,
ch_ndr,
tokenizer,
model,
ch_tx_counts)
QC_FILTER(samples,
INDEX_BAM.out,
TFKMERS.out.gtf,
VALIDATE_INPUT_GTF.out,
BAMBU.out.gene_counts,
ch_gene_counts,
"filter")
}
}
44 changes: 44 additions & 0 deletions modules/gffcompare/gffcompare.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
process GFFCOMPARE {
conda (params.enable_conda ? "bioconda::gffcompare" : null)
container "${ workflow.containerEngine == 'singularity' ?
'https://depot.galaxyproject.org/singularity/gffcompare:0.12.6--h9f5acd7_0' :
'biocontainers/gffcompare:0.12.6--h9f5acd7_0' }"
publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: 'extended_annotations.gtf'
cpus params.maxCpu

input:
input:
path reference_gtf
path fasta
path merged_gtf

output:
path("*.annotated.gtf"), emit: annotated_gtf
path("*.loci") , emit: loci
path("*.stats") , emit: stats
path("*.tracking") , emit: tracking
path("extended_annotations.gtf"), emit: stringtie_gtf

shell:
'''
gffcompare \
-r !{reference_gtf} \
-s !{fasta} \
!{merged_gtf}
#Reformat the output of gffcompare to correctly match novel isoforms to known genes
awk 'BEGIN{
while(getline<"gffcmp.tracking">0){
if ($4 !="u" && $4 !="r"){
split($3,gn,"|");
split($5,tx,"|");
final["\\""tx[2]"\\";"]="\\""gn[1]"\\";"
}
}
} {
if ($12 in final){
$10=final[$12]; print $0} else {print $0}
}' !{merged_gtf} | gtf2gtf_cleanall.sh > extended_annotations.gtf
'''
}
21 changes: 11 additions & 10 deletions modules/header.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,18 @@ ${c_green} ___ _ ___ _________ __ ___
${c_reset}
-${c_dim}-------------------------------------${c_reset}-
${c_purple}github.com/igdrion/ANNEXA${c_reset}
Reference Annotation: ${params.gtf}
Reference Genome : ${params.fa}
Input Samplesheet : ${params.input}
Reference Annotation : ${params.gtf}
Reference Genome : ${params.fa}
Input Samplesheet : ${params.input}
---
Filtering : ${params.filter}
Tfkmers Model : ${params.tfkmers_model}
Tfkmers Tokenizer : ${params.tfkmers_tokenizer}
Tfkmers Threshold : ${params.tfkmers_threshold}
Bambu Threshold : ${params.bambu_threshold}
Filtering operation : ${params.operation}
Stranded : ${params.bambu_strand}
Transcript discovery : ${params.tx_discovery}
Filtering : ${params.filter}
Tfkmers Model : ${params.tfkmers_model}
Tfkmers Tokenizer : ${params.tfkmers_tokenizer}
Tfkmers Threshold : ${params.tfkmers_threshold}
Bambu Threshold : ${params.bambu_threshold}
Filtering operation : ${params.operation}
Stranded : ${params.bambu_strand}
-${c_dim}-------------------------------------${c_reset}-
""".stripIndent()
}
31 changes: 31 additions & 0 deletions modules/merge_stringtie_quant.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
process MERGE_COUNTS {
publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: '*.txt'

input:
path gene_counts
path tx_counts

output:
path "counts_gene.txt", emit: gene_counts
path "counts_transcript.txt", emit: tx_counts
path "empty.ndr", emit: ndr

shell:
'''
# Merge the individual outputs of featurecount of each .bam into a single file
paste !{gene_counts} \
| awk '{printf("%s ",$1); for (i=2;i<=NF;i+=2){printf("%s ",$i)}print "\\n"}' \
| grep -v -e "^$" \
| awk -v OFS='\\t' '{$1=$1}1' > counts_gene.txt
paste !{tx_counts} \
| awk '{printf("\\n%s %s ",$1,$2); for (i=3;i<=NF;i+=3){printf("%s ",$i)}}' \
| grep -v -e "^$" \
| awk -v OFS='\\t' '{$1=$1}1' > counts_transcript.txt
# Create empty NDR file for TFKMERS to have same workflow as bambu in filtering step
# (placeholder, not used)
touch empty.ndr
'''
}
Loading

0 comments on commit e8d08fe

Please sign in to comment.