From 75edce44737d07ab7870de7e47aea054d4a0a3fd Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Thu, 28 Sep 2023 10:37:35 +0200 Subject: [PATCH 01/13] updated singularity tests to softmasking usage --- example/singularity-tests/test1.sh | 2 +- example/singularity-tests/test2.sh | 2 +- example/singularity-tests/test3.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/singularity-tests/test1.sh b/example/singularity-tests/test1.sh index 58c2532..227022c 100755 --- a/example/singularity-tests/test1.sh +++ b/example/singularity-tests/test1.sh @@ -45,7 +45,7 @@ if [ -d $wd ]; then rm -r $wd fi -singularity exec -B ${PWD}:${PWD} ${BRAKER_SIF} braker.pl --genome=/opt/BRAKER/example/genome.fa --bam=/opt/BRAKER/example/RNAseq.bam --softmasking --workingdir=${wd} --GENEMARK_PATH=${ETP}/gmes --threads 8 --gm_max_intergenic 10000 --skipOptimize +singularity exec -B ${PWD}:${PWD} ${BRAKER_SIF} braker.pl --genome=/opt/BRAKER/example/genome.fa --bam=/opt/BRAKER/example/RNAseq.bam --workingdir=${wd} --GENEMARK_PATH=${ETP}/gmes --threads 8 --gm_max_intergenic 10000 --skipOptimize # Important: the options --gm_max_intergenic 10000 --skipOptimize should never be applied to a real life run!!! # They were only introduced to speed up the test. Please delete them from the script if you use it for real data analysis. diff --git a/example/singularity-tests/test2.sh b/example/singularity-tests/test2.sh index ad7a2e3..89f77a0 100755 --- a/example/singularity-tests/test2.sh +++ b/example/singularity-tests/test2.sh @@ -47,7 +47,7 @@ if [ -d $wd ]; then rm -r $wd fi -singularity exec -B ${PWD}:${PWD} ${BRAKER_SIF} braker.pl --genome=/opt/BRAKER/example/genome.fa --prot_seq=/opt/BRAKER/example/proteins.fa --softmasking --workingdir=${wd} --threads 8 --gm_max_intergenic 10000 --skipOptimize +singularity exec -B ${PWD}:${PWD} ${BRAKER_SIF} braker.pl --genome=/opt/BRAKER/example/genome.fa --prot_seq=/opt/BRAKER/example/proteins.fa --workingdir=${wd} --threads 8 --gm_max_intergenic 10000 --skipOptimize # Important: the options --gm_max_intergenic 10000 --skipOptimize should never be applied to a real life run!!! # They were only introduced to speed up the test. Please delete them from the script if you use it for real data analysis. diff --git a/example/singularity-tests/test3.sh b/example/singularity-tests/test3.sh index 6a2f572..be08db8 100755 --- a/example/singularity-tests/test3.sh +++ b/example/singularity-tests/test3.sh @@ -45,7 +45,7 @@ if [ -d $wd ]; then rm -r $wd fi -singularity exec -B ${PWD}:${PWD} ${BRAKER_SIF} braker.pl --genome=/opt/BRAKER/example/genome.fa --prot_seq=/opt/BRAKER/example/proteins.fa --bam=/opt/BRAKER/example/RNAseq.bam --softmasking --workingdir=${wd} \ +singularity exec -B ${PWD}:${PWD} ${BRAKER_SIF} braker.pl --genome=/opt/BRAKER/example/genome.fa --prot_seq=/opt/BRAKER/example/proteins.fa --bam=/opt/BRAKER/example/RNAseq.bam --workingdir=${wd} \ --threads 8 --gm_max_intergenic 10000 --skipOptimize # Important: the options --gm_max_intergenic 10000 --skipOptimize should never be applied to a real life run!!! From 82b70f90c19cbf3627630706d0eecc5e6386fbb2 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Thu, 28 Sep 2023 13:55:56 +0200 Subject: [PATCH 02/13] file transfer --- scripts/braker_cleanup.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/braker_cleanup.pl b/scripts/braker_cleanup.pl index 8a42656..8fa33f2 100755 --- a/scripts/braker_cleanup.pl +++ b/scripts/braker_cleanup.pl @@ -121,7 +121,8 @@ "GeneMark-ET/data", "GeneMark-ET/info", "GeneMark-ET/output", "GeneMark-ET/run", "GeneMark-EP/data", "GeneMark-EP/info", "GeneMark-EP/output", "GeneMark-EP/run", "GeneMark-ETP/etp_data", "GeneMark-ETP/rnaseq/gmst/", "GeneMark-ETP/rnaseq/hisat2/", - "GeneMark-ETP/rnaseq/stringtie/", "GeneMark-ETP/arx", "GeneMark-ETP/proteins.fa/hc/regions/", + # "GeneMark-ETP/rnaseq/stringtie/", + "GeneMark-ETP/arx", "GeneMark-ETP/proteins.fa/hc/regions/", "GeneMark-ETP/proteins.fa/nonhc/prothint","GeneMark-ETP/rnaseq/hints/proteins.fa/prothint/Spaln", "GeneMark-ETP/proteins.fa/nonhc/for_prothint", "GeneMark-ETP/proteins.fa/nonhc/pred_m", "GeneMark-ETP/proteins.fa/nonhc/regions", "GeneMark-ETP/proteins.fa/etr", "GeneMark-ETP/proteins.fa/nonhc/hc", From f47491f71a7d1b809a12a0bee533de0da0eb1066 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Thu, 28 Sep 2023 14:10:24 +0200 Subject: [PATCH 03/13] dummy script --- scripts/stringtie2utr.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 scripts/stringtie2utr.py diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py new file mode 100644 index 0000000..e6f9a5a --- /dev/null +++ b/scripts/stringtie2utr.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import argparse + +def main(): + parser = argparse.ArgumentParser(description="Updates BRAKER gene models with UTRs from a StringTie assembly.") + + # Mandatory input arguments + parser.add_argument("-b", "--braker", required=True, help="File with BRAKER gene models in GTF format") + parser.add_argument("-s", "--stringtie", required=True, help="File with StringTie transcript models in GFF format") + + args = parser.parse_args() + + # Now you can access the input file paths using args.braker and args.stringtie + braker_file = args.braker + stringtie_file = args.stringtie + + # Add your code here to process the input files and perform the desired actions + +if __name__ == "__main__": + main() From d3601abcec53966b59e0420c043981d19eec0cea Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Thu, 28 Sep 2023 17:45:38 +0200 Subject: [PATCH 04/13] not properly working, need to debug the gene/transcript nesting information --- scripts/stringtie2utr.py | 379 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 378 insertions(+), 1 deletion(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index e6f9a5a..1cbe0b5 100644 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -1,7 +1,359 @@ #!/usr/bin/env python3 import argparse +import re +def read_gtf(gtf_file): + """ + Reads a GTF file and extracts gene and non-gene features. + + Args: + - gtf_file (str): Path to the GTF file. + + Returns: + tuple: (non_gene_dict, gene_dict) where: + - non_gene_dict (dict): Dictionary with transcript IDs as keys and lists of non-gene feature lines as values. + - gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene line as value. + """ + non_gene_dict = {} + gene_dict = {} + temp_gene_storage = {} # Temporary storage for gene lines + + transcript_id_pattern = re.compile(r'transcript_id "([^"]+)"') + + with open(gtf_file, 'r') as f: + for line in f: + if not line.startswith("#"): + fields = line.strip().split("\t") + feature_type = fields[2] + last_field = fields[-1] + + transcript_id_match = transcript_id_pattern.search(last_field) + + gene_id = None + transcript_id = None + + # Extract gene_id and transcript_id irrespective of their order in the last_field + for item in last_field.split(';'): + if "gene_id" in item: + gene_id = item.split(' ')[1].replace('"', '').strip() + if "transcript_id" in item: + transcript_id = transcript_id_match.group(1) if transcript_id_match else None + + if feature_type == "gene": + temp_gene_storage[gene_id] = line.strip() + elif feature_type == "transcript": + if transcript_id: + if gene_id in temp_gene_storage: + gene_dict[transcript_id] = temp_gene_storage[gene_id] + else: + # Format 2, no explicit gene line but inferred from the transcript + gene_dict[transcript_id] = line.strip() + non_gene_dict[transcript_id] = [line.strip()] + else: + if transcript_id: # Ensure we have a transcript ID + non_gene_dict.setdefault(transcript_id, []).append(line.strip()) + + return non_gene_dict, gene_dict + +import re + +def add_intron_features(gtf_dict): + """ + Adds intron feature lines based on the exon feature lines. + + Args: + - gtf_dict (dict): Dictionary with transcript_id as key and a list of + GTF entries as values. + + Returns: + dict: Updated GTF dictionary with intron feature lines added. + """ + + for transcript_id, entries in gtf_dict.items(): + sorted_exons = sorted([entry for entry in entries if entry.split('\t')[2] == 'exon'], key=lambda x: int(x.split('\t')[3])) + + new_entries = [] + for i in range(len(sorted_exons) - 1): + curr_exon = sorted_exons[i] + next_exon = sorted_exons[i + 1] + + curr_end = int(curr_exon.split('\t')[4]) + next_start = int(next_exon.split('\t')[3]) + + if next_start - curr_end > 1: + # Construct intron feature line based on exon line, but replace the feature name with "intron" + intron_line = curr_exon.split('\t') + intron_line[2] = 'intron' + intron_line[3] = str(curr_end + 1) + intron_line[4] = str(next_start - 1) + + # Remove exon_number attribute + intron_line[8] = re.sub(r'exon_number "\d+";', '', intron_line[8]).strip() + + # Add the intron line to new entries + new_entries.append('\t'.join(intron_line)) + + # Extend the original entries with new intron feature lines + gtf_dict[transcript_id].extend(new_entries) + + return gtf_dict + + + +def create_introns_hash(non_gene_dict): + """ + Creates a dictionary with intron strings as keys and transcript IDs as values. + + Args: + - non_gene_dict (dict): Dictionary with transcript_id as key and a list of + GTF entries as values. + + Returns: + dict: Dictionary with intron strings as keys and transcript IDs as values. + """ + + introns_hash = {} + + for transcript_id, entries in non_gene_dict.items(): + for entry in entries: + # Check if the feature is an intron + if entry.split('\t')[2] == 'intron': + seqname = entry.split('\t')[0] + start = entry.split('\t')[3] + end = entry.split('\t')[4] + strand = entry.split('\t')[6] + intron_key = f"{seqname}_{start}_{end}_{strand}" + + # Store the transcript ID associated with the intron in the hash + introns_hash[intron_key] = transcript_id + + return introns_hash + + +def find_matching_transcripts(intron_hash1, intron_hash2): + """ + Find matching transcript IDs based on intron patterns. + + Args: + - intron_hash1 (dict): Dictionary with intron strings as keys and transcript IDs from the first dataset as values. + - intron_hash2 (dict): Dictionary with intron strings as keys and transcript IDs from the second dataset as values. + + Returns: + dict: Dictionary with transcript IDs from intron_hash1 as keys and lists of matching transcript IDs from intron_hash2 as values. + """ + + # Reverse the hashes for easy lookup of intron patterns for each transcript + reverse_hash1 = {} + for intron, transcript in intron_hash1.items(): + if transcript not in reverse_hash1: + reverse_hash1[transcript] = [] + reverse_hash1[transcript].append(intron) + + reverse_hash2 = {} + for intron, transcript in intron_hash2.items(): + if transcript not in reverse_hash2: + reverse_hash2[transcript] = [] + reverse_hash2[transcript].append(intron) + + matching_transcripts = {} + + for transcript1, introns1 in reverse_hash1.items(): + matches = set() + + for transcript2, introns2 in reverse_hash2.items(): + if all(intron in introns2 for intron in introns1): + matches.add(transcript2) + + if matches: + matching_transcripts[transcript1] = list(matches) + + return matching_transcripts + + +def tx_len(non_gene_dict): + """ + Calculate the length of each transcript. + + Args: + - non_gene_dict (dict): Dictionary with transcript_id as key and a list of + GTF entries as values. + + Returns: + dict: Dictionary with transcript IDs as keys and their lengths as values. + """ + + transcript_lengths = {} + + for transcript_id, entries in non_gene_dict.items(): + total_length = 0 + + for entry in entries: + fields = entry.split('\t') + start = int(fields[3]) + end = int(fields[4]) + + total_length += (end - start + 1) # +1 because both start and end are inclusive + + transcript_lengths[transcript_id] = total_length + + return transcript_lengths + + +def select_longest_tx(matched_transcripts, tx_lens_stringtie): + """ + For each key in matched_transcripts, select the longest transcript based on lengths provided in tx_lens_stringtie. + + Args: + - matched_transcripts (dict): Dictionary with keys as transcript IDs and values as lists of matching transcript IDs. + - tx_lens_stringtie (dict): Dictionary with transcript IDs as keys and their lengths as values. + + Returns: + dict: Dictionary with keys from matched_transcripts and values as the IDs of the longest transcripts. + """ + + selected_transcripts = {} + + for original_tx, matched_tx_list in matched_transcripts.items(): + # Find the longest transcript in the matched_tx_list + longest_tx = max(matched_tx_list, key=lambda tx: tx_lens_stringtie.get(tx, 0)) + + selected_transcripts[original_tx] = longest_tx + + return selected_transcripts + + +def overlap(exon_start, exon_end, cds_start, cds_end): + """Check if exon overlaps with CDS.""" + return exon_start <= cds_end and exon_end >= cds_start + + +def merge_features(braker_gtf, stringtie_gtf, selected_transcripts): + for braker_tx, stringtie_tx in selected_transcripts.items(): + # Retrieve features for current transcripts + braker_features = braker_gtf[braker_tx] + stringtie_exons = [f for f in stringtie_gtf[stringtie_tx] if f.split('\t')[2] == 'exon'] + braker_cds_features = [f for f in braker_features if f.split('\t')[2] == 'CDS'] + + for exon in stringtie_exons: + exon_parts = exon.split('\t') + exon_start, exon_end = int(exon_parts[3]), int(exon_parts[4]) + + # Flag to determine if current exon should be merged + merge_exon = True + + for cds in braker_cds_features: + cds_parts = cds.split('\t') + cds_start, cds_end = int(cds_parts[3]), int(cds_parts[4]) + + if overlap(exon_start, exon_end, cds_start, cds_end): + # Exon overlaps with CDS. Check the length condition. + if exon_end - exon_start + 1 < cds_end - cds_start + 1: + merge_exon = False + break + + # Add exon to braker_features if it passed the checks + if merge_exon: + braker_features.append(exon) + + # Update the braker_gtf dictionary with new features + braker_gtf[braker_tx] = braker_features + + return braker_gtf + + +import re + +def compute_utr_features(braker_gtf): + """ + Compute the UTR features for each transcript in braker_gtf based on strand information. + + Args: + - braker_gtf (dict): Dictionary with transcript IDs as keys and lists of GTF feature lines as values. + + Returns: + dict: Updated braker_gtf dictionary with UTR features added. + """ + + for transcript_id, features in braker_gtf.items(): + # Sort features by start position + features.sort(key=lambda x: int(x.split('\t')[3])) + + utr_features = [] + for feature in features: + fields = feature.split('\t') + feature_type = fields[2] + start = int(fields[3]) + end = int(fields[4]) + strand = fields[6] + + # If feature is an exon, check if there are overlapping CDS features + if feature_type == "exon": + overlapping_cds = [f for f in features if f.split('\t')[2] == "CDS" and int(f.split('\t')[3]) <= end and int(f.split('\t')[4]) >= start] + + if overlapping_cds: + cds_start = int(overlapping_cds[0].split('\t')[3]) + cds_end = int(overlapping_cds[0].split('\t')[4]) + + # Check for UTR based on strand + if strand == "+": + if start < cds_start: + utr5 = "\t".join(fields[:2] + ["5'UTR"] + fields[3:]) + utr5 = utr5.replace(str(end), str(cds_start - 1)) + utr_features.append(utr5) + + if end > cds_end: + utr3 = "\t".join(fields[:2] + ["3'UTR"] + fields[3:]) + utr3 = utr3.replace(str(start), str(cds_end + 1)) + utr_features.append(utr3) + + elif strand == "-": + if end > cds_end: + utr5 = "\t".join(fields[:2] + ["5'UTR"] + fields[3:]) + utr5 = utr5.replace(str(start), str(cds_end + 1)) + utr_features.append(utr5) + + if start < cds_start: + utr3 = "\t".join(fields[:2] + ["3'UTR"] + fields[3:]) + utr3 = utr3.replace(str(end), str(cds_start - 1)) + utr_features.append(utr3) + + # Add the computed UTR features to the list of features for this transcript + features.extend(utr_features) + + return braker_gtf + + +def print_gtf(gtf_dict, gene_dict): + """ + Print GTF lines based on gene_dict and gtf_dict. + + Args: + - gtf_dict (dict): Dictionary with transcript IDs as keys and lists of GTF feature lines as values. + - gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene line as value. + + Returns: + None: Prints the GTF lines to stdout. + """ + + # Iterate over gene_dict entries + for transcript_id, gene_line in gene_dict.items(): + print("I am in a gene") + # Print the gene entry + print(gene_line) + + # Retrieve the features and sort them by start position (4th column in GTF) + sorted_features = sorted(gtf_dict.get(transcript_id, []), key=lambda x: int(x.split('\t')[3])) + + # Print corresponding transcript and other feature lines from gtf_dict + for feature in sorted_features: + # If the feature is a UTR line, remove the exon_number + if "UTR" in feature: + feature = re.sub(r'exon_number "[0-9]+";', '', feature).strip() + print(feature) + + + def main(): parser = argparse.ArgumentParser(description="Updates BRAKER gene models with UTRs from a StringTie assembly.") @@ -15,7 +367,32 @@ def main(): braker_file = args.braker stringtie_file = args.stringtie - # Add your code here to process the input files and perform the desired actions + # read the braker_file + braker_non_gene_dict, braker_gene_dict = read_gtf(braker_file) + + # read the stringtie_file + stringtie_non_gene_dict, stringtie_gene_dict = read_gtf(stringtie_file) + # add intron features to the stringtie_non_gene_dict + stringtie_non_gene_dict = add_intron_features(stringtie_non_gene_dict) + + # create introns hash for braker and stringtie_non_gene_dict) + braker_introns_hash = create_introns_hash(braker_non_gene_dict) + stringtie_introns_hash = create_introns_hash(stringtie_non_gene_dict) + # find matching transcripts + matched_transcripts = find_matching_transcripts(braker_introns_hash, stringtie_introns_hash) + + # calculate the length of each transcript + tx_lens_stringtie = tx_len(stringtie_non_gene_dict) + + # select the longest transcript for each key in matched_transcripts + final_matching_tx = select_longest_tx(matched_transcripts, tx_lens_stringtie) + + # merge features from stringtie_gtf into braker_gtf based on the selected transcripts + braker_gtf = merge_features(braker_non_gene_dict, stringtie_non_gene_dict, final_matching_tx) + # compute UTR features + braker_gtf = compute_utr_features(braker_gtf) + # print the updated braker_gtf + print_gtf(braker_gtf, braker_gene_dict) if __name__ == "__main__": main() From 78d736b63fe10d60e4109a89db882ccb519d2886 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Fri, 29 Sep 2023 12:43:59 +0200 Subject: [PATCH 05/13] adding UTRs for multi-exon transcripts seems to be working --- scripts/stringtie2utr.py | 156 ++++++++++++++++++++++++++------------- 1 file changed, 106 insertions(+), 50 deletions(-) mode change 100644 => 100755 scripts/stringtie2utr.py diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py old mode 100644 new mode 100755 index 1cbe0b5..5d9d34a --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -3,6 +3,7 @@ import argparse import re + def read_gtf(gtf_file): """ Reads a GTF file and extracts gene and non-gene features. @@ -11,15 +12,18 @@ def read_gtf(gtf_file): - gtf_file (str): Path to the GTF file. Returns: - tuple: (non_gene_dict, gene_dict) where: + tuple: (non_gene_dict, gene_dict, tx_to_gene_dict, tx_dict) where: - non_gene_dict (dict): Dictionary with transcript IDs as keys and lists of non-gene feature lines as values. - - gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene line as value. + - gene_dict (dict): Dictionary with gene IDs extracted from the last column of gene feature lines as keys and the corresponding gene line as value. + - tx_to_gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene ID as value, extracted from the last column of transcript feature lines. + - tx_dict (dict): Dictionary with transcript IDs as keys and the corresponding transcript line as value. The entire last column is used as the transcript ID. """ non_gene_dict = {} gene_dict = {} - temp_gene_storage = {} # Temporary storage for gene lines - + tx_to_gene_dict = {} + tx_dict = {} transcript_id_pattern = re.compile(r'transcript_id "([^"]+)"') + gene_id_pattern = re.compile(r'gene_id "([^"]+)"') with open(gtf_file, 'r') as f: for line in f: @@ -28,35 +32,28 @@ def read_gtf(gtf_file): feature_type = fields[2] last_field = fields[-1] - transcript_id_match = transcript_id_pattern.search(last_field) - - gene_id = None - transcript_id = None - - # Extract gene_id and transcript_id irrespective of their order in the last_field - for item in last_field.split(';'): - if "gene_id" in item: - gene_id = item.split(' ')[1].replace('"', '').strip() - if "transcript_id" in item: - transcript_id = transcript_id_match.group(1) if transcript_id_match else None - if feature_type == "gene": - temp_gene_storage[gene_id] = line.strip() + gene_dict[last_field] = line.strip() # Use the entire last field as gene_id elif feature_type == "transcript": - if transcript_id: - if gene_id in temp_gene_storage: - gene_dict[transcript_id] = temp_gene_storage[gene_id] - else: - # Format 2, no explicit gene line but inferred from the transcript - gene_dict[transcript_id] = line.strip() - non_gene_dict[transcript_id] = [line.strip()] + tx_dict[last_field] = line.strip() # Use the entire last field as transcript_id else: + transcript_id_match = transcript_id_pattern.search(last_field) + gene_id_match = gene_id_pattern.search(last_field) + gene_id = None + transcript_id = None + if transcript_id_match: + transcript_id = transcript_id_match.group(1) + if gene_id_match: + gene_id = gene_id_match.group(1) if transcript_id: # Ensure we have a transcript ID non_gene_dict.setdefault(transcript_id, []).append(line.strip()) + if transcript_id not in tx_to_gene_dict: + tx_to_gene_dict[transcript_id] = gene_id + + return non_gene_dict, gene_dict, tx_to_gene_dict, tx_dict + - return non_gene_dict, gene_dict -import re def add_intron_features(gtf_dict): """ @@ -262,8 +259,6 @@ def merge_features(braker_gtf, stringtie_gtf, selected_transcripts): return braker_gtf -import re - def compute_utr_features(braker_gtf): """ Compute the UTR features for each transcript in braker_gtf based on strand information. @@ -298,23 +293,23 @@ def compute_utr_features(braker_gtf): # Check for UTR based on strand if strand == "+": if start < cds_start: - utr5 = "\t".join(fields[:2] + ["5'UTR"] + fields[3:]) + utr5 = "\t".join(fields[:2] + ["five_prime_UTR"] + fields[3:]) utr5 = utr5.replace(str(end), str(cds_start - 1)) utr_features.append(utr5) if end > cds_end: - utr3 = "\t".join(fields[:2] + ["3'UTR"] + fields[3:]) + utr3 = "\t".join(fields[:2] + ["three_prime_UTR"] + fields[3:]) utr3 = utr3.replace(str(start), str(cds_end + 1)) utr_features.append(utr3) elif strand == "-": if end > cds_end: - utr5 = "\t".join(fields[:2] + ["5'UTR"] + fields[3:]) + utr5 = "\t".join(fields[:2] + ["five_prime_UTR"] + fields[3:]) utr5 = utr5.replace(str(start), str(cds_end + 1)) utr_features.append(utr5) if start < cds_start: - utr3 = "\t".join(fields[:2] + ["3'UTR"] + fields[3:]) + utr3 = "\t".join(fields[:2] + ["three_prime_UTR"] + fields[3:]) utr3 = utr3.replace(str(end), str(cds_start - 1)) utr_features.append(utr3) @@ -324,33 +319,92 @@ def compute_utr_features(braker_gtf): return braker_gtf -def print_gtf(gtf_dict, gene_dict): +def fix_feature_coordinates(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): + """ + The list of features in gtf_dict has been expanded compared to the original version. We need to identify the left most and right most coordinate + for each transcript. Then, update the tx_dict lines with these new coordinates. In the end, loop over all tx_dict entries and identify the left most and right + most coordinate per gene, then update the gene lines in gene_dict. + + Args: + - gtf_dict (dict): Dictionary with transcript IDs as keys and lists of non-gene feature lines as values. + - gene_dict (dict): Dictionary with gene IDs extracted from the last column of gene feature lines as keys and the corresponding gene line as value. + - tx_to_gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene ID as value, extracted from the last column of transcript feature lines. + - tx_dict (dict): Dictionary with transcript IDs as keys and the corresponding transcript line as value. The entire last column is used as the transcript ID. + + Returns: + gene_dict, tx_dict with updated coordinates + """ + # 1. Update transcript coordinates based on feature coordinates in gtf_dict + for tx_id, features in gtf_dict.items(): + starts = [] + ends = [] + for feature in features: + fields = feature.split('\t') + starts.append(int(fields[3])) + ends.append(int(fields[4])) + + # Find the min and max coordinates for the transcript + min_coord = min(starts) + max_coord = max(ends) + + tx_fields = tx_dict[tx_id].split('\t') + tx_fields[3] = str(min_coord) + tx_fields[4] = str(max_coord) + tx_dict[tx_id] = '\t'.join(tx_fields) + + # 2. Update gene coordinates based on updated transcript coordinates in tx_dict + for gene_id in gene_dict: + associated_transcripts = [tx for tx, gid in tx_to_gene_dict.items() if gid == gene_id] + + starts = [] + ends = [] + for tx_id in associated_transcripts: + tx_fields = tx_dict[tx_id].split('\t') + starts.append(int(tx_fields[3])) + ends.append(int(tx_fields[4])) + + # Find the min and max coordinates for the gene + min_coord = min(starts) + max_coord = max(ends) + + gene_fields = gene_dict[gene_id].split('\t') + gene_fields[3] = str(min_coord) + gene_fields[4] = str(max_coord) + gene_dict[gene_id] = '\t'.join(gene_fields) + + return gene_dict, tx_dict + + +def print_gtf(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): """ Print GTF lines based on gene_dict and gtf_dict. Args: - - gtf_dict (dict): Dictionary with transcript IDs as keys and lists of GTF feature lines as values. - - gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene line as value. + - gtf_dict (dict): Dictionary with transcript IDs as keys and lists of non-gene feature lines as values. + - gene_dict (dict): Dictionary with gene IDs extracted from the last column of gene feature lines as keys and the corresponding gene line as value. + - tx_to_gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene ID as value, extracted from the last column of transcript feature lines. + - tx_dict (dict): Dictionary with transcript IDs as keys and the corresponding transcript line as value. The entire last column is used as the transcript ID. Returns: None: Prints the GTF lines to stdout. """ - + printed_gene = {} # Iterate over gene_dict entries - for transcript_id, gene_line in gene_dict.items(): - print("I am in a gene") - # Print the gene entry - print(gene_line) - - # Retrieve the features and sort them by start position (4th column in GTF) - sorted_features = sorted(gtf_dict.get(transcript_id, []), key=lambda x: int(x.split('\t')[3])) - - # Print corresponding transcript and other feature lines from gtf_dict + for tx_id, tx_line in tx_dict.items(): + if not tx_id in printed_gene: + print(gene_dict[tx_to_gene_dict[tx_id]]) + gene_printed = True + print(tx_line) + sorted_features = sorted(gtf_dict.get(tx_id, []), key=lambda x: int(x.split('\t')[3])) for feature in sorted_features: # If the feature is a UTR line, remove the exon_number if "UTR" in feature: - feature = re.sub(r'exon_number "[0-9]+";', '', feature).strip() - print(feature) + # split line into fields + fields = feature.split("\t") + # build new gtf line + print(fields[0] + "\tstringtie2utr\t", "\t".join(fields[2:8]), "\ttranscript_id \"" + tx_id + "\"; gene_id \"" + tx_to_gene_dict[tx_id] + "\";") + elif "StringTie" not in feature: + print(feature) @@ -368,10 +422,10 @@ def main(): stringtie_file = args.stringtie # read the braker_file - braker_non_gene_dict, braker_gene_dict = read_gtf(braker_file) + braker_non_gene_dict, braker_gene_line_dict, braker_tx_to_gene_dict, braker_tx_dict = read_gtf(braker_file) # read the stringtie_file - stringtie_non_gene_dict, stringtie_gene_dict = read_gtf(stringtie_file) + stringtie_non_gene_dict, stringtie_gene_line_dict, stringie_tx_to_gene_dict, stringtie_tx_dict = read_gtf(stringtie_file) # add intron features to the stringtie_non_gene_dict stringtie_non_gene_dict = add_intron_features(stringtie_non_gene_dict) @@ -391,8 +445,10 @@ def main(): braker_gtf = merge_features(braker_non_gene_dict, stringtie_non_gene_dict, final_matching_tx) # compute UTR features braker_gtf = compute_utr_features(braker_gtf) + # fix gene and transcript coordinates + braker_gene_line_dict, braker_tx_dict = fix_feature_coordinates(braker_gtf, braker_gene_line_dict, braker_tx_to_gene_dict, braker_tx_dict) # print the updated braker_gtf - print_gtf(braker_gtf, braker_gene_dict) + print_gtf(braker_gtf, braker_gene_line_dict, braker_tx_to_gene_dict, braker_tx_dict) if __name__ == "__main__": main() From 48a3b77c63344324cbf1bd361f57d9d8d3b0df44 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Fri, 29 Sep 2023 16:27:10 +0200 Subject: [PATCH 06/13] adding UTRs seems to work for both single and multi exon genes, but gene lines are still printed double --- scripts/stringtie2utr.py | 294 ++++++++++++++++++++++++++++++++++----- 1 file changed, 258 insertions(+), 36 deletions(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 5d9d34a..53ef6a6 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -2,6 +2,7 @@ import argparse import re +from intervaltree import IntervalTree, Interval def read_gtf(gtf_file): @@ -131,6 +132,7 @@ def create_introns_hash(non_gene_dict): def find_matching_transcripts(intron_hash1, intron_hash2): """ Find matching transcript IDs based on intron patterns. + This only works for multi-exon genes. Args: - intron_hash1 (dict): Dictionary with intron strings as keys and transcript IDs from the first dataset as values. @@ -168,6 +170,27 @@ def find_matching_transcripts(intron_hash1, intron_hash2): return matching_transcripts +def find_single_exon_genes(non_gene_dict): + """ + Find single exon genes in non_gene_dict. + + Args: + - non_gene_dict (dict): Dictionary with transcript_id as key and a list of + GTF entries as values. + + Returns: + dict: List of transcript IDs for single exon genes. + """ + + single_exon_genes = {} + + for transcript_id, entries in non_gene_dict.items(): + # Check if the transcript has only one exon + if len([entry for entry in entries if entry.split('\t')[2] == 'exon']) == 1: + single_exon_genes[transcript_id] = True + + return single_exon_genes + def tx_len(non_gene_dict): """ Calculate the length of each transcript. @@ -225,12 +248,12 @@ def overlap(exon_start, exon_end, cds_start, cds_end): return exon_start <= cds_end and exon_end >= cds_start -def merge_features(braker_gtf, stringtie_gtf, selected_transcripts): - for braker_tx, stringtie_tx in selected_transcripts.items(): +def merge_features(tsebra_gtf, stringtie_gtf, selected_transcripts): + for tsebra_tx, stringtie_tx in selected_transcripts.items(): # Retrieve features for current transcripts - braker_features = braker_gtf[braker_tx] + tsebra_features = tsebra_gtf[tsebra_tx] stringtie_exons = [f for f in stringtie_gtf[stringtie_tx] if f.split('\t')[2] == 'exon'] - braker_cds_features = [f for f in braker_features if f.split('\t')[2] == 'CDS'] + tsebra_cds_features = [f for f in tsebra_features if f.split('\t')[2] == 'CDS'] for exon in stringtie_exons: exon_parts = exon.split('\t') @@ -239,7 +262,7 @@ def merge_features(braker_gtf, stringtie_gtf, selected_transcripts): # Flag to determine if current exon should be merged merge_exon = True - for cds in braker_cds_features: + for cds in tsebra_cds_features: cds_parts = cds.split('\t') cds_start, cds_end = int(cds_parts[3]), int(cds_parts[4]) @@ -249,28 +272,27 @@ def merge_features(braker_gtf, stringtie_gtf, selected_transcripts): merge_exon = False break - # Add exon to braker_features if it passed the checks + # Add exon to tsebra_features if it passed the checks if merge_exon: - braker_features.append(exon) + tsebra_features.append(exon) - # Update the braker_gtf dictionary with new features - braker_gtf[braker_tx] = braker_features + # Update the tsebra_gtf dictionary with new features + tsebra_gtf[tsebra_tx] = tsebra_features - return braker_gtf + return tsebra_gtf -def compute_utr_features(braker_gtf): +def compute_utr_features(tsebra_gtf): """ - Compute the UTR features for each transcript in braker_gtf based on strand information. + Compute the UTR features for each transcript in tsebra_gtf based on strand information. Args: - - braker_gtf (dict): Dictionary with transcript IDs as keys and lists of GTF feature lines as values. + - tsebra_gtf (dict): Dictionary with transcript IDs as keys and lists of GTF feature lines as values. Returns: - dict: Updated braker_gtf dictionary with UTR features added. + dict: Updated tsebra_gtf dictionary with UTR features added. """ - - for transcript_id, features in braker_gtf.items(): + for transcript_id, features in tsebra_gtf.items(): # Sort features by start position features.sort(key=lambda x: int(x.split('\t')[3])) @@ -312,11 +334,48 @@ def compute_utr_features(braker_gtf): utr3 = "\t".join(fields[:2] + ["three_prime_UTR"] + fields[3:]) utr3 = utr3.replace(str(end), str(cds_start - 1)) utr_features.append(utr3) - + # Add the computed UTR features to the list of features for this transcript features.extend(utr_features) + features.sort(key=lambda x: int(x.split('\t')[3])) + + # Modify StringTie exons to UTR features if needed + seen_cds = False + for idx, feature in enumerate(features): + fields = feature.split('\t') + start = int(fields[3]) + end = int(fields[4]) + strand = fields[6] + featuretype = fields[2] + if featuretype == "CDS": + seen_cds = True + if strand == "+" and not seen_cds: + utr_type = "five_prime_UTR" + elif strand == "-" and not seen_cds: + utr_type = "three_prime_UTR" + elif strand == "+" and seen_cds: + utr_type = "three_prime_UTR" + else: + utr_type = "five_prime_UTR" + + if "StringTie" in fields[1]: # Assuming this condition identifies StringTie exons + # Check if it has the same start as any of its neighbors + same_start = False + for offset in [-1, 1]: + if idx + offset >= 0 and idx + offset < len(features): + neighbor = features[idx + offset].split('\t') + if int(neighbor[3]) == start: + same_start = True + break + + if same_start: + # Remove the StringTie feature + features.pop(idx) + else: + fields[2] = utr_type + features[idx] = "\t".join(fields) - return braker_gtf + return tsebra_gtf def fix_feature_coordinates(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): @@ -403,10 +462,153 @@ def print_gtf(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): fields = feature.split("\t") # build new gtf line print(fields[0] + "\tstringtie2utr\t", "\t".join(fields[2:8]), "\ttranscript_id \"" + tx_id + "\"; gene_id \"" + tx_to_gene_dict[tx_id] + "\";") - elif "StringTie" not in feature: + else: + #elif "StringTie" not in feature: print(feature) +def build_tree(data): + """Build an interval tree from data. We will use that to quickly find the overlapping single exon genes/transcript pairs.""" + tree = IntervalTree() + for item in data: + start, end, value = item + tree.addi(start, end, value) + return tree + +def construct_transcript_tree(tx_dict): + """Contructs a dictionary that has sequence name as first key, then strand as second key, and transcript interval tree as value. + Internally, before calling build_tree(data), a data structure for each sequence and strand must be constructed. It looks like this: + transcripts = [(50, 550, "tx1"), (580, 950, "tx2")] + + Args: + - tx_dict (dict): Dictionary with transcript IDs as keys and lists of transcript feature lines as values. + """ + seq_strand_to_transcripts = {} + # Pagtf_dict (dict): Dictionary with transcript_id as key and a list of GTF entries as values.)rse gtf_dict to segregate transcript data by sequence name and strand + for transcript_id, transcript_data in tx_dict.items(): + fields = transcript_data.split('\t') + seq_name = fields[0] + start = int(fields[3]) + end = int(fields[4]) + strand = fields[6] + # Use setdefault to initialize a nested dictionary + seq_strand_dict = seq_strand_to_transcripts.setdefault(seq_name, {}) + # Use setdefault to initialize a list for the current strand + data_list = seq_strand_dict.setdefault(strand, []) + data_list.append((start, end, transcript_id)) + + # Build interval trees for each seq_name-strand combination + for seq_name, strand_data in seq_strand_to_transcripts.items(): + for strand, data_list in strand_data.items(): + seq_strand_to_transcripts[seq_name][strand] = build_tree(data_list) + + return seq_strand_to_transcripts + +def construct_gene_tree(tx_dict, single_exons): + """Contructs a dictionary that has sequence name as first key, then strand as second key, and transcript interval tree as value. + Internally, before calling build_tree(data), a data structure for each sequence and strand must be constructed. It looks like this: + transcripts = [(50, 550, "tx1"), (580, 950, "tx2")] + + Args: + - tx_dict (dict): Dictionary with transcript IDs as keys and lists of transcript feature lines as values. + - single_exons (dict): Dictionary with single-exon transcript IDs. We only want to build the tree for single-exon transcripts. + """ + seq_strand_to_genes = {} + # Parse tx_dict to segregate gene data by sequence name and strand, but only for single-exon transcripts + for transcript_id, line in tx_dict.items(): + # Check if the transcript_id is in single_exons + if transcript_id in single_exons: + fields = line.split('\t') + seq_name = fields[0] + start = int(fields[3]) + end = int(fields[4]) + strand = fields[6] + + # Use setdefault to initialize a nested dictionary + seq_strand_dict = seq_strand_to_genes.setdefault(seq_name, {}) + # Use setdefault to initialize a list for the current strand + data_list = seq_strand_dict.setdefault(strand, []) + data_list.append((start, end, transcript_id)) + + # Build interval trees for each seq_name-strand combination + for seq_name, strand_data in seq_strand_to_genes.items(): + for strand, data_list in strand_data.items(): + seq_strand_to_genes[seq_name][strand] = build_tree(data_list) + + return seq_strand_to_genes + +def find_overlapping_transcripts(gene_tree, transcript_tree): + """Find transcripts that overlap with genes.""" + gene_to_transcripts = {} + + # loop over the sequences in outer dict: + for seq_name, strand_data in gene_tree.items(): + # loop over the strands in the inner dict: + for strand, gene_tree in strand_data.items(): + # loop over the genes in the gene_tree: + for gene_interval in gene_tree: + overlapping_transcripts = transcript_tree[seq_name][strand].overlap(gene_interval.begin, gene_interval.end) + if overlapping_transcripts: + gene_id = gene_interval.data + gene_to_transcripts[gene_id] = [tx.data for tx in overlapping_transcripts] + + return gene_to_transcripts + +def extract_introns_from_dict(tx_dict, tx_id): + """Extract introns directly from stringtie_tx_dict.""" + introns = [] + for feature_line in tx_dict[tx_id]: + feature_type, start, end = feature_line.split("\t")[2:5] + if feature_type == "intron": + introns.append((int(start), int(end))) + return introns + +def check_overlap_compatibility(gene_to_transcripts, tsebra_tx_dict, stringtie_tx_dict): + """Check if the overlapping transcripts are compatible with the gene models. + In particular, we want to make sure that no stringtie transcript contains an intron within + the CDS feature range of the braker transcripts. Only verfy this for those gene/transcript list matches that + are in gene_to_transcripts. If an intron breaks the gene/transcript association, the transcript should be removed from the list in gene_to_transcripts. + If the list is empty, also the key should be removed. + Args: + - gene_to_transcipts (dict): has transcript IDs from braker as keys and lists of transcript IDs from stringtie as values. + - tsebra_tx_dict (dict): Dictionary with transcript IDs as keys and the corresponding transcript line as value. + - stringtie_tx_dict (dict): Dictionary with transcript IDs as keys and the corresponding transcript line as value. + + Returns: + dict: Updated gene_to_transcripts dictionary. + """ + for tsebra_tx_id, stringtie_tx_ids in list(gene_to_transcripts.items()): # Using list() to allow dictionary changes during iteration + tsebra_tx = tsebra_tx_dict[tsebra_tx_id] + for line in tsebra_tx: + if re.search(r'\tCDS\t', line): + fields = line.split('\t') + start = int(fields[3]) + end = int(fields[4]) + break # there's only one CDS because it's a single-exon transcript + # Assuming the CDS range is at the end of the transcript line in the format start-end + tsebra_cds_start, tsebra_cds_end = [start,end] + + # Store compatible transcripts + compatible_tx = [] + + for st_tx_id in stringtie_tx_ids: + tx_id = re.search(r'gene_id "[^"]+"; transcript_id "([^"]+)";', st_tx_id).group(1) + introns = extract_introns_from_dict(stringtie_tx_dict, tx_id) + + # Check if any intron overlaps with the CDS range + overlap = any(intron_start <= tsebra_cds_end and intron_end >= tsebra_cds_start for intron_start, intron_end in introns) + + if not overlap: + compatible_tx.append(tx_id) + + # Update the dictionary + if compatible_tx: + gene_to_transcripts[tsebra_tx_id] = compatible_tx + else: + del gene_to_transcripts[tsebra_tx_id] + + return gene_to_transcripts + def main(): parser = argparse.ArgumentParser(description="Updates BRAKER gene models with UTRs from a StringTie assembly.") @@ -418,37 +620,57 @@ def main(): args = parser.parse_args() # Now you can access the input file paths using args.braker and args.stringtie - braker_file = args.braker + tsebra_file = args.braker # Parse tx_dict to segregate gene data by sequence name and strand, but only for single-exon transcripts stringtie_file = args.stringtie - # read the braker_file - braker_non_gene_dict, braker_gene_line_dict, braker_tx_to_gene_dict, braker_tx_dict = read_gtf(braker_file) + # read the tsebra_file + tsebra_none_gene_dict, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict = read_gtf(tsebra_file) # read the stringtie_file - stringtie_non_gene_dict, stringtie_gene_line_dict, stringie_tx_to_gene_dict, stringtie_tx_dict = read_gtf(stringtie_file) - # add intron features to the stringtie_non_gene_dict - stringtie_non_gene_dict = add_intron_features(stringtie_non_gene_dict) + stringtie_none_gene_dict, stringtie_gene_line_dict, stringtie_tx_to_gene_dict, stringtie_tx_dict = read_gtf(stringtie_file) - # create introns hash for braker and stringtie_non_gene_dict) - braker_introns_hash = create_introns_hash(braker_non_gene_dict) - stringtie_introns_hash = create_introns_hash(stringtie_non_gene_dict) + # add intron features to the stringtie_none_gene_dict + stringtie_none_gene_dict = add_intron_features(stringtie_none_gene_dict) + + # create introns hash for braker and stringtie_none_gene_dict) + tsebra_introns_hash = create_introns_hash(tsebra_none_gene_dict) + stringtie_introns_hash = create_introns_hash(stringtie_none_gene_dict) # find matching transcripts - matched_transcripts = find_matching_transcripts(braker_introns_hash, stringtie_introns_hash) + matched_transcripts = find_matching_transcripts(tsebra_introns_hash, stringtie_introns_hash) + + # find single exon genes: + single_exon_genes = find_single_exon_genes(tsebra_none_gene_dict) + + # construct transcript trees for single exon overlap identification + seq_strand_to_transcripts = construct_transcript_tree(stringtie_tx_dict) + # construct gene trees for single exon overlap identification + seq_strand_to_genes = construct_gene_tree(tsebra_tx_dict, single_exon_genes) + + # find overlapping transcripts + overlaps = find_overlapping_transcripts(seq_strand_to_genes, seq_strand_to_transcripts) + # the overlaps might include matches where stringtie introns break cds, remove these + overlaps = check_overlap_compatibility(overlaps, tsebra_none_gene_dict, stringtie_none_gene_dict) + + # merge the single exon matches on top of the multi exon matches + matched_transcripts.update(overlaps) # calculate the length of each transcript - tx_lens_stringtie = tx_len(stringtie_non_gene_dict) + tx_lens_stringtie = tx_len(stringtie_none_gene_dict) # select the longest transcript for each key in matched_transcripts final_matching_tx = select_longest_tx(matched_transcripts, tx_lens_stringtie) - # merge features from stringtie_gtf into braker_gtf based on the selected transcripts - braker_gtf = merge_features(braker_non_gene_dict, stringtie_non_gene_dict, final_matching_tx) + # merge features from stringtie_gtf into tsebra_gtf based on the selected transcripts + tsebra_gtf = merge_features(tsebra_none_gene_dict, stringtie_none_gene_dict, final_matching_tx) # compute UTR features - braker_gtf = compute_utr_features(braker_gtf) + tsebra_gtf = compute_utr_features(tsebra_gtf) # fix gene and transcript coordinates - braker_gene_line_dict, braker_tx_dict = fix_feature_coordinates(braker_gtf, braker_gene_line_dict, braker_tx_to_gene_dict, braker_tx_dict) - # print the updated braker_gtf - print_gtf(braker_gtf, braker_gene_line_dict, braker_tx_to_gene_dict, braker_tx_dict) + tsebra_gene_line_dict, tsebra_tx_dict = fix_feature_coordinates(tsebra_gtf, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict) + # print the updated tsebra_gtf + print_gtf(tsebra_gtf, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict) + + + print(overlaps) if __name__ == "__main__": main() From d096ca20d5e47e181600b06812ba158307a2f3df Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Fri, 29 Sep 2023 16:36:20 +0200 Subject: [PATCH 07/13] I think we have a first working version for further testing --- scripts/stringtie2utr.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 53ef6a6..26148ca 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -1,5 +1,19 @@ #!/usr/bin/env python3 +""" +Script Name: stringtie2utr.py +Description: This script decorates a gtf file with genes generated by AUGUSTUS, + BRAKER, or TSEBRA, which UTR features from a stringtie gtf file. +Author: Katharina J. Hoff +Email: katharina.hoff@uni-greifswald.de +Date: September 29th 2023 + +Copyright (C) The year of copyright, Katharina J. Hoff, University of Greifswald + +This program is free software; you can redistribute it and/or modify +it under the terms of the Artistic License. +""" + import argparse import re from intervaltree import IntervalTree, Interval @@ -450,9 +464,10 @@ def print_gtf(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): printed_gene = {} # Iterate over gene_dict entries for tx_id, tx_line in tx_dict.items(): - if not tx_id in printed_gene: + gene_id = tx_to_gene_dict[tx_id] + if not gene_id in printed_gene: print(gene_dict[tx_to_gene_dict[tx_id]]) - gene_printed = True + printed_gene[gene_id] = True print(tx_line) sorted_features = sorted(gtf_dict.get(tx_id, []), key=lambda x: int(x.split('\t')[3])) for feature in sorted_features: @@ -463,7 +478,6 @@ def print_gtf(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): # build new gtf line print(fields[0] + "\tstringtie2utr\t", "\t".join(fields[2:8]), "\ttranscript_id \"" + tx_id + "\"; gene_id \"" + tx_to_gene_dict[tx_id] + "\";") else: - #elif "StringTie" not in feature: print(feature) def build_tree(data): @@ -669,8 +683,5 @@ def main(): # print the updated tsebra_gtf print_gtf(tsebra_gtf, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict) - - print(overlaps) - if __name__ == "__main__": main() From 376ec5dfafc2a5c52550066a0dc540303fb7a478 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Fri, 29 Sep 2023 16:51:16 +0200 Subject: [PATCH 08/13] better documentation and output file added --- scripts/stringtie2utr.py | 83 +++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 26148ca..072ac64 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -448,11 +448,12 @@ def fix_feature_coordinates(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): return gene_dict, tx_dict -def print_gtf(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): +def print_gtf(filename, gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): """ Print GTF lines based on gene_dict and gtf_dict. Args: + - filename (str): Path to the output file. - gtf_dict (dict): Dictionary with transcript IDs as keys and lists of non-gene feature lines as values. - gene_dict (dict): Dictionary with gene IDs extracted from the last column of gene feature lines as keys and the corresponding gene line as value. - tx_to_gene_dict (dict): Dictionary with transcript IDs as keys and the corresponding gene ID as value, extracted from the last column of transcript feature lines. @@ -462,23 +463,28 @@ def print_gtf(gtf_dict, gene_dict, tx_to_gene_dict, tx_dict): None: Prints the GTF lines to stdout. """ printed_gene = {} - # Iterate over gene_dict entries - for tx_id, tx_line in tx_dict.items(): - gene_id = tx_to_gene_dict[tx_id] - if not gene_id in printed_gene: - print(gene_dict[tx_to_gene_dict[tx_id]]) - printed_gene[gene_id] = True - print(tx_line) - sorted_features = sorted(gtf_dict.get(tx_id, []), key=lambda x: int(x.split('\t')[3])) - for feature in sorted_features: - # If the feature is a UTR line, remove the exon_number - if "UTR" in feature: - # split line into fields - fields = feature.split("\t") - # build new gtf line - print(fields[0] + "\tstringtie2utr\t", "\t".join(fields[2:8]), "\ttranscript_id \"" + tx_id + "\"; gene_id \"" + tx_to_gene_dict[tx_id] + "\";") - else: - print(feature) + try: + with open(filename, 'w') as f: + # Iterate over gene_dict entries + for tx_id, tx_line in tx_dict.items(): + gene_id = tx_to_gene_dict[tx_id] + if not gene_id in printed_gene: + f.write(gene_dict[tx_to_gene_dict[tx_id]] + "\n") + printed_gene[gene_id] = True + f.write(tx_line + "\n") + sorted_features = sorted(gtf_dict.get(tx_id, []), key=lambda x: int(x.split('\t')[3])) + for feature in sorted_features: + # If the feature is a UTR line, remove the exon_number + if "UTR" in feature: + # split line into fields + fields = feature.split("\t") + # build new gtf line + f.write(fields[0] + "\tstringtie2utr\t" + "\t".join(fields[2:8]) + "\ttranscript_id \"" + tx_id + "\"; gene_id \"" + tx_to_gene_dict[tx_id] + "\";" + "\n") + else: + f.write(feature + "\n") + except IOError: + print("Could not write to file: " + filename) + exit(1) def build_tree(data): """Build an interval tree from data. We will use that to quickly find the overlapping single exon genes/transcript pairs.""" @@ -625,63 +631,60 @@ def check_overlap_compatibility(gene_to_transcripts, tsebra_tx_dict, stringtie_t def main(): - parser = argparse.ArgumentParser(description="Updates BRAKER gene models with UTRs from a StringTie assembly.") + parser = argparse.ArgumentParser(description="Updates GaiusAugustus gene models with UTRs from a StringTie assembly.") # Mandatory input arguments - parser.add_argument("-b", "--braker", required=True, help="File with BRAKER gene models in GTF format") + parser.add_argument("-g", "--genes", required=True, help="File with gene gene models in GTF format, typically the output of Augustus, BRAKER, or TSEBRA.") parser.add_argument("-s", "--stringtie", required=True, help="File with StringTie transcript models in GFF format") + parser.add_argument("-o", "--output", required=True, help="Output file name, file is in GTF format.") args = parser.parse_args() - # Now you can access the input file paths using args.braker and args.stringtie - tsebra_file = args.braker # Parse tx_dict to segregate gene data by sequence name and strand, but only for single-exon transcripts + tsebra_file = args.genes stringtie_file = args.stringtie - # read the tsebra_file + # read the tsebra_file with gene models tsebra_none_gene_dict, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict = read_gtf(tsebra_file) - # read the stringtie_file stringtie_none_gene_dict, stringtie_gene_line_dict, stringtie_tx_to_gene_dict, stringtie_tx_dict = read_gtf(stringtie_file) - # add intron features to the stringtie_none_gene_dict stringtie_none_gene_dict = add_intron_features(stringtie_none_gene_dict) - # create introns hash for braker and stringtie_none_gene_dict) + # in order to match genes and transcripts efficiently, we will take 2 different approaches. For multi-exon genes, + # we create intron hashes for matching transcripts. For single-exon genes, we construct interval trees. + # first the multi-exon gene matching tsebra_introns_hash = create_introns_hash(tsebra_none_gene_dict) stringtie_introns_hash = create_introns_hash(stringtie_none_gene_dict) - # find matching transcripts matched_transcripts = find_matching_transcripts(tsebra_introns_hash, stringtie_introns_hash) - # find single exon genes: + # find single exon genes, these now do not have UTRs yet single_exon_genes = find_single_exon_genes(tsebra_none_gene_dict) - # construct transcript trees for single exon overlap identification + # construct transcript trees seq_strand_to_transcripts = construct_transcript_tree(stringtie_tx_dict) - # construct gene trees for single exon overlap identification seq_strand_to_genes = construct_gene_tree(tsebra_tx_dict, single_exon_genes) - - # find overlapping transcripts overlaps = find_overlapping_transcripts(seq_strand_to_genes, seq_strand_to_transcripts) # the overlaps might include matches where stringtie introns break cds, remove these overlaps = check_overlap_compatibility(overlaps, tsebra_none_gene_dict, stringtie_none_gene_dict) - # merge the single exon matches on top of the multi exon matches matched_transcripts.update(overlaps) - # calculate the length of each transcript + # We may now have several alternative RNA-Seq inferred transcripts that match a single predicted transcript + # we will select the longest of these. For this calculate the length of each transcript tx_lens_stringtie = tx_len(stringtie_none_gene_dict) - - # select the longest transcript for each key in matched_transcripts final_matching_tx = select_longest_tx(matched_transcripts, tx_lens_stringtie) - # merge features from stringtie_gtf into tsebra_gtf based on the selected transcripts + # merge features from stringtie_gtf into tsebra_gtf based on the selected transcripts, this is a simple concatenation, + # UTR features are not inferred, yet tsebra_gtf = merge_features(tsebra_none_gene_dict, stringtie_none_gene_dict, final_matching_tx) - # compute UTR features + # compute UTR features, remove StringTie exon features tsebra_gtf = compute_utr_features(tsebra_gtf) - # fix gene and transcript coordinates + + # expanding gene/transcripts by UTRs shifts the coordinates of these features, we need to update the coordinates in the gene and transcript lines tsebra_gene_line_dict, tsebra_tx_dict = fix_feature_coordinates(tsebra_gtf, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict) + # print the updated tsebra_gtf - print_gtf(tsebra_gtf, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict) + print_gtf(args.output, tsebra_gtf, tsebra_gene_line_dict, tsebra_tx_to_gene_dict, tsebra_tx_dict) if __name__ == "__main__": main() From 88d15f5a1251715d8720b8d092df4e1427a942f5 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Fri, 29 Sep 2023 17:09:41 +0200 Subject: [PATCH 09/13] fixing a comment --- scripts/stringtie2utr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 072ac64..9f32e0c 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -503,7 +503,7 @@ def construct_transcript_tree(tx_dict): - tx_dict (dict): Dictionary with transcript IDs as keys and lists of transcript feature lines as values. """ seq_strand_to_transcripts = {} - # Pagtf_dict (dict): Dictionary with transcript_id as key and a list of GTF entries as values.)rse gtf_dict to segregate transcript data by sequence name and strand + # Parse gtf_dict to segregate transcript data by sequence name and strand for transcript_id, transcript_data in tx_dict.items(): fields = transcript_data.split('\t') seq_name = fields[0] From 8904c4a68dc9605fc7f3f6f275b67fbeaf157431 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Mon, 2 Oct 2023 11:32:47 +0200 Subject: [PATCH 10/13] fixing that no overlap is searched if seq & strand are empty for stringtie --- scripts/stringtie2utr.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 9f32e0c..07ac137 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -566,10 +566,12 @@ def find_overlapping_transcripts(gene_tree, transcript_tree): for strand, gene_tree in strand_data.items(): # loop over the genes in the gene_tree: for gene_interval in gene_tree: - overlapping_transcripts = transcript_tree[seq_name][strand].overlap(gene_interval.begin, gene_interval.end) - if overlapping_transcripts: - gene_id = gene_interval.data - gene_to_transcripts[gene_id] = [tx.data for tx in overlapping_transcripts] + if seq_name in transcript_tree: + if strand in transcript_tree[seq_name]: + overlapping_transcripts = transcript_tree[seq_name][strand].overlap(gene_interval.begin, gene_interval.end) + if overlapping_transcripts: + gene_id = gene_interval.data + gene_to_transcripts[gene_id] = [tx.data for tx in overlapping_transcripts] return gene_to_transcripts From c5632ab5336d43007c7d646449933e5dab8bec3f Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Mon, 16 Oct 2023 14:13:11 +0200 Subject: [PATCH 11/13] intermediate stage before switching branches --- scripts/stringtie2utr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 07ac137..03a4669 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -363,7 +363,7 @@ def compute_utr_features(tsebra_gtf): featuretype = fields[2] if featuretype == "CDS": seen_cds = True - if strand == "+" and not seen_cds: + if strand == "+" and not seen_cds:setfacl -R -m u:gabriell:rwx /home/nas-hs/projs/ utr_type = "five_prime_UTR" elif strand == "-" and not seen_cds: utr_type = "three_prime_UTR" From 77387c7800645023f604300a76c0731697dea2af Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Sat, 21 Oct 2023 12:54:29 +0200 Subject: [PATCH 12/13] removing accidental insert --- scripts/stringtie2utr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/stringtie2utr.py b/scripts/stringtie2utr.py index 03a4669..07ac137 100755 --- a/scripts/stringtie2utr.py +++ b/scripts/stringtie2utr.py @@ -363,7 +363,7 @@ def compute_utr_features(tsebra_gtf): featuretype = fields[2] if featuretype == "CDS": seen_cds = True - if strand == "+" and not seen_cds:setfacl -R -m u:gabriell:rwx /home/nas-hs/projs/ + if strand == "+" and not seen_cds: utr_type = "five_prime_UTR" elif strand == "-" and not seen_cds: utr_type = "three_prime_UTR" From 4c36c137f5201d85b851d490c525c7f1d5704388 Mon Sep 17 00:00:00 2001 From: KatharinaHoff Date: Fri, 22 Dec 2023 15:11:33 +0100 Subject: [PATCH 13/13] addressing issue https://github.com/Gaius-Augustus/BRAKER/issues/627 --- scripts/braker.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/braker.pl b/scripts/braker.pl index a0fb923..9559f68 100755 --- a/scripts/braker.pl +++ b/scripts/braker.pl @@ -5980,7 +5980,10 @@ sub training_augustus { . (localtime) . ": creating softlink from $gmGtf to $trainGenesGtf.\n" if ($v > 3); - $cmdString = "ln -s $gmGtf $trainGenesGtf"; + # shorten the $gmGtf path to avoid problems with absolute softlink + my @pp = split(/\//, $gmGtf); + my $gmGtfShort = $pp[-2]."/".$pp[-1]; + $cmdString = "ln -s $gmGtfShort $trainGenesGtf"; print LOG "$cmdString\n" if ($v > 3); system($cmdString) == 0 or clean_abort("$AUGUSTUS_CONFIG_PATH/species/$species",