diff --git a/taranis/__main__.py b/taranis/__main__.py index 880a361..a3011a5 100644 --- a/taranis/__main__.py +++ b/taranis/__main__.py @@ -450,6 +450,24 @@ def reference_alleles( default=False, help="Create alignment files", ) +@click.option( + "-q", + "--proteine-threshold", + required=False, + nargs=1, + default=80, + type=int, + help="Threshold of protein coverage to consider as TPR. default 90", +) +@click.option( + "-i", + "--increase-sequence", + required=False, + nargs=1, + default=20, + type=int, + help="Increase the number of triplet sequences to find the stop codon. default 20", +) @click.option( "--cpus", required=False, @@ -469,6 +487,8 @@ def allele_calling( force: bool, snp: bool, alignment: bool, + proteine_threshold: int, + increase_sequence: int, cpus: int, ): _ = taranis.utils.check_additional_programs_installed( @@ -515,6 +535,8 @@ def allele_calling( inf_allele_obj, snp, alignment, + proteine_threshold, + increase_sequence, ) for assembly_file in assemblies ] @@ -525,23 +547,27 @@ def allele_calling( print(e) continue """ - results.append( - taranis.allele_calling.parallel_execution( - assemblies[0], - schema, - prediction_data, - schema_ref_files, - threshold, - perc_identity, - output, - inf_allele_obj, - snp, - alignment, + for assembly_file in assemblies: + results.append( + taranis.allele_calling.parallel_execution( + assembly_file, + schema, + prediction_data, + schema_ref_files, + threshold, + perc_identity, + output, + inf_allele_obj, + snp, + alignment, + proteine_threshold, + increase_sequence, + ) ) - ) _ = taranis.allele_calling.collect_data( results, output, snp, alignment, schema_ref_files ) finish = time.perf_counter() print(f"Allele calling finish in {round((finish-start)/60, 2)} minutes") + log.info("Allele calling finish in %s minutes", round((finish-start)/60, 2)) # sample_allele_obj.analyze_sample() diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py index c233740..cb886c3 100644 --- a/taranis/allele_calling.py +++ b/taranis/allele_calling.py @@ -8,6 +8,7 @@ from collections import OrderedDict from pathlib import Path +from Bio.Seq import Seq from Bio import SeqIO from io import StringIO @@ -35,6 +36,8 @@ def __init__( inf_alle_obj: object, snp_request: bool = False, aligment_request: bool = False, + tpr_limit: int = 80, + increase_sequence: int = 20, ): """Allele calling initial creation object @@ -48,9 +51,12 @@ def __init__( inf_alle_obj (object): object to infer alleles snp_request (bool, optional): snp saved to file. Defaults to False. aligment_request (bool, optional): allignment saved to file. Defaults to False. + tpr_limit (int, optional): lower threshold to consider trunked proteine. Defaults to 80. + increase_sequence (int, optional): increase sequence to be analysed. Defaults to 20. """ self.prediction_data = annotation # store prediction annotation self.sample_file = sample_file + self.sample_records = taranis.utils.read_fasta_file(self.sample_file, convert_to_dict=True) self.schema = schema self.ref_alleles = reference_alleles self.threshold = threshold @@ -65,9 +71,11 @@ def __init__( self.inf_alle_obj = inf_alle_obj self.snp_request = snp_request self.aligment_request = aligment_request + self.tpr_limit = tpr_limit / 100 + self.increase_sequence = increase_sequence * 3 def assign_allele_type( - self, blast_results: list, allele_file: str, allele_name: str + self, blast_results: list, allele_file: str, allele_name: str, ref_allele_seq: str ) -> list: """Assign allele type to the allele @@ -75,12 +83,16 @@ def assign_allele_type( blast_result (list): information collected by running blast allele_file (str): file name with allele sequence allele_name (str): allele name + ref_allele_seq (str): reference allele sequence Returns: list: containing allele classification, allele match id, and allele details """ + def add_sequences_(column_blast_res: list) -> bool: + pass + def check_if_plot(column_blast_res: list) -> bool: """Check if allele is partial length @@ -91,12 +103,12 @@ def check_if_plot(column_blast_res: list) -> bool: bool: True if allele is partial length """ if ( - column_blast_res[9] == "1" # check at contig start + column_blast_res[8] == "1" # check at contig start # check if contig ends is the same as match allele ends - or column_blast_res[15] == column_blast_res[10] - or column_blast_res[10] == "1" # check reverse at contig end + or column_blast_res[9] == column_blast_res[7] + or column_blast_res[9] == "1" # check reverse at contig end # check if contig start is the same as match allele start reverse - or column_blast_res[9] == column_blast_res[15] + or column_blast_res[8] == column_blast_res[7] ): return True return False @@ -119,7 +131,83 @@ def discard_low_threshold_results(blast_results: list) -> list: valid_blast_result.append(b_result) return valid_blast_result - def get_blast_details(blast_result: str, allele_name: str) -> list: + def extend_sequence_for_finding_stop_codon(split_blast_result: list) -> list: + """Extend match sequence, according the (increase_sequence) for + trying find the stop codon. + + Args: + split_blast_result (list): list having the informaction collected + by running blast + + Returns: + list: updated information if stop codon is found + """ + # collect data for checking PLOT + data_for_plot = [""] * 10 + # cop the contig length + data_for_plot[7] = split_blast_result[15] + # copy start position + data_for_plot[8] = split_blast_result[9] + # copy end position + data_for_plot[9] = split_blast_result[10] + # check if PLOT + if not check_if_plot(data_for_plot): + # fetch the sequence until the last triplet is stop codon + contig_seq = self.sample_records[split_blast_result[1]] + start_seq = int(split_blast_result[9]) + stop_seq = int(split_blast_result[10]) + if stop_seq > start_seq: + # sequence direction is forward + direction = "forward" + # adjust the sequence to be a triplet + interval = (stop_seq - start_seq) // 3 * 3 + new_stop_seq = start_seq + interval + self.increase_sequence + start_seq -= 1 + # if the increased length is higher than the contig length + # adjust the stop sequence to maximun contig length + # multiply by 3. + if stop_seq > len(contig_seq): + stop_seq = len(contig_seq) // 3 * 3 + else: + stop_seq = new_stop_seq -1 + c_sequence = contig_seq[start_seq:stop_seq] + else: + # sequence direction is reverse + direction = "reverse" + # adjust the sequence to be a triplet + interval = (start_seq - stop_seq) // 3 * 3 + new_stop_seq = start_seq - interval - self.increase_sequence + # if the increased length is lower than 0 (contig start) + # position, adjust the start sequence to minumum contig + # length multiply by 3 + if new_stop_seq < 0: + # get the minimum contig length that is multiple by 3 + stop_seq = stop_seq % 3 - 1 + else: + stop_seq = new_stop_seq + # get the sequence in reverse + c_sequence = str(Seq(contig_seq[stop_seq:start_seq]).reverse_complement()) + new_prot_conv_result = taranis.utils.convert_to_protein(c_sequence, force_coding=False, check_additional_bases=False) + # check if stop codon is found in protein sequence + # pdb.set_trace() + if "protein" in new_prot_conv_result and "*" in new_prot_conv_result["protein"]: + new_seq_length = new_prot_conv_result["protein"].index("*") * 3 + match_sequence = c_sequence[:new_seq_length] + split_blast_result[4] = str(new_seq_length) + prot_error_result = "-" + # update the start and stop position + + if direction == "forward": + # pdb.set_trace() + split_blast_result[10] = str(int(split_blast_result[9]) + new_seq_length) + else: + split_blast_result[9] = str(int(split_blast_result[10]) - new_seq_length) + # ignore the previous process if stop codon is not found + + # pdb.set_trace() + return split_blast_result + + def get_blast_details(blast_result: str, allele_name: str, ref_allele_seq) -> list: """Collect blast details and modify the order of the columns Args: @@ -144,6 +232,8 @@ def get_blast_details(blast_result: str, allele_name: str) -> list: blast_details[13] = allele quality blast_details[14] = match sequence in contig blast_details[15] = reference allele sequence + blast_details[16] = protein conversion result + blast_details[17] = predicted protein sequence """ split_blast_result = blast_result.split("\t") match_allele_name = split_blast_result[0] @@ -163,7 +253,21 @@ def get_blast_details(blast_result: str, allele_name: str) -> list: direction = "-" # remove the gaps in sequences match_sequence = split_blast_result[13].replace("-", "") - reference_sequence = split_blast_result[14].replace("-", "") + # check if the sequence is coding + prot_conv_result = taranis.utils.convert_to_protein(match_sequence, force_coding=False, check_additional_bases=True) + prot_error_result = prot_conv_result["error"] if "error" in prot_conv_result else "-" + predicted_prot_seq = prot_conv_result["protein"] if "protein" in prot_conv_result else "-" + # remove if additional sequenced are added at the end of the stop codon + pdb.set_trace() + if "additional bases added after stop codon" in prot_error_result: + new_seq_len = len(match_sequence) // 3 * 3 + match_sequence = match_sequence[:new_seq_len] + split_blast_result[4] = str(new_seq_len) + # add more sequence to find the stop codon + elif "Last sequence is not a stop codon" in prot_error_result: + split_blast_result = extend_sequence_for_finding_stop_codon(split_blast_result) + elif "Sequence does not have a start codon" in prot_error_result: + pass # get blast details blast_details = [ self.s_name, # sample name @@ -181,8 +285,11 @@ def get_blast_details(blast_result: str, allele_name: str) -> list: product_annotation, allele_quality, match_sequence, # match sequence in contig - reference_sequence, # reference allele sequence + ref_allele_seq, # reference allele sequence + prot_error_result, # protein conversion result + predicted_prot_seq, # predicted protein sequence ] + return blast_details def find_match_allele_schema(allele_file: str, match_sequence: str) -> str: @@ -196,7 +303,7 @@ def find_match_allele_schema(allele_file: str, match_sequence: str) -> str: str: allele name in the schema that match the sequence """ grep_result = taranis.utils.grep_execution( - allele_file, match_sequence, "-b1" + allele_file, match_sequence, "-xb1" ) if len(grep_result) > 0: return grep_result[0].split("_")[1] @@ -206,13 +313,13 @@ def find_match_allele_schema(allele_file: str, match_sequence: str) -> str: match_allele_schema = "" if len(valid_blast_results) == 0: # no match results labelled as LNF. details data filled with empty data - return ["LNF", "-", ["-," * 15]] + return ["LNF", "-", ["-"]* 18] if len(valid_blast_results) > 1: # could be NIPHEM or NIPH b_split_data = [] match_allele_seq = [] for valid_blast_result in valid_blast_results: - multi_allele_data = get_blast_details(valid_blast_result, allele_name) + multi_allele_data = get_blast_details(valid_blast_result, allele_name, ref_allele_seq) # get match allele sequence match_allele_seq.append(multi_allele_data[14]) b_split_data.append(multi_allele_data) @@ -232,29 +339,33 @@ def find_match_allele_schema(allele_file: str, match_sequence: str) -> str: for (idx,) in range(len(b_split_data)): b_split_data[idx][4] = classification + "_" + match_allele_schema else: - b_split_data = get_blast_details(valid_blast_results[0], allele_name) + b_split_data = get_blast_details(valid_blast_results[0], allele_name, ref_allele_seq) # found the allele in schema with the match sequence in the contig match_allele_schema = find_match_allele_schema( - allele_file, b_split_data[14] + allele_file, b_split_data[14] ) - # PLOT, ASM, ALM, INF, EXC are possible classifications - if check_if_plot(b_split_data): + # PLOT, TPR, ASM, ALM, INF, EXC are possible classifications + if match_allele_schema != "": + # exact match found labelled as EXC + classification = "EXC" + elif check_if_plot(b_split_data): # match allele is partial length labelled as PLOT classification = "PLOT" - - # check if match allele is shorter than reference allele - elif int(b_split_data[5]) < int(b_split_data[6]): + # check if protein length divided by the length of triplet matched + # sequence is lower the the tpr limit + elif b_split_data[16] == "Multiple stop codons" and b_split_data[17].index("*") / (int(b_split_data[6]) / 3) < self.tpr_limit: + # labelled as TPR + classification = "TPR" + # check if match allele is shorter than reference allele + elif int(b_split_data[6]) < int(b_split_data[5]): classification = "ASM" # check if match allele is longer than reference allele - elif int(b_split_data[5]) > int(b_split_data[6]): + elif int(b_split_data[6]) > int(b_split_data[5]): classification = "ALM" else: # if sequence was not found after running grep labelled as INF - if match_allele_schema == "": - classification = "INF" - else: - # exact match found labelled as EXC - classification = "EXC" + classification = "INF" + # assign an identification value to the new allele if match_allele_schema == "": match_allele_schema = str( @@ -289,11 +400,13 @@ def search_match_allele(self): len(self.ref_alleles), ) - alleles = OrderedDict() + alleles = taranis.utils.read_fasta_file(ref_allele, convert_to_dict=True) match_found = False + """ with open(ref_allele, "r") as fh: for record in SeqIO.parse(fh, "fasta"): alleles[record.id] = str(record.seq) + """ count_2 = 0 for r_id, r_seq in alleles.items(): count_2 += 1 @@ -321,7 +434,7 @@ def search_match_allele(self): result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name], - ) = self.assign_allele_type(blast_result, allele_file, allele_name) + ) = self.assign_allele_type(blast_result, allele_file, allele_name, r_seq) else: # Sample does not have a reference allele to be matched # Keep LNF info @@ -330,12 +443,16 @@ def search_match_allele(self): result["allele_details"][allele_name] = "LNF" # prepare the data for snp and alignment analysis - ref_allele_seq = result["allele_details"][allele_name][15] + try: + ref_allele_seq = result["allele_details"][allele_name][15] + except: + pdb.set_trace() allele_seq = result["allele_details"][allele_name][14] ref_allele_name = result["allele_details"][allele_name][3] if self.snp_request and result["allele_type"][allele_name] != "LNF": # run snp analysis + print(allele_name) result["snp_data"][allele_name] = taranis.utils.get_snp_information( ref_allele_seq, allele_seq, ref_allele_name ) @@ -346,6 +463,8 @@ def search_match_allele(self): ] = taranis.utils.get_alignment_data( ref_allele_seq, allele_seq, ref_allele_name ) + # delete blast folder + _ = taranis.utils.delete_folder(self.blast_dir) return result @@ -360,6 +479,8 @@ def parallel_execution( inf_alle_obj: object, snp_request: bool = False, aligment_request: bool = False, + trp_limit: int = 80, + increase_sequence: int = 20, ): allele_obj = AlleleCalling( sample_file, @@ -372,6 +493,8 @@ def parallel_execution( inf_alle_obj, snp_request, aligment_request, + trp_limit, + increase_sequence ) sample_name = Path(sample_file).stem stderr.print(f"[green] Analyzing sample {sample_name}") @@ -389,7 +512,7 @@ def collect_data( def stats_graphics(stats_folder: str, summary_result: dict) -> None: stderr.print("Creating graphics") log.info("Creating graphics") - allele_types = ["NIPHEM", "NIPH", "EXC", "PLOT", "ASM", "ALM", "INF", "LNF"] + allele_types = ["NIPHEM", "NIPH", "EXC", "PLOT", "ASM", "ALM", "INF", "LNF", "TPR"] # inizialize classification data classif_data = {} for allele_type in allele_types: @@ -430,7 +553,7 @@ def read_reference_alleles(ref_alleles: list) -> dict[dict]: summary_result_file = os.path.join(output, "allele_calling_summary.csv") sample_allele_match_file = os.path.join(output, "allele_calling_match.csv") sample_allele_detail_file = os.path.join(output, "matching_contig.csv") - allele_types = ["NIPHEM", "NIPH", "EXC", "PLOT", "ASM", "ALM", "INF", "LNF"] + allele_types = ["NIPHEM", "NIPH", "EXC", "PLOT", "ASM", "ALM", "INF", "LNF", "TPR"] detail_heading = [ "sample", "contig", @@ -446,7 +569,10 @@ def read_reference_alleles(ref_alleles: list) -> dict[dict]: "gene notation", "product notation", "allele quality", - "sequence", + "match sequence", + "reference allele sequence", + "protein conversion result", + "predicted protein sequence", ] summary_result = {} # used for summary file and allele classification graphics @@ -501,25 +627,26 @@ def read_reference_alleles(ref_alleles: list) -> dict[dict]: fo.write(",".join(detail_value) + "\n") # save snp to file if requested if snp_request: - snp_file = os.path.join(output, "snp_data.csv") - with open(snp_file, "w") as fo: - fo.write( - "Sample name,Locus name,Reference allele,Position,Ref,Alt,Codon Ref,Codon Alt,Amino Ref,Amino Alt,Category Ref,Category Alt\n" - ) + for result in results: for sample, values in result.items(): - for allele, snp_data in values["snp_data"].items(): - for ref_allele, snp_info_list in snp_data.items(): - for snp_info in snp_info_list: - fo.write( - sample - + "," - + allele - + "," - + ref_allele - + "," - + ",".join(snp_info) - + "\n" - ) + snp_file = os.path.join(output, sample + "_snp_data.csv") + with open(snp_file, "w") as fo: + fo.write( + "Sample name,Locus name,Reference allele,Position,Ref,Alt,Codon Ref,Codon Alt,Amino Ref,Amino Alt,Category Ref,Category Alt\n" + ) + for allele, snp_data in values["snp_data"].items(): + for ref_allele, snp_info_list in snp_data.items(): + for snp_info in snp_info_list: + fo.write( + sample + + "," + + allele + + "," + + ref_allele + + "," + + ",".join(snp_info) + + "\n" + ) # create alignment files if aligment_request: alignment_folder = os.path.join(output, "alignments") @@ -545,13 +672,11 @@ def read_reference_alleles(ref_alleles: list) -> dict[dict]: for ref_id, ref_seq in ref_alleles_seq[a_list].items(): input_buffer = StringIO() # get the reference allele sequence - input_buffer.write(">" + ref_id + "\n") + input_buffer.write(">Ref_" + ref_id + "\n") input_buffer.write(str(ref_seq) + "\n") # get the sequences for sample on the same allele for result in results: for sample, values in result.items(): - # add sample and allele name - # pdb.set_trace() # discard the allele if it is LNF if values["allele_type"][a_list] == "LNF": continue @@ -568,14 +693,20 @@ def read_reference_alleles(ref_alleles: list) -> dict[dict]: # get the sequence of the allele in sample input_buffer.write(values["allele_details"][a_list][14] + "\n") input_buffer.seek(0) - # pdb.set_trace() allele_multiple_align.append( taranis.utils.get_multiple_alignment(input_buffer) ) + # release memory input_buffer.close() - pdb.set_trace() # save multiple alignment to file + with open( + os.path.join(alignment_folder, a_list + "_multiple_alignment.aln"), "w" + ) as fo: + for alignment in allele_multiple_align: + for align in alignment: + fo.write(align) # Create graphics stats_graphics(output, summary_result) + return diff --git a/taranis/utils.py b/taranis/utils.py index f11c0c0..43beb10 100644 --- a/taranis/utils.py +++ b/taranis/utils.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import Bio.Data.CodonTable import glob import gzip import io @@ -22,8 +23,13 @@ from pathlib import Path from Bio import SeqIO from Bio.Seq import Seq +from collections import OrderedDict +import warnings +from Bio import BiopythonWarning + +import pdb log = logging.getLogger(__name__) @@ -111,6 +117,51 @@ def check_additional_programs_installed(software_list: list) -> None: sys.exit(1) return +def convert_to_protein(sequence: str, force_coding: bool =False, check_additional_bases: bool = False) -> dict: + """Check if the input sequence is a coding protein. + + Args: + sequence (str): sequence to be checked + force_coding (bool, optional): force to check if sequence is coding. Defaults to False. + check_additional_bases (bool, optional): if not multiple by 3 remove the latest sequences to check they are added after the stop codon. Defaults to False. + + Returns: + dict: protein sequence and/or error message + """ + conv_result = {} + # checck if exists start codon + if sequence[0:3] not in START_CODON_FORWARD: + return {"error": "Sequence does not have a start codon"} + if len(sequence) % 3 != 0: + if not check_additional_bases: + return {"error" : "Sequence is not a multiple of three"} + # Remove the last or second to last bases to check if there is a stop codon + new_seq_len = len(sequence) // 3 * 3 + sequence = sequence[:new_seq_len] + # this error will be overwritten if another error is found + conv_result["error"] = "additional bases added after stop codon" + + seq_sequence = Seq(sequence) + try: + seq_prot = seq_sequence.translate(table=1, cds=force_coding) + except Bio.Data.CodonTable.TranslationError as e: + log.info("Unable to translate sequence. Info message: %s ", e) + return {"error": e} + # get the latest stop codon + last_stop = seq_prot.rfind("*") + # if force_coding is False, check if there are multiple stop codons + if not force_coding: + first_stop = seq_prot.find("*") + if first_stop != last_stop: + return {"error": "Multiple stop codons","protein": str(seq_prot)} + if last_stop != len(seq_prot) - 1: + return {"error": "Last sequence is not a stop codon", "protein": str(seq_prot)} + if "error" in conv_result: + conv_result["protein"] = str(seq_prot) + return conv_result + return {"protein": str(seq_prot)} + + def create_annotation_files( fasta_file: str, @@ -334,7 +385,6 @@ def get_multiple_alignment(input_buffer: io.StringIO) -> list[str]: Returns: list[str]: list of aligned sequences """ - # output_buffer = io.StringIO() # Run MAFFT mafft_command = "mafft --auto --quiet -" # "-" tells MAFFT to read from stdin @@ -353,7 +403,7 @@ def get_multiple_alignment(input_buffer: io.StringIO) -> list[str]: # Close the file objects and process output_buffer.close() - process.close() + process.stdout.close() return multi_result @@ -374,13 +424,24 @@ def get_snp_information( Returns: dict: key: ref_sequence, value: list of snp information """ + # Supress warning that len of alt sequence not a multiple of three + warnings.simplefilter('ignore', BiopythonWarning) snp_info = {} ref_protein = str(Seq(ref_sequence).translate()) - alt_protein = str(Seq(alt_sequence).translate()) + try: + alt_protein = str(Seq(alt_sequence).translate()) + except Exception as e: + import pdb; pdb.set_trace() + + if len(alt_sequence) %3 != 0: + import pdb; pdb.set_trace() snp_line = [] - for idx, (ref, alt) in enumerate(zip(ref_sequence, alt_sequence)): - if alt != ref: + # get the shortest sequence for the loop + length_for_snp = min(len(ref_sequence), len(alt_sequence)) + for idx in range(length_for_snp): + + if ref_sequence[idx] != alt_sequence[idx]: # calculate the triplet index triplet_idx = idx // 3 # get triplet code @@ -395,8 +456,8 @@ def get_snp_information( snp_line.append( [ str(idx), - ref, - alt, + ref_sequence[idx], + alt_sequence[idx], ref_triplet, alt_triplet, ref_aa, @@ -604,7 +665,22 @@ def read_compressed_file( return out_data -def read_fasta_file(fasta_file): +def read_fasta_file(fasta_file: str, convert_to_dict=False) -> dict | str: + """Read the fasta file and return the data as a dictionary if convert_to_dict + + Args: + fasta_file (str): _description_ + convert_to_dict (bool, optional): _description_. Defaults to False. + + Returns: + dict: fasta id as key and sequence as value in str format + """ + conv_fasta = OrderedDict() + if convert_to_dict: + with open(fasta_file, "r") as fh: + for record in SeqIO.parse(fh, "fasta"): + conv_fasta[record.id] = str(record.seq) + return conv_fasta return SeqIO.parse(fasta_file, "fasta")