From 16ebfe027b5d00da97df4879254aeebc0277f993 Mon Sep 17 00:00:00 2001 From: luissian Date: Thu, 7 Mar 2024 00:03:18 +0100 Subject: [PATCH 1/4] working on allele calling, exact match --- taranis/__main__.py | 72 +++++++--------- taranis/allele_calling.py | 163 ++++++++++++++++++----------------- taranis/reference_alleles.py | 2 +- taranis/utils.py | 22 +++++ 4 files changed, 138 insertions(+), 121 deletions(-) diff --git a/taranis/__main__.py b/taranis/__main__.py index f7655e1..4798a5b 100644 --- a/taranis/__main__.py +++ b/taranis/__main__.py @@ -22,6 +22,18 @@ stderr=True, force_terminal=taranis.utils.rich_force_colors() ) +def expand_wildcards(ctx, param, value): + if value: + expanded_paths = [] + for path in value: + # Check if path contains wildcard + if '*' in path: + # Expand wildcard + expanded_paths.extend(glob.glob(path)) + else: + expanded_paths.append(path) + return expanded_paths + return None def run_taranis(): # Set up the rich traceback @@ -352,7 +364,6 @@ def reference_alleles( ) for f_file in schema_files ] - # import pdb; pdb.set_trace() for future in concurrent.futures.as_completed(futures): results.append(future.result()) _ = taranis.reference_alleles.collect_statistics(results, eval_cluster, output) @@ -366,7 +377,7 @@ def reference_alleles( "--schema", required=True, multiple=False, - type=click.Path(), + type=click.Path(exists=True), help="Directory where the schema with the core gene files are located. ", ) @click.option( @@ -374,25 +385,9 @@ def reference_alleles( "--reference", required=True, multiple=False, - type=click.Path(), + type=click.Path(exists=True), help="Directory where the schema reference allele files are located. ", ) -@click.option( - "-g", - "--genome", - required=True, - multiple=False, - type=click.Path(), - help="Genome reference file", -) -@click.option( - "-a", - "--sample", - required=True, - multiple=False, - type=click.Path(), - help="Sample location file in fasta format. ", -) @click.option( "-o", "--output", @@ -401,27 +396,17 @@ def reference_alleles( type=click.Path(), help="Output folder to save reference alleles", ) +@click.argument("assemblies", callback=expand_wildcards, nargs=-1, required=True, type=click.Path(exists=True)) def allele_calling( schema, reference, - genome, - sample, + assemblies, output, ): - folder_to_check = [schema, reference] - for folder in folder_to_check: - if not taranis.utils.folder_exists(folder): - log.error("folder %s does not exists", folder) - stderr.print("[red] Folder does not exist. " + folder + "!") - sys.exit(1) - if not taranis.utils.file_exists(sample): - log.error("file %s does not exists", sample) - stderr.print("[red] File does not exist. " + sample + "!") - sys.exit(1) - schema_files = taranis.utils.get_files_in_folder(schema, "fasta") - if len(schema_files) == 0: - log.error("Schema folder %s does not have any fasta file", schema) - stderr.print("[red] Schema folder does not have any fasta file") + schema_ref_files = taranis.utils.get_files_in_folder(reference, "fasta") + if len(schema_ref_files) == 0: + log.error("Referenc allele folder %s does not have any fasta file", schema) + stderr.print("[red] reference allele folder does not have any fasta file") sys.exit(1) # Check if output folder exists @@ -440,19 +425,24 @@ def allele_calling( os.makedirs(output) except OSError as e: log.info("Unable to create folder at %s with error %s", output, e) - stderr.print("[red] ERROR. Unable to create folder " + output) + stderr.print("[red] ERROR. Unable to create {output} folder" ) sys.exit(1) # Filter fasta files from reference folder - ref_alleles = glob.glob(os.path.join(reference, "*.fasta")) + # ref_alleles = glob.glob(os.path.join(reference, "*.fasta")) # Create predictions + + """ pred_out = os.path.join(output, "prediction") pred_sample = taranis.prediction.Prediction(genome, sample, pred_out) pred_sample.training() pred_sample.prediction() + """ """Analyze the sample file against schema to identify outbreakers """ - sample_allele = taranis.allele_calling.AlleleCalling( - pred_sample, sample, schema, ref_alleles, output - ) - sample_allele.analyze_sample() + results = [] + for assembly_file in assemblies: + results.append(taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output)) + + + # sample_allele_obj.analyze_sample() diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py index 00108d4..8c7b2c6 100644 --- a/taranis/allele_calling.py +++ b/taranis/allele_calling.py @@ -1,3 +1,4 @@ +import io import logging import os import rich.console @@ -7,8 +8,11 @@ import taranis.blast # import numpy +from collections import OrderedDict import pandas as pd from pathlib import Path +from Bio import SeqIO + import pdb @@ -23,105 +27,106 @@ class AlleleCalling: - def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder): - self.prediction = prediction + def __init__(self, sample_file: str, schema: str, reference_alleles: list, out_folder:str): + # self.prediction = prediction self.sample_file = sample_file self.schema = schema self.ref_alleles = reference_alleles self.out_folder = out_folder self.s_name = Path(sample_file).stem self.blast_dir = os.path.join(out_folder, "blastdb") - self.blast_sample = os.path.join(self.blast_dir, self.s_name) - self.blast_heading = [ - "qseqid", - "sseqid", - "pident", - "qlen", - "length", - "mismatch", - "gapopen", - "evalue", - "bitscore", - "sstart", - "send", - "qstart", - "qend", - "sseq", - "qseq", - ] - - def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene): - """_summary_ - - Args: - query_seq (_type_): _description_ - allele_name (_type_): _description_ - sample_contig (_type_): _description_ - schema_gene (_type_): _description_ - """ - s_alleles_blast = taranis.blast.Blast("nucl") - ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles") - query_path = os.path.join(self.out_folder, "tmp", allele_name) - # Write to file the sequence to find out the loci name that fully match - f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name) - query_file = os.path.join(query_path, f_name) - _ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir) - # Blast with sample sequence to find the allele in the schema - seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100) - pdb.set_trace() - if len(seq_blast_match) >= 1: + # create blast for sample file + self.blast_obj = taranis.blast.Blast("nucl") + _ = self.blast_obj.create_blastdb(sample_file, self.blast_dir) + + + def assign_allele_type(self, blast_result: list, allele_file: str)->list: + def get_blast_details(blast_result: list, allele_name: str)->list: + # get blast details + blast_details = [ + blast_result[0].split("_")[0], # Core gene + self.s_name, + "gene annotation", + "product annotation", + allele_name, + "allele quality", + blast_result[1], # contig + blast_result[3], # query length + blast_result[9], # contig start + blast_result[10], # contig end + blast_result[13], # contig sequence + ] + return blast_details + + if len(blast_result) > 1: # allele is named as NIPHEM # Hacer un blast con la query esta secuencia y la database del alelo # Create blast db with sample file pass - elif len(seq_blast_match) == 1: - pass + elif len(blast_result) == 1: + column_blast_res = blast_result[0].split("\t") + sequence = column_blast_res[13].replace("-", "") + + grep_result = taranis.utils.grep_execution(allele_file, sequence, "-b1") + # check if sequence match alleles in schema + if len(grep_result) > 0: + allele_name = grep_result[0].split(">")[1] + allele_details = get_blast_details(column_blast_res, allele_name) + # allele is labled as EXACT + return ["EXACT", allele_name, allele_details] else: pass - - def search_alleles(self, ref_allele): - allele_name = Path(ref_allele).stem - schema_gene = os.path.join(self.schema, allele_name + ".fasta") - # allele_name = Path(ref_allele).stem - # run blast with sample as db and reference allele as query - sample_blast_match = self.sample_blast.run_blast(ref_allele) - if len(sample_blast_match) > 0: - pd_lines = pd.DataFrame([item.split("\t") for item in sample_blast_match]) - pd_lines.columns = self.blast_heading - pd_lines["pident"] = pd_lines["pident"].apply(pd.to_numeric) - sel_max = pd_lines.loc[pd_lines["pident"].idxmax()] - query_seq = sel_max["qseq"] - # np_lines = numpy.array(s_lines) - # convert to float the perc_identity to find out the max value - # max_val = numpy.max(np_lines[:,2].astype(float)) - # mask = np_lines[:, 2] ==str(max_val) - # Select rows that match the percent identity. Index 2 in blast results - # sel_row = np_lines[mask, :] = np_lines[mask, :] - # query_seq = sel_row[0,14] - sample_contig = sel_max["sseqid"] - _ = self.assign_allele_type( - query_seq, allele_name, sample_contig, schema_gene - ) - else: - # Sample does not have a reference allele to be matched - # Keep LNF info - # ver el codigo de espe - # lnf_tpr_tag() - pass pdb.set_trace() - def analyze_sample(self): + + def search_match_allele(self): # Create blast db with sample file - self.sample_blast = taranis.blast.Blast("nucl") - _ = self.sample_blast.create_blastdb(self.sample_file, self.blast_dir) - result = {} + + result = {"allele_type":{}, "allele_match":{}, "allele_details":{}} # pdb.set_trace() for ref_allele in self.ref_alleles: # schema_alleles = os.path.join(self.schema, ref_allele) # parallel in all CPUs in cluster node - result[ref_allele] = self.search_alleles(ref_allele) + alleles = OrderedDict() + match_found = False + with open(ref_allele, "r") as fh: + for record in SeqIO.parse(fh, "fasta"): + alleles[record.id] = str(record.seq) + + for r_id, r_seq in alleles.items(): + # create file in memory to increase speed + query_file = io.StringIO() + query_file.write(">" + r_id + "\n" + r_seq) + query_file.seek(0) + blast_result = self.blast_obj.run_blast( + query_file.read(), perc_identity=90, query_type="stdin" + ) + if len(blast_result) > 0: + match_found = True + break + # Close object and discard memory buffer + query_file.close() + if match_found: + allele_file = os.path.join(self.schema, os.path.basename(ref_allele)) + # blast_result = self.blast_obj.run_blast(q_file,perc_identity=100) + allele_name = Path(allele_file).stem + pdb.set_trace() + result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file) + pdb.set_trace() + else: + # Sample does not have a reference allele to be matched + # Keep LNF info + # ver el codigo de espe + # lnf_tpr_tag() + pass + pdb.set_trace() - return + return result + +def parallel_execution(sample_file: str, schema: str, reference_alleles: list, out_folder: str): + + allele_obj = AlleleCalling(sample_file, schema, reference_alleles, out_folder) + return allele_obj.search_match_allele() diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py index 5646455..7e25ba4 100644 --- a/taranis/reference_alleles.py +++ b/taranis/reference_alleles.py @@ -119,7 +119,7 @@ def save_reference_alleles(self, reference_alleles: list) -> str: for record in SeqIO.parse(fh, "fasta"): if record.id in reference_alleles: record_seq[record.id] = str(record.seq) - ref_allele_file = os.path.join(self.output, self.locus_name + ".fa") + ref_allele_file = os.path.join(self.output, self.locus_name + ".fasta") with open(ref_allele_file, "w") as fo: for r_id, r_seq in record_seq.items(): fo.write(">" + r_id + "\n") diff --git a/taranis/utils.py b/taranis/utils.py index 39c8179..bef5662 100644 --- a/taranis/utils.py +++ b/taranis/utils.py @@ -254,6 +254,28 @@ def get_files_in_folder(folder: str, extension: str = None) -> list[str]: folder_files = os.path.join(folder, "*." + extension) return glob.glob(folder_files) +def grep_execution(input_file: str, pattern: str, parameters: str) -> list: + """_summary_ + + Args: + input_file (str): _description_ + pattern (str): _description_ + parmeters (str): _description_ + + Returns: + list: _description_ + """ + try: + result = subprocess.run( + ["grep", parameters, pattern, input_file] , + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + log.error("Unable to run grep. Error message: %s ", e.stderr.decode()) + return [] + return result.stdout.split("\n") def prompt_text(msg): source = questionary.text(msg).unsafe_ask() From aabb94c1a4518baf44bab8c3c7975a7fc870fec6 Mon Sep 17 00:00:00 2001 From: luissian Date: Sat, 9 Mar 2024 14:02:59 +0100 Subject: [PATCH 2/4] fixed issue when reading on pandas long files --- taranis/__main__.py | 16 ++++++++++++---- taranis/distance.py | 8 +++++++- taranis/reference_alleles.py | 8 +++++--- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/taranis/__main__.py b/taranis/__main__.py index 4798a5b..f88b998 100644 --- a/taranis/__main__.py +++ b/taranis/__main__.py @@ -15,6 +15,9 @@ import taranis.reference_alleles import taranis.allele_calling +from pathlib import Path +import pdb + log = logging.getLogger() # Set up rich stderr console @@ -364,8 +367,12 @@ def reference_alleles( ) for f_file in schema_files ] - for future in concurrent.futures.as_completed(futures): - results.append(future.result()) + for future in concurrent.futures.as_completed(futures): + try: + results.append(future.result()) + except Exception as e: + print(e) + continue _ = taranis.reference_alleles.collect_statistics(results, eval_cluster, output) finish = time.perf_counter() print(f"Reference alleles finish in {round((finish-start)/60, 2)} minutes") @@ -442,7 +449,8 @@ def allele_calling( """ results = [] for assembly_file in assemblies: - results.append(taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output)) - + assembly_name = Path(assembly_file).stem + results.append({assembly_name: taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output)}) + pdb.set_trace() # sample_allele_obj.analyze_sample() diff --git a/taranis/distance.py b/taranis/distance.py index 428f596..872275d 100644 --- a/taranis/distance.py +++ b/taranis/distance.py @@ -72,7 +72,13 @@ def create_matrix(self) -> pd.DataFrame: dist_matrix.write("alleles\t" + "\t".join(allele_names) + "\n") dist_matrix.write("\n".join(out_data[1:])) dist_matrix.seek(0) - matrix_pd = pd.read_csv(dist_matrix, sep="\t", index_col="alleles").fillna(0) + file_test = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/reference_alleles_testing_full_schema_17a/error.csv" + #with open(file_test, "w") as fo: + # fo.write("alleles\t" + "\t".join(allele_names) + "\n") + # fo.write("\n".join(out_data[1:])) + + #import pdb; pdb.set_trace() + matrix_pd = pd.read_csv(dist_matrix, sep="\t", index_col="alleles", engine="python").fillna(0) # Close object and discard memory buffer dist_matrix.close() log.debug(f"create distance for {allele_name}") diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py index 7e25ba4..ee23750 100644 --- a/taranis/reference_alleles.py +++ b/taranis/reference_alleles.py @@ -121,9 +121,9 @@ def save_reference_alleles(self, reference_alleles: list) -> str: record_seq[record.id] = str(record.seq) ref_allele_file = os.path.join(self.output, self.locus_name + ".fasta") with open(ref_allele_file, "w") as fo: - for r_id, r_seq in record_seq.items(): - fo.write(">" + r_id + "\n") - fo.write(r_seq + "\n") + for ref_allele in reference_alleles: + fo.write(">" + ref_allele + "\n") + fo.write(record_seq[ref_allele] + "\n") return ref_allele_file def create_ref_alleles(self) -> dict: @@ -255,6 +255,8 @@ def stats_graphics(stats_folder: str, cluster_alleles: dict) -> None: eval_data = [] cluster_data_graph = {} clusters_list = [] + stderr.print("Process starts for collecting statistics") + log.info("Process starts for collecting statistics") # split the data into cluster and evaluation for d_allele in data_alleles: cluster_data.append(d_allele["cluster_data"]) From 86ae8a2e5785792984ed20b8df1588f24f66787e Mon Sep 17 00:00:00 2001 From: luissian Date: Sat, 9 Mar 2024 14:03:41 +0100 Subject: [PATCH 3/4] added classification --- taranis/allele_calling.py | 55 +++++++++++++++++++++++++++++---------- taranis/blast.py | 2 +- taranis/utils.py | 4 ++- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py index 8c7b2c6..8936a5e 100644 --- a/taranis/allele_calling.py +++ b/taranis/allele_calling.py @@ -40,7 +40,7 @@ def __init__(self, sample_file: str, schema: str, reference_alleles: list, out_ _ = self.blast_obj.create_blastdb(sample_file, self.blast_dir) - def assign_allele_type(self, blast_result: list, allele_file: str)->list: + def assign_allele_type(self, blast_result: list, allele_file: str, allele_name: str)->list: def get_blast_details(blast_result: list, allele_name: str)->list: # get blast details blast_details = [ @@ -63,21 +63,48 @@ def get_blast_details(blast_result: list, allele_name: str)->list: # Hacer un blast con la query esta secuencia y la database del alelo # Create blast db with sample file - - pass + pdb.set_trace() + elif len(blast_result) == 1: column_blast_res = blast_result[0].split("\t") - sequence = column_blast_res[13].replace("-", "") + column_blast_res[13] = column_blast_res[13].replace("-", "") + allele_details = get_blast_details(column_blast_res, allele_name) - grep_result = taranis.utils.grep_execution(allele_file, sequence, "-b1") + grep_result = taranis.utils.grep_execution(allele_file, column_blast_res[13], "-b1") # check if sequence match alleles in schema if len(grep_result) > 0: allele_name = grep_result[0].split(">")[1] - allele_details = get_blast_details(column_blast_res, allele_name) + # allele is labled as EXACT - return ["EXACT", allele_name, allele_details] - else: - pass + pdb.set_trace() + return ["EXC", allele_name, allele_details] + # check if contig is shorter than allele + pdb.set_trace() + if int(column_blast_res[3]) > int(column_blast_res[4]): + # check if sequence is shorter because it starts or ends at the contig + if ( + column_blast_res[9] == 1 # check at contig start + or column_blast_res[14] == column_blast_res[10] # check at contig end + or column_blast_res[10] == 1 # check reverse at contig end + or column_blast_res[9] == column_blast_res[15] # check reverse at contig start + ): + # allele is labled as PLOT + pdb.set_trace() + return ["PLOT", allele_name, allele_details] + # allele is labled as ASM + pdb.set_trace() + return ["ASM", allele_name, allele_details] + # check if contig is longer than allele + if int(column_blast_res[3]) < int(column_blast_res[4]): + # allele is labled as ALM + pdb.set_trace() + return ["ALM", allele_name, allele_details] + if int(column_blast_res[3]) == int(column_blast_res[4]): + # allele is labled as INF + pdb.set_trace() + return ["INF", allele_name, allele_details] + + pdb.set_trace() @@ -112,18 +139,18 @@ def search_match_allele(self): allele_file = os.path.join(self.schema, os.path.basename(ref_allele)) # blast_result = self.blast_obj.run_blast(q_file,perc_identity=100) allele_name = Path(allele_file).stem - pdb.set_trace() - result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file) - pdb.set_trace() + # pdb.set_trace() + result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file, allele_name) + # pdb.set_trace() else: # Sample does not have a reference allele to be matched # Keep LNF info # ver el codigo de espe # lnf_tpr_tag() - pass + pdb.set_trace() - pdb.set_trace() + return result def parallel_execution(sample_file: str, schema: str, reference_alleles: list, out_folder: str): diff --git a/taranis/blast.py b/taranis/blast.py index a51d2ad..981454c 100644 --- a/taranis/blast.py +++ b/taranis/blast.py @@ -96,7 +96,7 @@ def run_blast( if query_type == "stdin": stdin_query = query query = "-" - blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"' + blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , slen"' cline = NcbiblastnCommandline( task="blastn", db=self.out_blast_dir, diff --git a/taranis/utils.py b/taranis/utils.py index bef5662..4572013 100644 --- a/taranis/utils.py +++ b/taranis/utils.py @@ -21,6 +21,7 @@ from pathlib import Path from Bio import SeqIO +import pdb log = logging.getLogger(__name__) @@ -273,7 +274,8 @@ def grep_execution(input_file: str, pattern: str, parameters: str) -> list: text=True, ) except subprocess.CalledProcessError as e: - log.error("Unable to run grep. Error message: %s ", e.stderr.decode()) + pdb.set_trace() + log.error("Unable to run grep. Error message: %s ", e) return [] return result.stdout.split("\n") From 84b90880043a98c6e702c09343ced320660f07f3 Mon Sep 17 00:00:00 2001 From: luissian Date: Mon, 11 Mar 2024 10:29:09 +0100 Subject: [PATCH 4/4] liting --- taranis/__main__.py | 24 ++++++++++--- taranis/allele_calling.py | 66 ++++++++++++++++++++---------------- taranis/distance.py | 10 ++---- taranis/eval_cluster.py | 6 ++-- taranis/reference_alleles.py | 1 - taranis/utils.py | 5 ++- 6 files changed, 66 insertions(+), 46 deletions(-) diff --git a/taranis/__main__.py b/taranis/__main__.py index f88b998..911b521 100644 --- a/taranis/__main__.py +++ b/taranis/__main__.py @@ -25,12 +25,13 @@ stderr=True, force_terminal=taranis.utils.rich_force_colors() ) + def expand_wildcards(ctx, param, value): if value: expanded_paths = [] for path in value: # Check if path contains wildcard - if '*' in path: + if "*" in path: # Expand wildcard expanded_paths.extend(glob.glob(path)) else: @@ -38,6 +39,7 @@ def expand_wildcards(ctx, param, value): return expanded_paths return None + def run_taranis(): # Set up the rich traceback rich.traceback.install(console=stderr, width=200, word_wrap=True, extra_lines=1) @@ -403,7 +405,13 @@ def reference_alleles( type=click.Path(), help="Output folder to save reference alleles", ) -@click.argument("assemblies", callback=expand_wildcards, nargs=-1, required=True, type=click.Path(exists=True)) +@click.argument( + "assemblies", + callback=expand_wildcards, + nargs=-1, + required=True, + type=click.Path(exists=True), +) def allele_calling( schema, reference, @@ -432,7 +440,7 @@ def allele_calling( os.makedirs(output) except OSError as e: log.info("Unable to create folder at %s with error %s", output, e) - stderr.print("[red] ERROR. Unable to create {output} folder" ) + stderr.print("[red] ERROR. Unable to create {output} folder") sys.exit(1) # Filter fasta files from reference folder # ref_alleles = glob.glob(os.path.join(reference, "*.fasta")) @@ -450,7 +458,13 @@ def allele_calling( results = [] for assembly_file in assemblies: assembly_name = Path(assembly_file).stem - results.append({assembly_name: taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output)}) + results.append( + { + assembly_name: taranis.allele_calling.parallel_execution( + assembly_file, schema, schema_ref_files, output + ) + } + ) pdb.set_trace() - + # sample_allele_obj.analyze_sample() diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py index 8936a5e..c9e11f8 100644 --- a/taranis/allele_calling.py +++ b/taranis/allele_calling.py @@ -14,7 +14,6 @@ from Bio import SeqIO - import pdb log = logging.getLogger(__name__) @@ -27,7 +26,9 @@ class AlleleCalling: - def __init__(self, sample_file: str, schema: str, reference_alleles: list, out_folder:str): + def __init__( + self, sample_file: str, schema: str, reference_alleles: list, out_folder: str + ): # self.prediction = prediction self.sample_file = sample_file self.schema = schema @@ -39,22 +40,23 @@ def __init__(self, sample_file: str, schema: str, reference_alleles: list, out_ self.blast_obj = taranis.blast.Blast("nucl") _ = self.blast_obj.create_blastdb(sample_file, self.blast_dir) - - def assign_allele_type(self, blast_result: list, allele_file: str, allele_name: str)->list: - def get_blast_details(blast_result: list, allele_name: str)->list: + def assign_allele_type( + self, blast_result: list, allele_file: str, allele_name: str + ) -> list: + def get_blast_details(blast_result: list, allele_name: str) -> list: # get blast details blast_details = [ - blast_result[0].split("_")[0], # Core gene + blast_result[0].split("_")[0], # Core gene self.s_name, "gene annotation", "product annotation", - allele_name, + allele_name, "allele quality", - blast_result[1], # contig - blast_result[3], # query length - blast_result[9], # contig start - blast_result[10], # contig end - blast_result[13], # contig sequence + blast_result[1], # contig + blast_result[3], # query length + blast_result[9], # contig start + blast_result[10], # contig end + blast_result[13], # contig sequence ] return blast_details @@ -64,17 +66,19 @@ def get_blast_details(blast_result: list, allele_name: str)->list: # Hacer un blast con la query esta secuencia y la database del alelo # Create blast db with sample file pdb.set_trace() - + elif len(blast_result) == 1: column_blast_res = blast_result[0].split("\t") column_blast_res[13] = column_blast_res[13].replace("-", "") allele_details = get_blast_details(column_blast_res, allele_name) - - grep_result = taranis.utils.grep_execution(allele_file, column_blast_res[13], "-b1") + + grep_result = taranis.utils.grep_execution( + allele_file, column_blast_res[13], "-b1" + ) # check if sequence match alleles in schema if len(grep_result) > 0: allele_name = grep_result[0].split(">")[1] - + # allele is labled as EXACT pdb.set_trace() return ["EXC", allele_name, allele_details] @@ -83,10 +87,12 @@ def get_blast_details(blast_result: list, allele_name: str)->list: if int(column_blast_res[3]) > int(column_blast_res[4]): # check if sequence is shorter because it starts or ends at the contig if ( - column_blast_res[9] == 1 # check at contig start - or column_blast_res[14] == column_blast_res[10] # check at contig end - or column_blast_res[10] == 1 # check reverse at contig end - or column_blast_res[9] == column_blast_res[15] # check reverse at contig start + column_blast_res[9] == 1 # check at contig start + or column_blast_res[14] + == column_blast_res[10] # check at contig end + or column_blast_res[10] == 1 # check reverse at contig end + or column_blast_res[9] + == column_blast_res[15] # check reverse at contig start ): # allele is labled as PLOT pdb.set_trace() @@ -103,15 +109,13 @@ def get_blast_details(blast_result: list, allele_name: str)->list: # allele is labled as INF pdb.set_trace() return ["INF", allele_name, allele_details] - - - pdb.set_trace() + pdb.set_trace() def search_match_allele(self): # Create blast db with sample file - - result = {"allele_type":{}, "allele_match":{}, "allele_details":{}} + + result = {"allele_type": {}, "allele_match": {}, "allele_details": {}} # pdb.set_trace() for ref_allele in self.ref_alleles: # schema_alleles = os.path.join(self.schema, ref_allele) @@ -140,7 +144,11 @@ def search_match_allele(self): # blast_result = self.blast_obj.run_blast(q_file,perc_identity=100) allele_name = Path(allele_file).stem # pdb.set_trace() - result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file, allele_name) + ( + result["allele_type"][allele_name], + result["allele_match"][allele_name], + result["allele_details"][allele_name], + ) = self.assign_allele_type(blast_result, allele_file, allele_name) # pdb.set_trace() else: # Sample does not have a reference allele to be matched @@ -148,12 +156,12 @@ def search_match_allele(self): # ver el codigo de espe # lnf_tpr_tag() pdb.set_trace() - - return result -def parallel_execution(sample_file: str, schema: str, reference_alleles: list, out_folder: str): +def parallel_execution( + sample_file: str, schema: str, reference_alleles: list, out_folder: str +): allele_obj = AlleleCalling(sample_file, schema, reference_alleles, out_folder) return allele_obj.search_match_allele() diff --git a/taranis/distance.py b/taranis/distance.py index 872275d..df19477 100644 --- a/taranis/distance.py +++ b/taranis/distance.py @@ -72,13 +72,9 @@ def create_matrix(self) -> pd.DataFrame: dist_matrix.write("alleles\t" + "\t".join(allele_names) + "\n") dist_matrix.write("\n".join(out_data[1:])) dist_matrix.seek(0) - file_test = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/reference_alleles_testing_full_schema_17a/error.csv" - #with open(file_test, "w") as fo: - # fo.write("alleles\t" + "\t".join(allele_names) + "\n") - # fo.write("\n".join(out_data[1:])) - - #import pdb; pdb.set_trace() - matrix_pd = pd.read_csv(dist_matrix, sep="\t", index_col="alleles", engine="python").fillna(0) + matrix_pd = pd.read_csv( + dist_matrix, sep="\t", index_col="alleles", engine="python" + ).fillna(0) # Close object and discard memory buffer dist_matrix.close() log.debug(f"create distance for {allele_name}") diff --git a/taranis/eval_cluster.py b/taranis/eval_cluster.py index be2e041..5f30820 100644 --- a/taranis/eval_cluster.py +++ b/taranis/eval_cluster.py @@ -175,9 +175,9 @@ def evaluate_clusters( "alleles_not_found" ] if len(result_eval["alleles_not_in_cluster"]) > 0: - evaluation_alleles[cluster_id]["alleles_not_in_cluster"] = ( - result_eval["alleles_not_in_cluster"] - ) + evaluation_alleles[cluster_id][ + "alleles_not_in_cluster" + ] = result_eval["alleles_not_in_cluster"] else: evaluation_alleles[cluster_id]["result"] = "OK" return self.summary(evaluation_alleles) diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py index ee23750..04f8fa1 100644 --- a/taranis/reference_alleles.py +++ b/taranis/reference_alleles.py @@ -80,7 +80,6 @@ def create_distance_matrix(self) -> list: def processing_cluster_data( self, cluster_data: np.array, cluster_ptrs: np.array, position_to_allele: dict ) -> dict: - reference_alleles = [] for cluster_id, values in cluster_data.items(): center_allele = position_to_allele[values["center_id"]] diff --git a/taranis/utils.py b/taranis/utils.py index 4572013..51148aa 100644 --- a/taranis/utils.py +++ b/taranis/utils.py @@ -22,6 +22,7 @@ from Bio import SeqIO import pdb + log = logging.getLogger(__name__) @@ -255,6 +256,7 @@ def get_files_in_folder(folder: str, extension: str = None) -> list[str]: folder_files = os.path.join(folder, "*." + extension) return glob.glob(folder_files) + def grep_execution(input_file: str, pattern: str, parameters: str) -> list: """_summary_ @@ -268,7 +270,7 @@ def grep_execution(input_file: str, pattern: str, parameters: str) -> list: """ try: result = subprocess.run( - ["grep", parameters, pattern, input_file] , + ["grep", parameters, pattern, input_file], capture_output=True, check=True, text=True, @@ -279,6 +281,7 @@ def grep_execution(input_file: str, pattern: str, parameters: str) -> list: return [] return result.stdout.split("\n") + def prompt_text(msg): source = questionary.text(msg).unsafe_ask() return source