From 16ebfe027b5d00da97df4879254aeebc0277f993 Mon Sep 17 00:00:00 2001
From: luissian <chapado.l@gmail.com>
Date: Thu, 7 Mar 2024 00:03:18 +0100
Subject: [PATCH 1/4] working on allele calling, exact match

---
 taranis/__main__.py          |  72 +++++++---------
 taranis/allele_calling.py    | 163 ++++++++++++++++++-----------------
 taranis/reference_alleles.py |   2 +-
 taranis/utils.py             |  22 +++++
 4 files changed, 138 insertions(+), 121 deletions(-)

diff --git a/taranis/__main__.py b/taranis/__main__.py
index f7655e1..4798a5b 100644
--- a/taranis/__main__.py
+++ b/taranis/__main__.py
@@ -22,6 +22,18 @@
     stderr=True, force_terminal=taranis.utils.rich_force_colors()
 )
 
+def expand_wildcards(ctx, param, value):
+    if value:
+        expanded_paths = []
+        for path in value:
+            # Check if path contains wildcard
+            if '*' in path:
+                # Expand wildcard
+                expanded_paths.extend(glob.glob(path))
+            else:
+                expanded_paths.append(path)
+        return expanded_paths
+    return None
 
 def run_taranis():
     # Set up the rich traceback
@@ -352,7 +364,6 @@ def reference_alleles(
             )
             for f_file in schema_files
         ]
-    # import pdb; pdb.set_trace()
     for future in concurrent.futures.as_completed(futures):
         results.append(future.result())
     _ = taranis.reference_alleles.collect_statistics(results, eval_cluster, output)
@@ -366,7 +377,7 @@ def reference_alleles(
     "--schema",
     required=True,
     multiple=False,
-    type=click.Path(),
+    type=click.Path(exists=True),
     help="Directory where the schema with the core gene files are located. ",
 )
 @click.option(
@@ -374,25 +385,9 @@ def reference_alleles(
     "--reference",
     required=True,
     multiple=False,
-    type=click.Path(),
+    type=click.Path(exists=True),
     help="Directory where the schema reference allele files are located. ",
 )
-@click.option(
-    "-g",
-    "--genome",
-    required=True,
-    multiple=False,
-    type=click.Path(),
-    help="Genome reference file",
-)
-@click.option(
-    "-a",
-    "--sample",
-    required=True,
-    multiple=False,
-    type=click.Path(),
-    help="Sample location file in fasta format. ",
-)
 @click.option(
     "-o",
     "--output",
@@ -401,27 +396,17 @@ def reference_alleles(
     type=click.Path(),
     help="Output folder to save reference alleles",
 )
+@click.argument("assemblies", callback=expand_wildcards, nargs=-1, required=True, type=click.Path(exists=True))
 def allele_calling(
     schema,
     reference,
-    genome,
-    sample,
+    assemblies,
     output,
 ):
-    folder_to_check = [schema, reference]
-    for folder in folder_to_check:
-        if not taranis.utils.folder_exists(folder):
-            log.error("folder %s does not exists", folder)
-            stderr.print("[red] Folder does not exist. " + folder + "!")
-            sys.exit(1)
-    if not taranis.utils.file_exists(sample):
-        log.error("file %s does not exists", sample)
-        stderr.print("[red] File does not exist. " + sample + "!")
-        sys.exit(1)
-    schema_files = taranis.utils.get_files_in_folder(schema, "fasta")
-    if len(schema_files) == 0:
-        log.error("Schema folder %s does not have any fasta file", schema)
-        stderr.print("[red] Schema folder does not have any fasta file")
+    schema_ref_files = taranis.utils.get_files_in_folder(reference, "fasta")
+    if len(schema_ref_files) == 0:
+        log.error("Referenc allele folder %s does not have any fasta file", schema)
+        stderr.print("[red] reference allele folder does not have any fasta file")
         sys.exit(1)
 
     # Check if output folder exists
@@ -440,19 +425,24 @@ def allele_calling(
             os.makedirs(output)
         except OSError as e:
             log.info("Unable to create folder at %s with error %s", output, e)
-            stderr.print("[red] ERROR. Unable to create folder  " + output)
+            stderr.print("[red] ERROR. Unable to create {output} folder" )
             sys.exit(1)
     # Filter fasta files from reference folder
-    ref_alleles = glob.glob(os.path.join(reference, "*.fasta"))
+    # ref_alleles = glob.glob(os.path.join(reference, "*.fasta"))
     # Create predictions
+
+    """
     pred_out = os.path.join(output, "prediction")
     pred_sample = taranis.prediction.Prediction(genome, sample, pred_out)
     pred_sample.training()
     pred_sample.prediction()
+    """
 
     """Analyze the sample file against schema to identify outbreakers
     """
-    sample_allele = taranis.allele_calling.AlleleCalling(
-        pred_sample, sample, schema, ref_alleles, output
-    )
-    sample_allele.analyze_sample()
+    results = []
+    for assembly_file in assemblies:
+        results.append(taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output))
+
+    
+    # sample_allele_obj.analyze_sample()
diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py
index 00108d4..8c7b2c6 100644
--- a/taranis/allele_calling.py
+++ b/taranis/allele_calling.py
@@ -1,3 +1,4 @@
+import io
 import logging
 import os
 import rich.console
@@ -7,8 +8,11 @@
 import taranis.blast
 
 # import numpy
+from collections import OrderedDict
 import pandas as pd
 from pathlib import Path
+from Bio import SeqIO
+
 
 
 import pdb
@@ -23,105 +27,106 @@
 
 
 class AlleleCalling:
-    def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder):
-        self.prediction = prediction
+    def __init__(self,  sample_file: str, schema: str, reference_alleles: list, out_folder:str):
+        # self.prediction = prediction
         self.sample_file = sample_file
         self.schema = schema
         self.ref_alleles = reference_alleles
         self.out_folder = out_folder
         self.s_name = Path(sample_file).stem
         self.blast_dir = os.path.join(out_folder, "blastdb")
-        self.blast_sample = os.path.join(self.blast_dir, self.s_name)
-        self.blast_heading = [
-            "qseqid",
-            "sseqid",
-            "pident",
-            "qlen",
-            "length",
-            "mismatch",
-            "gapopen",
-            "evalue",
-            "bitscore",
-            "sstart",
-            "send",
-            "qstart",
-            "qend",
-            "sseq",
-            "qseq",
-        ]
-
-    def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene):
-        """_summary_
-
-        Args:
-            query_seq (_type_): _description_
-            allele_name (_type_): _description_
-            sample_contig (_type_): _description_
-            schema_gene (_type_): _description_
-        """
-        s_alleles_blast = taranis.blast.Blast("nucl")
-        ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles")
-        query_path = os.path.join(self.out_folder, "tmp", allele_name)
-        # Write to file the sequence to find out the loci name that fully match
-        f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name)
-        query_file = os.path.join(query_path, f_name)
-        _ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir)
-        # Blast with sample sequence to find the allele in the schema
-        seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100)
-        pdb.set_trace()
-        if len(seq_blast_match) >= 1:
+        # create blast for sample file
+        self.blast_obj = taranis.blast.Blast("nucl")
+        _ = self.blast_obj.create_blastdb(sample_file, self.blast_dir)
+
+
+    def assign_allele_type(self, blast_result: list, allele_file: str)->list:
+        def get_blast_details(blast_result: list, allele_name: str)->list:
+            # get blast details
+            blast_details = [
+                blast_result[0].split("_")[0], # Core gene
+                self.s_name,
+                "gene annotation",
+                "product annotation",
+                allele_name, 
+                "allele quality",
+                blast_result[1], # contig
+                blast_result[3], # query length
+                blast_result[9], # contig start
+                blast_result[10], # contig end
+                blast_result[13], # contig sequence
+            ]
+            return blast_details
+
+        if len(blast_result) > 1:
             # allele is named as NIPHEM
 
             # Hacer un blast con la query esta secuencia y la database del alelo
             # Create  blast db with sample file
 
             pass
-        elif len(seq_blast_match) == 1:
-            pass
+        elif len(blast_result) == 1:
+            column_blast_res = blast_result[0].split("\t")
+            sequence = column_blast_res[13].replace("-", "")
+           
+            grep_result = taranis.utils.grep_execution(allele_file, sequence, "-b1")
+            # check if sequence match alleles in schema
+            if len(grep_result) > 0:
+                allele_name = grep_result[0].split(">")[1]
+                allele_details = get_blast_details(column_blast_res, allele_name)
+                # allele is labled as EXACT
+                return ["EXACT", allele_name, allele_details]
         else:
             pass
-
-    def search_alleles(self, ref_allele):
-        allele_name = Path(ref_allele).stem
-        schema_gene = os.path.join(self.schema, allele_name + ".fasta")
-        # allele_name = Path(ref_allele).stem
-        # run blast with sample as db and reference allele as query
-        sample_blast_match = self.sample_blast.run_blast(ref_allele)
-        if len(sample_blast_match) > 0:
-            pd_lines = pd.DataFrame([item.split("\t") for item in sample_blast_match])
-            pd_lines.columns = self.blast_heading
-            pd_lines["pident"] = pd_lines["pident"].apply(pd.to_numeric)
-            sel_max = pd_lines.loc[pd_lines["pident"].idxmax()]
-            query_seq = sel_max["qseq"]
-            # np_lines = numpy.array(s_lines)
-            # convert to float the perc_identity to find out the max value
-            # max_val = numpy.max(np_lines[:,2].astype(float))
-            # mask = np_lines[:, 2] ==str(max_val)
-            # Select rows that match the percent identity. Index 2 in blast results
-            # sel_row = np_lines[mask, :] = np_lines[mask, :]
-            # query_seq = sel_row[0,14]
-            sample_contig = sel_max["sseqid"]
-            _ = self.assign_allele_type(
-                query_seq, allele_name, sample_contig, schema_gene
-            )
-        else:
-            # Sample does not have a reference allele to be matched
-            # Keep LNF info
-            # ver el codigo de espe
-            # lnf_tpr_tag()
-            pass
         pdb.set_trace()
 
-    def analyze_sample(self):
+
+    def search_match_allele(self):
         # Create  blast db with sample file
-        self.sample_blast = taranis.blast.Blast("nucl")
-        _ = self.sample_blast.create_blastdb(self.sample_file, self.blast_dir)
-        result = {}
+        
+        result = {"allele_type":{}, "allele_match":{}, "allele_details":{}}
         # pdb.set_trace()
         for ref_allele in self.ref_alleles:
             # schema_alleles = os.path.join(self.schema, ref_allele)
             # parallel in all CPUs in cluster node
-            result[ref_allele] = self.search_alleles(ref_allele)
+            alleles = OrderedDict()
+            match_found = False
+            with open(ref_allele, "r") as fh:
+                for record in SeqIO.parse(fh, "fasta"):
+                    alleles[record.id] = str(record.seq)
+
+            for r_id, r_seq in alleles.items():
+                # create file in memory to increase speed
+                query_file = io.StringIO()
+                query_file.write(">" + r_id + "\n" + r_seq)
+                query_file.seek(0)
+                blast_result = self.blast_obj.run_blast(
+                    query_file.read(), perc_identity=90, query_type="stdin"
+                )
+                if len(blast_result) > 0:
+                    match_found = True
+                    break
+            # Close object and discard memory buffer
+            query_file.close()
+            if match_found:
+                allele_file = os.path.join(self.schema, os.path.basename(ref_allele))
+                # blast_result = self.blast_obj.run_blast(q_file,perc_identity=100)
+                allele_name = Path(allele_file).stem
+                pdb.set_trace()
+                result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file)
+                pdb.set_trace()
+            else:
+                # Sample does not have a reference allele to be matched
+                # Keep LNF info
+                # ver el codigo de espe
+                # lnf_tpr_tag()
+                pass
+            
 
         pdb.set_trace()
-        return
+        return result
+
+def parallel_execution(sample_file: str, schema: str, reference_alleles: list, out_folder: str):
+
+    allele_obj = AlleleCalling(sample_file, schema, reference_alleles, out_folder)
+    return allele_obj.search_match_allele()
diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py
index 5646455..7e25ba4 100644
--- a/taranis/reference_alleles.py
+++ b/taranis/reference_alleles.py
@@ -119,7 +119,7 @@ def save_reference_alleles(self, reference_alleles: list) -> str:
             for record in SeqIO.parse(fh, "fasta"):
                 if record.id in reference_alleles:
                     record_seq[record.id] = str(record.seq)
-        ref_allele_file = os.path.join(self.output, self.locus_name + ".fa")
+        ref_allele_file = os.path.join(self.output, self.locus_name + ".fasta")
         with open(ref_allele_file, "w") as fo:
             for r_id, r_seq in record_seq.items():
                 fo.write(">" + r_id + "\n")
diff --git a/taranis/utils.py b/taranis/utils.py
index 39c8179..bef5662 100644
--- a/taranis/utils.py
+++ b/taranis/utils.py
@@ -254,6 +254,28 @@ def get_files_in_folder(folder: str, extension: str = None) -> list[str]:
     folder_files = os.path.join(folder, "*." + extension)
     return glob.glob(folder_files)
 
+def grep_execution(input_file: str, pattern: str, parameters: str) -> list:
+    """_summary_
+
+    Args:
+        input_file (str): _description_
+        pattern (str): _description_
+        parmeters (str): _description_
+
+    Returns:
+        list: _description_
+    """
+    try:
+        result = subprocess.run(
+            ["grep", parameters, pattern, input_file] ,
+            capture_output=True,
+            check=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        log.error("Unable to run grep. Error message: %s ", e.stderr.decode())
+        return []
+    return result.stdout.split("\n")
 
 def prompt_text(msg):
     source = questionary.text(msg).unsafe_ask()

From aabb94c1a4518baf44bab8c3c7975a7fc870fec6 Mon Sep 17 00:00:00 2001
From: luissian <chapado.l@gmail.com>
Date: Sat, 9 Mar 2024 14:02:59 +0100
Subject: [PATCH 2/4] fixed issue when reading on pandas long files

---
 taranis/__main__.py          | 16 ++++++++++++----
 taranis/distance.py          |  8 +++++++-
 taranis/reference_alleles.py |  8 +++++---
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/taranis/__main__.py b/taranis/__main__.py
index 4798a5b..f88b998 100644
--- a/taranis/__main__.py
+++ b/taranis/__main__.py
@@ -15,6 +15,9 @@
 import taranis.reference_alleles
 import taranis.allele_calling
 
+from pathlib import Path
+import pdb
+
 log = logging.getLogger()
 
 # Set up rich stderr console
@@ -364,8 +367,12 @@ def reference_alleles(
             )
             for f_file in schema_files
         ]
-    for future in concurrent.futures.as_completed(futures):
-        results.append(future.result())
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                results.append(future.result())
+            except Exception as e:
+                print(e)
+                continue
     _ = taranis.reference_alleles.collect_statistics(results, eval_cluster, output)
     finish = time.perf_counter()
     print(f"Reference alleles finish in {round((finish-start)/60, 2)} minutes")
@@ -442,7 +449,8 @@ def allele_calling(
     """
     results = []
     for assembly_file in assemblies:
-        results.append(taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output))
-
+        assembly_name = Path(assembly_file).stem
+        results.append({assembly_name: taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output)})
+        pdb.set_trace()
     
     # sample_allele_obj.analyze_sample()
diff --git a/taranis/distance.py b/taranis/distance.py
index 428f596..872275d 100644
--- a/taranis/distance.py
+++ b/taranis/distance.py
@@ -72,7 +72,13 @@ def create_matrix(self) -> pd.DataFrame:
         dist_matrix.write("alleles\t" + "\t".join(allele_names) + "\n")
         dist_matrix.write("\n".join(out_data[1:]))
         dist_matrix.seek(0)
-        matrix_pd = pd.read_csv(dist_matrix, sep="\t", index_col="alleles").fillna(0)
+        file_test = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/reference_alleles_testing_full_schema_17a/error.csv"
+        #with open(file_test, "w") as fo:
+        #    fo.write("alleles\t" + "\t".join(allele_names) + "\n")
+        #    fo.write("\n".join(out_data[1:]))
+
+        #import pdb; pdb.set_trace()
+        matrix_pd = pd.read_csv(dist_matrix, sep="\t", index_col="alleles", engine="python").fillna(0)
         # Close object and discard memory buffer
         dist_matrix.close()
         log.debug(f"create distance for {allele_name}")
diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py
index 7e25ba4..ee23750 100644
--- a/taranis/reference_alleles.py
+++ b/taranis/reference_alleles.py
@@ -121,9 +121,9 @@ def save_reference_alleles(self, reference_alleles: list) -> str:
                     record_seq[record.id] = str(record.seq)
         ref_allele_file = os.path.join(self.output, self.locus_name + ".fasta")
         with open(ref_allele_file, "w") as fo:
-            for r_id, r_seq in record_seq.items():
-                fo.write(">" + r_id + "\n")
-                fo.write(r_seq + "\n")
+            for ref_allele in reference_alleles:
+                fo.write(">" + ref_allele + "\n")
+                fo.write(record_seq[ref_allele] + "\n")
         return ref_allele_file
 
     def create_ref_alleles(self) -> dict:
@@ -255,6 +255,8 @@ def stats_graphics(stats_folder: str, cluster_alleles: dict) -> None:
     eval_data = []
     cluster_data_graph = {}
     clusters_list = []
+    stderr.print("Process starts for collecting statistics")
+    log.info("Process starts for collecting statistics")
     # split the data into cluster and evaluation
     for d_allele in data_alleles:
         cluster_data.append(d_allele["cluster_data"])

From 86ae8a2e5785792984ed20b8df1588f24f66787e Mon Sep 17 00:00:00 2001
From: luissian <chapado.l@gmail.com>
Date: Sat, 9 Mar 2024 14:03:41 +0100
Subject: [PATCH 3/4] added classification

---
 taranis/allele_calling.py | 55 +++++++++++++++++++++++++++++----------
 taranis/blast.py          |  2 +-
 taranis/utils.py          |  4 ++-
 3 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py
index 8c7b2c6..8936a5e 100644
--- a/taranis/allele_calling.py
+++ b/taranis/allele_calling.py
@@ -40,7 +40,7 @@ def __init__(self,  sample_file: str, schema: str, reference_alleles: list, out_
         _ = self.blast_obj.create_blastdb(sample_file, self.blast_dir)
 
 
-    def assign_allele_type(self, blast_result: list, allele_file: str)->list:
+    def assign_allele_type(self, blast_result: list, allele_file: str, allele_name: str)->list:
         def get_blast_details(blast_result: list, allele_name: str)->list:
             # get blast details
             blast_details = [
@@ -63,21 +63,48 @@ def get_blast_details(blast_result: list, allele_name: str)->list:
 
             # Hacer un blast con la query esta secuencia y la database del alelo
             # Create  blast db with sample file
-
-            pass
+            pdb.set_trace()
+            
         elif len(blast_result) == 1:
             column_blast_res = blast_result[0].split("\t")
-            sequence = column_blast_res[13].replace("-", "")
+            column_blast_res[13] = column_blast_res[13].replace("-", "")
+            allele_details = get_blast_details(column_blast_res, allele_name)
            
-            grep_result = taranis.utils.grep_execution(allele_file, sequence, "-b1")
+            grep_result = taranis.utils.grep_execution(allele_file, column_blast_res[13], "-b1")
             # check if sequence match alleles in schema
             if len(grep_result) > 0:
                 allele_name = grep_result[0].split(">")[1]
-                allele_details = get_blast_details(column_blast_res, allele_name)
+                
                 # allele is labled as EXACT
-                return ["EXACT", allele_name, allele_details]
-        else:
-            pass
+                pdb.set_trace()
+                return ["EXC", allele_name, allele_details]
+            # check if contig is shorter than allele
+            pdb.set_trace()
+            if int(column_blast_res[3]) > int(column_blast_res[4]):
+                # check if sequence is shorter because it starts or ends at the contig
+                if (
+                    column_blast_res[9] == 1 # check  at contig start
+                    or column_blast_res[14] == column_blast_res[10] # check at contig end
+                    or column_blast_res[10] == 1 # check reverse at contig end
+                    or column_blast_res[9] == column_blast_res[15] # check reverse at contig start
+                ):
+                    # allele is labled as PLOT
+                    pdb.set_trace()
+                    return ["PLOT", allele_name, allele_details]
+                # allele is labled as ASM
+                pdb.set_trace()
+                return ["ASM", allele_name, allele_details]
+            # check if contig is longer than allele
+            if int(column_blast_res[3]) < int(column_blast_res[4]):
+                # allele is labled as ALM
+                pdb.set_trace()
+                return ["ALM", allele_name, allele_details]
+            if int(column_blast_res[3]) == int(column_blast_res[4]):
+                # allele is labled as INF
+                pdb.set_trace()
+                return ["INF", allele_name, allele_details]
+            
+            
         pdb.set_trace()
 
 
@@ -112,18 +139,18 @@ def search_match_allele(self):
                 allele_file = os.path.join(self.schema, os.path.basename(ref_allele))
                 # blast_result = self.blast_obj.run_blast(q_file,perc_identity=100)
                 allele_name = Path(allele_file).stem
-                pdb.set_trace()
-                result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file)
-                pdb.set_trace()
+                # pdb.set_trace()
+                result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file, allele_name)
+                # pdb.set_trace()
             else:
                 # Sample does not have a reference allele to be matched
                 # Keep LNF info
                 # ver el codigo de espe
                 # lnf_tpr_tag()
-                pass
+                pdb.set_trace()
             
 
-        pdb.set_trace()
+        
         return result
 
 def parallel_execution(sample_file: str, schema: str, reference_alleles: list, out_folder: str):
diff --git a/taranis/blast.py b/taranis/blast.py
index a51d2ad..981454c 100644
--- a/taranis/blast.py
+++ b/taranis/blast.py
@@ -96,7 +96,7 @@ def run_blast(
         if query_type == "stdin":
             stdin_query = query
             query = "-"
-        blast_parameters = '"6 , qseqid , sseqid , pident ,  qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"'
+        blast_parameters = '"6 , qseqid , sseqid , pident ,  qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , slen"'
         cline = NcbiblastnCommandline(
             task="blastn",
             db=self.out_blast_dir,
diff --git a/taranis/utils.py b/taranis/utils.py
index bef5662..4572013 100644
--- a/taranis/utils.py
+++ b/taranis/utils.py
@@ -21,6 +21,7 @@
 from pathlib import Path
 from Bio import SeqIO
 
+import pdb
 log = logging.getLogger(__name__)
 
 
@@ -273,7 +274,8 @@ def grep_execution(input_file: str, pattern: str, parameters: str) -> list:
             text=True,
         )
     except subprocess.CalledProcessError as e:
-        log.error("Unable to run grep. Error message: %s ", e.stderr.decode())
+        pdb.set_trace()
+        log.error("Unable to run grep. Error message: %s ", e)
         return []
     return result.stdout.split("\n")
 

From 84b90880043a98c6e702c09343ced320660f07f3 Mon Sep 17 00:00:00 2001
From: luissian <chapado.l@gmail.com>
Date: Mon, 11 Mar 2024 10:29:09 +0100
Subject: [PATCH 4/4] liting

---
 taranis/__main__.py          | 24 ++++++++++---
 taranis/allele_calling.py    | 66 ++++++++++++++++++++----------------
 taranis/distance.py          | 10 ++----
 taranis/eval_cluster.py      |  6 ++--
 taranis/reference_alleles.py |  1 -
 taranis/utils.py             |  5 ++-
 6 files changed, 66 insertions(+), 46 deletions(-)

diff --git a/taranis/__main__.py b/taranis/__main__.py
index f88b998..911b521 100644
--- a/taranis/__main__.py
+++ b/taranis/__main__.py
@@ -25,12 +25,13 @@
     stderr=True, force_terminal=taranis.utils.rich_force_colors()
 )
 
+
 def expand_wildcards(ctx, param, value):
     if value:
         expanded_paths = []
         for path in value:
             # Check if path contains wildcard
-            if '*' in path:
+            if "*" in path:
                 # Expand wildcard
                 expanded_paths.extend(glob.glob(path))
             else:
@@ -38,6 +39,7 @@ def expand_wildcards(ctx, param, value):
         return expanded_paths
     return None
 
+
 def run_taranis():
     # Set up the rich traceback
     rich.traceback.install(console=stderr, width=200, word_wrap=True, extra_lines=1)
@@ -403,7 +405,13 @@ def reference_alleles(
     type=click.Path(),
     help="Output folder to save reference alleles",
 )
-@click.argument("assemblies", callback=expand_wildcards, nargs=-1, required=True, type=click.Path(exists=True))
+@click.argument(
+    "assemblies",
+    callback=expand_wildcards,
+    nargs=-1,
+    required=True,
+    type=click.Path(exists=True),
+)
 def allele_calling(
     schema,
     reference,
@@ -432,7 +440,7 @@ def allele_calling(
             os.makedirs(output)
         except OSError as e:
             log.info("Unable to create folder at %s with error %s", output, e)
-            stderr.print("[red] ERROR. Unable to create {output} folder" )
+            stderr.print("[red] ERROR. Unable to create {output} folder")
             sys.exit(1)
     # Filter fasta files from reference folder
     # ref_alleles = glob.glob(os.path.join(reference, "*.fasta"))
@@ -450,7 +458,13 @@ def allele_calling(
     results = []
     for assembly_file in assemblies:
         assembly_name = Path(assembly_file).stem
-        results.append({assembly_name: taranis.allele_calling.parallel_execution(assembly_file, schema, schema_ref_files, output)})
+        results.append(
+            {
+                assembly_name: taranis.allele_calling.parallel_execution(
+                    assembly_file, schema, schema_ref_files, output
+                )
+            }
+        )
         pdb.set_trace()
-    
+
     # sample_allele_obj.analyze_sample()
diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py
index 8936a5e..c9e11f8 100644
--- a/taranis/allele_calling.py
+++ b/taranis/allele_calling.py
@@ -14,7 +14,6 @@
 from Bio import SeqIO
 
 
-
 import pdb
 
 log = logging.getLogger(__name__)
@@ -27,7 +26,9 @@
 
 
 class AlleleCalling:
-    def __init__(self,  sample_file: str, schema: str, reference_alleles: list, out_folder:str):
+    def __init__(
+        self, sample_file: str, schema: str, reference_alleles: list, out_folder: str
+    ):
         # self.prediction = prediction
         self.sample_file = sample_file
         self.schema = schema
@@ -39,22 +40,23 @@ def __init__(self,  sample_file: str, schema: str, reference_alleles: list, out_
         self.blast_obj = taranis.blast.Blast("nucl")
         _ = self.blast_obj.create_blastdb(sample_file, self.blast_dir)
 
-
-    def assign_allele_type(self, blast_result: list, allele_file: str, allele_name: str)->list:
-        def get_blast_details(blast_result: list, allele_name: str)->list:
+    def assign_allele_type(
+        self, blast_result: list, allele_file: str, allele_name: str
+    ) -> list:
+        def get_blast_details(blast_result: list, allele_name: str) -> list:
             # get blast details
             blast_details = [
-                blast_result[0].split("_")[0], # Core gene
+                blast_result[0].split("_")[0],  # Core gene
                 self.s_name,
                 "gene annotation",
                 "product annotation",
-                allele_name, 
+                allele_name,
                 "allele quality",
-                blast_result[1], # contig
-                blast_result[3], # query length
-                blast_result[9], # contig start
-                blast_result[10], # contig end
-                blast_result[13], # contig sequence
+                blast_result[1],  # contig
+                blast_result[3],  # query length
+                blast_result[9],  # contig start
+                blast_result[10],  # contig end
+                blast_result[13],  # contig sequence
             ]
             return blast_details
 
@@ -64,17 +66,19 @@ def get_blast_details(blast_result: list, allele_name: str)->list:
             # Hacer un blast con la query esta secuencia y la database del alelo
             # Create  blast db with sample file
             pdb.set_trace()
-            
+
         elif len(blast_result) == 1:
             column_blast_res = blast_result[0].split("\t")
             column_blast_res[13] = column_blast_res[13].replace("-", "")
             allele_details = get_blast_details(column_blast_res, allele_name)
-           
-            grep_result = taranis.utils.grep_execution(allele_file, column_blast_res[13], "-b1")
+
+            grep_result = taranis.utils.grep_execution(
+                allele_file, column_blast_res[13], "-b1"
+            )
             # check if sequence match alleles in schema
             if len(grep_result) > 0:
                 allele_name = grep_result[0].split(">")[1]
-                
+
                 # allele is labled as EXACT
                 pdb.set_trace()
                 return ["EXC", allele_name, allele_details]
@@ -83,10 +87,12 @@ def get_blast_details(blast_result: list, allele_name: str)->list:
             if int(column_blast_res[3]) > int(column_blast_res[4]):
                 # check if sequence is shorter because it starts or ends at the contig
                 if (
-                    column_blast_res[9] == 1 # check  at contig start
-                    or column_blast_res[14] == column_blast_res[10] # check at contig end
-                    or column_blast_res[10] == 1 # check reverse at contig end
-                    or column_blast_res[9] == column_blast_res[15] # check reverse at contig start
+                    column_blast_res[9] == 1  # check  at contig start
+                    or column_blast_res[14]
+                    == column_blast_res[10]  # check at contig end
+                    or column_blast_res[10] == 1  # check reverse at contig end
+                    or column_blast_res[9]
+                    == column_blast_res[15]  # check reverse at contig start
                 ):
                     # allele is labled as PLOT
                     pdb.set_trace()
@@ -103,15 +109,13 @@ def get_blast_details(blast_result: list, allele_name: str)->list:
                 # allele is labled as INF
                 pdb.set_trace()
                 return ["INF", allele_name, allele_details]
-            
-            
-        pdb.set_trace()
 
+        pdb.set_trace()
 
     def search_match_allele(self):
         # Create  blast db with sample file
-        
-        result = {"allele_type":{}, "allele_match":{}, "allele_details":{}}
+
+        result = {"allele_type": {}, "allele_match": {}, "allele_details": {}}
         # pdb.set_trace()
         for ref_allele in self.ref_alleles:
             # schema_alleles = os.path.join(self.schema, ref_allele)
@@ -140,7 +144,11 @@ def search_match_allele(self):
                 # blast_result = self.blast_obj.run_blast(q_file,perc_identity=100)
                 allele_name = Path(allele_file).stem
                 # pdb.set_trace()
-                result["allele_type"][allele_name], result["allele_match"][allele_name], result["allele_details"][allele_name] = self.assign_allele_type(blast_result, allele_file, allele_name)
+                (
+                    result["allele_type"][allele_name],
+                    result["allele_match"][allele_name],
+                    result["allele_details"][allele_name],
+                ) = self.assign_allele_type(blast_result, allele_file, allele_name)
                 # pdb.set_trace()
             else:
                 # Sample does not have a reference allele to be matched
@@ -148,12 +156,12 @@ def search_match_allele(self):
                 # ver el codigo de espe
                 # lnf_tpr_tag()
                 pdb.set_trace()
-            
 
-        
         return result
 
-def parallel_execution(sample_file: str, schema: str, reference_alleles: list, out_folder: str):
 
+def parallel_execution(
+    sample_file: str, schema: str, reference_alleles: list, out_folder: str
+):
     allele_obj = AlleleCalling(sample_file, schema, reference_alleles, out_folder)
     return allele_obj.search_match_allele()
diff --git a/taranis/distance.py b/taranis/distance.py
index 872275d..df19477 100644
--- a/taranis/distance.py
+++ b/taranis/distance.py
@@ -72,13 +72,9 @@ def create_matrix(self) -> pd.DataFrame:
         dist_matrix.write("alleles\t" + "\t".join(allele_names) + "\n")
         dist_matrix.write("\n".join(out_data[1:]))
         dist_matrix.seek(0)
-        file_test = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/reference_alleles_testing_full_schema_17a/error.csv"
-        #with open(file_test, "w") as fo:
-        #    fo.write("alleles\t" + "\t".join(allele_names) + "\n")
-        #    fo.write("\n".join(out_data[1:]))
-
-        #import pdb; pdb.set_trace()
-        matrix_pd = pd.read_csv(dist_matrix, sep="\t", index_col="alleles", engine="python").fillna(0)
+        matrix_pd = pd.read_csv(
+            dist_matrix, sep="\t", index_col="alleles", engine="python"
+        ).fillna(0)
         # Close object and discard memory buffer
         dist_matrix.close()
         log.debug(f"create distance for {allele_name}")
diff --git a/taranis/eval_cluster.py b/taranis/eval_cluster.py
index be2e041..5f30820 100644
--- a/taranis/eval_cluster.py
+++ b/taranis/eval_cluster.py
@@ -175,9 +175,9 @@ def evaluate_clusters(
                         "alleles_not_found"
                     ]
                 if len(result_eval["alleles_not_in_cluster"]) > 0:
-                    evaluation_alleles[cluster_id]["alleles_not_in_cluster"] = (
-                        result_eval["alleles_not_in_cluster"]
-                    )
+                    evaluation_alleles[cluster_id][
+                        "alleles_not_in_cluster"
+                    ] = result_eval["alleles_not_in_cluster"]
             else:
                 evaluation_alleles[cluster_id]["result"] = "OK"
         return self.summary(evaluation_alleles)
diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py
index ee23750..04f8fa1 100644
--- a/taranis/reference_alleles.py
+++ b/taranis/reference_alleles.py
@@ -80,7 +80,6 @@ def create_distance_matrix(self) -> list:
     def processing_cluster_data(
         self, cluster_data: np.array, cluster_ptrs: np.array, position_to_allele: dict
     ) -> dict:
-
         reference_alleles = []
         for cluster_id, values in cluster_data.items():
             center_allele = position_to_allele[values["center_id"]]
diff --git a/taranis/utils.py b/taranis/utils.py
index 4572013..51148aa 100644
--- a/taranis/utils.py
+++ b/taranis/utils.py
@@ -22,6 +22,7 @@
 from Bio import SeqIO
 
 import pdb
+
 log = logging.getLogger(__name__)
 
 
@@ -255,6 +256,7 @@ def get_files_in_folder(folder: str, extension: str = None) -> list[str]:
     folder_files = os.path.join(folder, "*." + extension)
     return glob.glob(folder_files)
 
+
 def grep_execution(input_file: str, pattern: str, parameters: str) -> list:
     """_summary_
 
@@ -268,7 +270,7 @@ def grep_execution(input_file: str, pattern: str, parameters: str) -> list:
     """
     try:
         result = subprocess.run(
-            ["grep", parameters, pattern, input_file] ,
+            ["grep", parameters, pattern, input_file],
             capture_output=True,
             check=True,
             text=True,
@@ -279,6 +281,7 @@ def grep_execution(input_file: str, pattern: str, parameters: str) -> list:
         return []
     return result.stdout.split("\n")
 
+
 def prompt_text(msg):
     source = questionary.text(msg).unsafe_ask()
     return source