def fetch_genomes_bv_brc(
        rql_query: str = None,
        genome_ids: list = None
) -> GenomeSequencesDirectoryFormat:
    """Download genome sequences from BV-BRC, one FASTA file per genome.

    Exactly one of ``rql_query`` and ``genome_ids`` has to be given;
    ``id_list_handling`` raises ``ValueError`` otherwise.

    Parameters
    ----------
    rql_query : str, optional
        RQL query sent to the ``genome_sequence`` endpoint.
    genome_ids : list, optional
        Genome IDs; converted to an ``in(genome_id,(...))`` query.

    Returns
    -------
    GenomeSequencesDirectoryFormat
        Directory holding one ``<genome_id>.fasta`` file per genome.
    """
    rql_query = id_list_handling(rql_query=rql_query,
                                 ids=genome_ids,
                                 parameter_name="genome_ids",
                                 data_field="genome_id")

    genomes = GenomeSequencesDirectoryFormat()

    # NOTE(review): no limit() clause is appended to the query here. If the
    # API caps the number of records per response, genomes with many
    # sequences could be truncated - confirm against the BV-BRC API docs.
    response = download_data(
        url=f"https://www.bv-brc.org/api/genome_sequence/?{rql_query}",
        data_type="genome_sequence",
    )

    json_to_fasta(response.json(), str(genomes))

    return genomes


def fetch_metadata_bv_brc(data_type: str, rql_query: str) -> qiime2.Metadata:
    """Download records of a BV-BRC data type as QIIME 2 metadata.

    Parameters
    ----------
    data_type : str
        Name of the BV-BRC endpoint to query (e.g. ``"genome"``).
    rql_query : str
        RQL query appended to the endpoint URL.

    Returns
    -------
    qiime2.Metadata
        The TSV response parsed into a table. The row IDs are the
        stringified default positional index, named ``"id"``.
    """
    response = download_data(
        url=f"https://www.bv-brc.org/api/{data_type}/?{rql_query}"
            f"&http_accept=text/tsv",
        data_type=data_type
    )

    metadata = pd.read_csv(StringIO(response.text), sep='\t')
    metadata.index.name = "id"
    metadata.index = metadata.index.astype(str)

    return qiime2.Metadata(metadata)


def fetch_taxonomy_bv_brc(
        rql_query: str = None,
        ranks: list = None,
        taxon_ids: list = None,
) -> TSVTaxonomyDirectoryFormat:
    """Download taxonomy records from BV-BRC as a taxonomy TSV.

    Exactly one of ``rql_query`` and ``taxon_ids`` has to be given;
    ``id_list_handling`` raises ``ValueError`` otherwise. ``rql_query``
    defaults to ``None`` so that ``taxon_ids`` can be used on its own.

    Parameters
    ----------
    rql_query : str, optional
        RQL query sent to the ``taxonomy`` endpoint.
    ranks : list, optional
        Taxonomic ranks to include; falls back to ``_default_ranks``.
    taxon_ids : list, optional
        Taxon IDs; converted to an ``in(taxon_id,(...))`` query.

    Returns
    -------
    TSVTaxonomyDirectoryFormat
        Directory containing ``taxonomy.tsv``.
    """
    rql_query = id_list_handling(rql_query=rql_query,
                                 ids=taxon_ids,
                                 parameter_name="taxon_ids",
                                 data_field="taxon_id")

    directory = TSVTaxonomyDirectoryFormat()

    response = download_data(
        url=f"https://www.bv-brc.org/api/taxonomy/?{rql_query}"
            f"&http_accept=text/tsv",
        data_type="taxonomy"
    )

    metadata = pd.read_csv(StringIO(response.text), sep='\t')

    taxonomy = transform_taxonomy_df(df=metadata, ranks=ranks)
    taxonomy.to_csv(os.path.join(str(directory), "taxonomy.tsv"), sep="\t")

    return directory


def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):
    """Build a prefixed, '; '-joined taxonomy string from a BV-BRC lineage.

    Parameters
    ----------
    lineage_names : str
        ';'-separated taxon names of the lineage.
    lineage_ranks : str
        ';'-separated rank names, positionally matching ``lineage_names``.
    ranks : list or None
        Ranks to report, in output order. Falls back to ``_default_ranks``
        when empty or ``None``. Ranks that are not keys of
        ``_allowed_ranks`` are ignored.

    Returns
    -------
    str
        One ``<prefix><name>`` entry per requested rank; ranks missing from
        the lineage are emitted as the bare prefix.
    """
    if not ranks:
        ranks = _default_ranks

    # Map each requested rank to its prefix. The membership test has to be
    # against _allowed_ranks (testing against `ranks` itself is always true
    # and lets unknown ranks raise KeyError).
    rank_to_prefix = {
        rank: _allowed_ranks[rank] for rank in ranks if rank in _allowed_ranks
    }

    # Keep the first name seen for every requested rank.
    name_by_rank = {}
    for rank, name in zip(lineage_ranks.split(';'), lineage_names.split(';')):
        if rank in rank_to_prefix:
            name_by_rank.setdefault(rank, name)

    return '; '.join(
        f"{prefix}{name_by_rank.get(rank, '')}"
        for rank, prefix in rank_to_prefix.items()
    )


def transform_taxonomy_df(df, ranks):
    """Convert a BV-BRC taxonomy table into a 'Feature ID' / 'Taxon' frame.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns ``taxon_id``, ``lineage_names`` and
        ``lineage_ranks``. It is not modified.
    ranks : list or None
        Ranks passed on to ``parse_lineage_names_with_ranks``.

    Returns
    -------
    pd.DataFrame
        Single ``Taxon`` column indexed by ``Feature ID`` (the taxon IDs).
    """
    # A plain loop over the two columns also works for an empty table,
    # where DataFrame.apply(axis=1) would not yield a Series.
    taxa = [
        parse_lineage_names_with_ranks(lineage_names=names,
                                       lineage_ranks=rank_names,
                                       ranks=ranks)
        for names, rank_names in zip(df['lineage_names'], df['lineage_ranks'])
    ]
    feature_ids = pd.Index(df['taxon_id'], name='Feature ID')
    return pd.DataFrame({'Taxon': taxa}, index=feature_ids)


def fetch_genome_features_bv_brc(
        rql_query: str = None,
        feature_ids: list = None,
) -> (MixedCaseDNAFASTAFormat, ProteinFASTAFormat):
    """Download DNA and protein sequences of genome features from BV-BRC.

    Exactly one of ``rql_query`` and ``feature_ids`` has to be given;
    ``id_list_handling`` raises ``ValueError`` otherwise.

    Parameters
    ----------
    rql_query : str, optional
        RQL query sent to the ``genome_feature`` endpoint.
    feature_ids : list, optional
        Feature IDs; converted to an ``in(feature_id,(...))`` query.

    Returns
    -------
    (MixedCaseDNAFASTAFormat, ProteinFASTAFormat)
        The DNA FASTA and the protein FASTA returned for the same query.
    """
    rql_query = id_list_handling(rql_query=rql_query,
                                 ids=feature_ids,
                                 parameter_name="feature_ids",
                                 data_field="feature_id")

    genes = MixedCaseDNAFASTAFormat()
    proteins = ProteinFASTAFormat()

    # The same query is sent twice; only the requested response format
    # differs.
    base_url = "https://www.bv-brc.org/api/genome_feature/?"
    genes_url = base_url + f"{rql_query}&http_accept=application/dna+fasta"
    proteins_url = (
        base_url + f"{rql_query}&http_accept=application/protein+fasta"
    )

    response_genes = download_data(url=genes_url, data_type="genome_feature")
    response_proteins = download_data(url=proteins_url,
                                      data_type="genome_feature")

    with genes.open() as file:
        file.write(response_genes.text)

    with proteins.open() as file:
        file.write(response_proteins.text)

    return genes, proteins


def json_to_fasta(json, output_dir):
    """Write genome sequence records to one FASTA file per genome.

    Parameters
    ----------
    json : list of dict
        Records with the keys ``genome_id``, ``accession``, ``description``,
        ``genome_name`` and ``sequence``. (The parameter name shadows the
        ``json`` module inside this function only.)
    output_dir : str
        Existing directory; ``<genome_id>.fasta`` files are created in it.
    """
    # genome_id -> list of FASTA records, in order of first appearance
    records_by_genome = {}

    for entry in json:
        genome_id = entry['genome_id']

        # Header layout: >accn|<accession> <description> [<name> | <id>]
        header = (f">accn|{entry['accession']} {entry['description']} "
                  f"[{entry['genome_name']} | {genome_id}]")
        records_by_genome.setdefault(genome_id, []).append(
            f"{header}\n{entry['sequence'].upper()}")

    for genome_id, records in records_by_genome.items():
        fasta_filename = os.path.join(output_dir, f"{genome_id}.fasta")

        with open(fasta_filename, 'w') as fasta_file:
            fasta_file.write("\n".join(records))
def download_data(url, data_type):
    """Send a GET request to BV-BRC and return the successful response.

    Parameters
    ----------
    url : str
        Complete request URL, including the RQL query.
    data_type : str
        BV-BRC data type being queried; only used for error messages.

    Returns
    -------
    requests.Response
        The response, if its status code is 200.

    Raises
    ------
    ValueError
        For every status code other than 200. Status 400 is translated into
        a more specific message by ``error_handling``.
    """
    # NOTE(review): the request is sent without a timeout, so a stalled
    # connection can block indefinitely - consider passing ``timeout=``.
    response = requests.get(url)

    if response.status_code == 200:
        return response

    if response.status_code == 400:
        error_handling(response, data_type)

    # Any other status, or a 400 that was not translated above.
    raise ValueError(response.text)


def error_handling(response, data_type):
    """Raise a ``ValueError`` describing a failed BV-BRC request.

    Parameters
    ----------
    response : requests.Response
        The failed response; only its ``text`` attribute is read.
    data_type : str
        BV-BRC data type that was queried. Used to list the allowed fields
        when an undefined field was requested.

    Raises
    ------
    ValueError
        Always. The message depends on the response text; if the text
        cannot be interpreted, the raw text is used as the message.
    """
    text = response.text

    # No data found for query or incorrect RQL query
    if text == "[]":
        raise ValueError("No data could be retrieved. Either because of an "
                         "incorrect RQL query or because no data exists for the "
                         "query.")

    # The spelling of this prefix has to match the response text verbatim.
    if not text.startswith("A Database Error Occured:"):
        raise ValueError(text)

    # The error details are expected as a JSON object following the prefix.
    # Fall back to the raw text if the object is missing or malformed.
    # ValueError covers both a missing '{' (str.index) and invalid JSON
    # (json.JSONDecodeError is a subclass of ValueError).
    try:
        payload = json.loads(text[text.index('{'):])
        code = payload['code']
        msg = str(payload['msg'])
    except (ValueError, KeyError, TypeError):
        raise ValueError(text) from None

    # Incorrect RQL operator. Has to be tested before the shorter
    # "undefined field" prefix below.
    if msg.startswith("undefined field object"):
        raise ValueError(
            f"Error code {code}: {msg}. "
            f"Incorrect RQL query operator."
        )

    # Incorrect field for data type
    if msg.startswith("undefined field"):
        allowed_fields = data_fields.get(data_type, [])
        raise ValueError(
            f"Error code {code}: {msg}. \n"
            f"Allowed fields for data type {data_type}: \n{allowed_fields}"
        )

    raise ValueError(f"Error code {code}: {msg}.")


def id_list_handling(rql_query: str, ids: list, parameter_name: str,
                     data_field: str):
    """Validate the query/ID parameters and return the RQL query to use.

    Parameters
    ----------
    rql_query : str
        User supplied RQL query, or an empty value.
    ids : list
        User supplied IDs, or an empty value.
    parameter_name : str
        Name of the ID parameter, used in error messages.
    data_field : str
        BV-BRC field the IDs are matched against.

    Returns
    -------
    str
        ``rql_query`` unchanged if it was given, otherwise
        ``in(<data_field>,(<id>,<id>,...))`` built from ``ids``.

    Raises
    ------
    ValueError
        If both or neither of ``rql_query`` and ``ids`` are given.
    """
    if rql_query and ids:
        raise ValueError(f"Parameters rql_query and {parameter_name} can't be used "
                         "simultaneously.")

    if not rql_query and not ids:
        raise ValueError("At least one of the parameters rql_query and "
                         f"{parameter_name} has to be given.")

    if ids:
        return f"in({data_field},({','.join(map(str, ids))}))"

    return rql_query
"assay_measurement", + "assay_measurement_unit", + "assay_method", + "assay_result", + "assay_type", + "authors", + "date_inserted", + "date_modified", + "end", + "epitope_id", + "epitope_sequence", + "epitope_type", + "host_name", + "host_taxon_id", + "mhc_allele", + "mhc_allele_class", + "organism", + "pdb_id", + "pmid", + "protein_accession", + "protein_id", + "protein_name", + "start", + "taxon_id", + "taxon_lineage_ids", + "taxon_lineage_names", + "title" + ], + "experiment": [ + "_version_", + "additional_data", + "additional_metadata", + "biosets", + "date_inserted", + "date_modified", + "detection_instrument", + "doi", + "exp_description", + "exp_id", + "exp_name", + "exp_poc", + "exp_protocol", + "exp_title", + "exp_type", + "experimenters", + "genome_id", + "measurement_technique", + "organism", + "pmid", + "public_identifier", + "public_repository", + "samples", + "strain", + "study_description", + "study_institution", + "study_name", + "study_pi", + "study_title", + "taxon_id", + "taxon_lineage_ids", + "treatment_amount", + "treatment_duration", + "treatment_name", + "treatment_type" + ], + "bioset": [ + "_version_", + "additional_data", + "additional_metadata", + "analysis_group_1", + "analysis_group_2", + "analysis_method", + "bioset_criteria", + "bioset_description", + "bioset_id", + "bioset_name", + "bioset_result", + "bioset_type", + "date_inserted", + "date_modified", + "entity_count", + "entity_type", + "exp_id", + "exp_name", + "exp_title", + "exp_type", + "genome_id", + "organism", + "protocol", + "result_type", + "strain", + "study_description", + "study_institution", + "study_name", + "study_pi", + "study_title", + "taxon_id", + "taxon_lineage_ids", + "treatment_amount", + "treatment_duration", + "treatment_name", + "treatment_type" + ], + "bioset_result": [ + "_version_", + "bioset_description", + "bioset_id", + "bioset_name", + "bioset_type", + "counts", + "date_inserted", + "date_modified", + "entity_id", + "entity_name", + "entity_type", 
+ "exp_id", + "exp_name", + "exp_title", + "exp_type", + "feature_id", + "fpkm", + "gene", + "gene_id", + "genome_id", + "id", + "locus_tag", + "log2_fc", + "organism", + "other_ids", + "other_value", + "p_value", + "patric_id", + "product", + "protein_id", + "result_type", + "strain", + "taxon_id", + "tpm", + "treatment_amount", + "treatment_duration", + "treatment_name", + "treatment_type", + "uniprot_id", + "z_score" + ], + "gene_ontology_ref": [ + "_version_", + "date_inserted", + "date_modified", + "definition", + "go_id", + "go_name", + "ontology" + ], + "genome": [ + "_version_", + "additional_metadata", + "altitude", + "antimicrobial_resistance", + "antimicrobial_resistance_evidence", + "assembly_accession", + "assembly_method", + "authors", + "bioproject_accession", + "biosample_accession", + "biovar", + "body_sample_site", + "body_sample_subsite", + "cds", + "cds_ratio", + "cell_shape", + "checkm_completeness", + "checkm_contamination", + "chromosomes", + "clade", + "class", + "coarse_consistency", + "collection_date", + "collection_year", + "comments", + "common_name", + "completion_date", + "contig_l50", + "contig_n50", + "contigs", + "core_families", + "core_family_ratio", + "culture_collection", + "date_inserted", + "date_modified", + "depth", + "disease", + "family", + "fine_consistency", + "gc_content", + "genbank_accessions", + "genome_id", + "genome_length", + "genome_name", + "genome_quality", + "genome_quality_flags", + "genome_status", + "genus", + "geographic_group", + "geographic_location", + "gram_stain", + "h1_clade_global", + "h1_clade_us", + "h3_clade", + "h5_clade", + "h_type", + "habitat", + "host_age", + "host_common_name", + "host_gender", + "host_group", + "host_health", + "host_name", + "host_scientific_name", + "hypothetical_cds", + "hypothetical_cds_ratio", + "isolation_comments", + "isolation_country", + "isolation_site", + "isolation_source", + "kingdom", + "lab_host", + "latitude", + "lineage", + "longitude", + "mat_peptide", + 
"missing_core_family_ids", + "mlst", + "motility", + "n_type", + "ncbi_project_id", + "nearest_genomes", + "optimal_temperature", + "order", + "organism_name", + "other_clinical", + "other_environmental", + "other_names", + "other_typing", + "outgroup_genomes", + "owner", + "oxygen_requirement", + "p2_genome_id", + "partial_cds", + "partial_cds_ratio", + "passage", + "pathovar", + "patric_cds", + "ph1n1_like", + "phenotype", + "phylum", + "plasmids", + "plfam_cds", + "plfam_cds_ratio", + "public", + "publication", + "reference_genome", + "refseq_accessions", + "refseq_cds", + "refseq_project_id", + "rrna", + "salinity", + "season", + "segment", + "segments", + "sequencing_centers", + "sequencing_depth", + "sequencing_platform", + "sequencing_status", + "serovar", + "species", + "sporulation", + "sra_accession", + "state_province", + "strain", + "subclade", + "subtype", + "superkingdom", + "taxon_id", + "taxon_lineage_ids", + "taxon_lineage_names", + "temperature_range", + "trna", + "type_strain", + "user_read", + "user_write" + ], + "strain": [ + "1_pb2", + "2_pb1", + "3_pa", + "4_ha", + "5_np", + "6_na", + "7_mp", + "8_ns", + "_version_", + "collection_date", + "collection_year", + "date_inserted", + "date_modified", + "family", + "genbank_accessions", + "genome_ids", + "genus", + "geographic_group", + "h_type", + "host_common_name", + "host_group", + "host_name", + "id", + "isolation_country", + "l", + "lab_host", + "m", + "n_type", + "other_segments", + "owner", + "passage", + "public", + "s", + "season", + "segment_count", + "species", + "status", + "strain", + "subtype", + "taxon_id", + "taxon_lineage_ids", + "taxon_lineage_names", + "user_read", + "user_write" + ], + "genome_amr": [ + "_version_", + "antibiotic", + "computational_method", + "computational_method_performance", + "computational_method_version", + "date_inserted", + "date_modified", + "evidence", + "genome_id", + "genome_name", + "id", + "laboratory_typing_method", + 
"laboratory_typing_method_version", + "laboratory_typing_platform", + "measurement", + "measurement_sign", + "measurement_unit", + "measurement_value", + "owner", + "pmid", + "public", + "resistant_phenotype", + "source", + "taxon_id", + "testing_standard", + "testing_standard_year", + "user_read", + "user_write", + "vendor" + ], + "feature_sequence": [ + "_version_", + "date_inserted", + "date_modified", + "md5", + "sequence", + "sequence_type" + ], + "genome_feature": [ + "aa_length", + "aa_sequence_md5", + "accession", + "alt_locus_tag", + "annotation", + "brc_id", + "classifier_round", + "classifier_score", + "codon_start", + "date_inserted", + "date_modified", + "end", + "feature_id", + "feature_type", + "figfam_id", + "gene", + "gene_id", + "genome_id", + "genome_name", + "go", + "location", + "na_length", + "na_sequence_md5", + "notes", + "og_id", + "owner", + "p2_feature_id", + "patric_id", + "pdb_accession", + "pgfam_id", + "plfam_id", + "product", + "property", + "protein_id", + "public", + "refseq_locus_tag", + "segments", + "sequence_id", + "sog_id", + "start", + "strand", + "taxon_id", + "uniprotkb_accession", + "user_read", + "user_write" + ], + "genome_sequence": [ + "_version_", + "accession", + "chromosome", + "date_inserted", + "date_modified", + "description", + "gc_content", + "genome_id", + "genome_name", + "gi", + "length", + "mol_type", + "owner", + "p2_sequence_id", + "plasmid", + "public", + "release_date", + "segment", + "sequence", + "sequence_id", + "sequence_md5", + "sequence_status", + "sequence_type", + "taxon_id", + "topology", + "user_read", + "user_write", + "version" + ], + "id_ref": [ + "_version_", + "date_inserted", + "date_modified", + "id", + "id_type", + "id_value", + "uniprotkb_accession" + ], + "misc_niaid_sgc": [ + "_version_", + "date_inserted", + "date_modified", + "gene_symbol_collection", + "genus", + "has_clones", + "has_proteins", + "selection_criteria", + "species", + "strain", + "target_id", + "target_status" + ], 
+ "pathway": [ + "_version_", + "accession", + "alt_locus_tag", + "annotation", + "date_inserted", + "date_modified", + "ec_description", + "ec_number", + "feature_id", + "gene", + "genome_ec", + "genome_id", + "genome_name", + "id", + "owner", + "pathway_class", + "pathway_ec", + "pathway_id", + "pathway_name", + "patric_id", + "product", + "public", + "refseq_locus_tag", + "sequence_id", + "taxon_id", + "user_read", + "user_write" + ], + "pathway_ref": [ + "_version_", + "date_inserted", + "date_modified", + "ec_description", + "ec_number", + "id", + "map_location", + "map_name", + "map_type", + "occurrence", + "pathway_class", + "pathway_id", + "pathway_name" + ], + "ppi": [ + "_version_", + "category", + "date_inserted", + "date_modified", + "detection_method", + "domain_a", + "domain_b", + "evidence", + "feature_id_a", + "feature_id_b", + "gene_a", + "gene_b", + "genome_id_a", + "genome_id_b", + "genome_name_a", + "genome_name_b", + "id", + "interaction_type", + "interactor_a", + "interactor_b", + "interactor_desc_a", + "interactor_desc_b", + "interactor_type_a", + "interactor_type_b", + "pmid", + "refseq_locus_tag_a", + "refseq_locus_tag_b", + "score", + "source_db", + "source_id", + "taxon_id_a", + "taxon_id_b" + ], + "protein_family_ref": [ + "_version_", + "date_inserted", + "date_modified", + "family_id", + "family_product", + "family_type" + ], + "sequence_feature": [ + "aa_sequence_md5", + "aa_variant", + "additional_metadata", + "comments", + "date_inserted", + "date_modified", + "end", + "evidence_code", + "feature_id", + "genbank_accession", + "gene", + "genome_id", + "genome_name", + "id", + "length", + "patric_id", + "product", + "publication", + "refseq_locus_tag", + "segment", + "segments", + "sf_category", + "sf_id", + "sf_name", + "sf_sequence", + "sf_sequence_md5", + "source", + "source_aa_sequence", + "source_id", + "source_sf_location", + "source_strain", + "start", + "subtype", + "taxon_id", + "variant_types" + ], + "sequence_feature_vt": [ 
+ "additional_metadata", + "comments", + "date_inserted", + "date_modified", + "id", + "sf_category", + "sf_id", + "sf_name", + "sf_sequence", + "sf_sequence_md5", + "sfvt_genome_count", + "sfvt_genome_ids", + "sfvt_id", + "sfvt_sequence", + "sfvt_sequence_md5", + "sfvt_variations" + ], + "sp_gene": [ + "_version_", + "alt_locus_tag", + "antibiotics", + "antibiotics_class", + "classification", + "date_inserted", + "date_modified", + "e_value", + "evidence", + "feature_id", + "function", + "gene", + "genome_id", + "genome_name", + "id", + "identity", + "organism", + "owner", + "patric_id", + "pmid", + "product", + "property", + "property_source", + "public", + "query_coverage", + "refseq_locus_tag", + "same_genome", + "same_genus", + "same_species", + "source", + "source_id", + "subject_coverage", + "taxon_id", + "user_read", + "user_write" + ], + "sp_gene_ref": [ + "_version_", + "antibiotics", + "antibiotics_class", + "assertion", + "classification", + "date_inserted", + "date_modified", + "function", + "gene_id", + "gene_name", + "genus", + "gi", + "id", + "locus_tag", + "organism", + "pmid", + "product", + "property", + "source", + "source_id", + "species" + ], + "spike_lineage": [ + "_version_", + "country", + "date_inserted", + "date_modified", + "growth_rate", + "id", + "lineage", + "lineage_count", + "lineage_of_concern", + "month", + "prevalence", + "region", + "sequence_features", + "total_isolates" + ], + "spike_variant": [ + "_version_", + "aa_variant", + "country", + "date_inserted", + "date_modified", + "growth_rate", + "id", + "lineage_count", + "month", + "prevalence", + "region", + "sequence_features", + "total_isolates" + ], + "structured_assertion": [ + "_version_", + "comment", + "date_inserted", + "date_modified", + "evidence_code", + "feature_id", + "id", + "owner", + "patric_id", + "pmid", + "property", + "public", + "refseq_locus_tag", + "score", + "source", + "user_read", + "user_write", + "value" + ], + "subsystem": [ + "_version_", + 
"active", + "class", + "date_inserted", + "date_modified", + "feature_id", + "gene", + "genome_id", + "genome_name", + "id", + "owner", + "patric_id", + "product", + "public", + "refseq_locus_tag", + "role_id", + "role_name", + "subclass", + "subsystem_id", + "subsystem_name", + "superclass", + "taxon_id", + "user_read", + "user_write" + ], + "subsystem_ref": [ + "_version_", + "class", + "date_inserted", + "date_modified", + "description", + "id", + "notes", + "pmid", + "role_id", + "role_name", + "subclass", + "subsystem_id", + "subsystem_name", + "superclass" + ], + "taxonomy": [ + "_version_", + "cds_mean", + "cds_sd", + "core_families", + "core_family_ids", + "description", + "division", + "genetic_code", + "genome_count", + "genome_length_mean", + "genome_length_sd", + "genomes", + "genomes_f", + "hypothetical_cds_ratio_mean", + "hypothetical_cds_ratio_sd", + "lineage", + "lineage_ids", + "lineage_names", + "lineage_ranks", + "other_names", + "parent_id", + "plfam_cds_ratio_mean", + "plfam_cds_ratio_sd", + "taxon_id", + "taxon_id_i", + "taxon_name", + "taxon_rank" + ], + "protein_structure": [ + "alignments", + "authors", + "date_inserted", + "date_modified", + "feature_id", + "file_path", + "gene", + "genome_id", + "institution", + "method", + "organism_name", + "patric_id", + "pdb_id", + "pmid", + "product", + "release_date", + "resolution", + "sequence", + "sequence_md5", + "taxon_id", + "taxon_lineage_ids", + "taxon_lineage_names", + "title", + "uniprotkb_accession" + ], + "protein_feature": [ + "aa_sequence_md5", + "classification", + "comments", + "date_inserted", + "date_modified", + "description", + "e_value", + "end", + "evidence", + "feature_id", + "feature_type", + "gene", + "genome_id", + "genome_name", + "id", + "interpro_description", + "interpro_id", + "length", + "patric_id", + "product", + "publication", + "refseq_locus_tag", + "score", + "segments", + "sequence", + "source", + "source_id", + "start", + "taxon_id" + ], + "surveillance": [ + 
"additional_metadata", + "alcohol_or_other_drug_dependence", + "breastfeeding", + "chest_imaging_interpretation", + "chronic_conditions", + "collection_city", + "collection_country", + "collection_date", + "collection_latitude", + "collection_longitude", + "collection_poi", + "collection_season", + "collection_state_province", + "collection_year", + "collector_institution", + "collector_name", + "comments", + "contact_email_address", + "contributing_institution", + "date_inserted", + "date_modified", + "daycare_attendance", + "days_elapsed_to_disease_status", + "days_elapsed_to_sample_collection", + "days_elapsed_to_vaccination", + "diagnosis", + "dialysis", + "disease_severity", + "disease_status", + "duration_of_exposure", + "duration_of_treatment", + "ecmo", + "education", + "embargo_end_date", + "exposure", + "exposure_type", + "genome_id", + "geographic_group", + "hospitalization_duration", + "hospitalized", + "host_age", + "host_capture_status", + "host_common_name", + "host_ethnicity", + "host_group", + "host_habitat", + "host_health", + "host_height", + "host_id_type", + "host_identifier", + "host_natural_state", + "host_race", + "host_sex", + "host_species", + "host_weight", + "human_leukocyte_antigens", + "id", + "infections_within_five_years", + "influenza_like_illness_over_the_past_year", + "initiation_of_treatment", + "intensive_care_unit", + "last_update_date", + "longitudinal_study", + "maintenance_medication", + "nursing_home_residence", + "onset_hours", + "other_vaccinations", + "oxygen_saturation", + "packs_per_day_for_how_many_years", + "pathogen_test_interpretation", + "pathogen_test_result", + "pathogen_test_type", + "pathogen_type", + "post_visit_medications", + "pre_visit_medications", + "pregnancy", + "primary_living_situation", + "profession", + "project_identifier", + "sample_accession", + "sample_identifier", + "sample_material", + "sample_receipt_date", + "sample_transport_medium", + "sequence_accession", + 
"source_of_vaccine_information", + "species", + "strain", + "submission_date", + "subtype", + "sudden_onset", + "symptoms", + "taxon_lineage_ids", + "tobacco_use", + "travel_history", + "treatment", + "treatment_dosage", + "treatment_type", + "trimester_of_pregnancy", + "types_of_allergies", + "use_of_personal_protective_equipment", + "vaccination_type", + "vaccine_dosage", + "vaccine_lot_number", + "vaccine_manufacturer", + "ventilation" + ], + "serology": [ + "additional_metadata", + "collection_city", + "collection_country", + "collection_date", + "collection_state", + "collection_year", + "comments", + "contributing_institution", + "date_inserted", + "date_modified", + "genbank_accession", + "geographic_group", + "host_age", + "host_age_group", + "host_common_name", + "host_health", + "host_identifier", + "host_sex", + "host_species", + "host_type", + "id", + "positive_definition", + "project_identifier", + "sample_accession", + "sample_identifier", + "serotype", + "strain", + "taxon_lineage_ids", + "test_antigen", + "test_interpretation", + "test_pathogen", + "test_result", + "test_type", + "virus_identifier" + ] +} diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 8e7bbc4..5f1393c 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -8,14 +8,15 @@ import importlib -from q2_types.genome_data import GenomeData, Loci, Proteins +from q2_types.genome_data import GenomeData, Loci, Proteins, Genes, DNASequence from q2_types.metadata import ImmutableMetadata from qiime2.core.type import TypeMatch from qiime2.plugin import (Str, Plugin, Choices, List, Citations, Range, Int, Float, Visualization, Bool, TypeMap, Metadata, MetadataColumn, Categorical) -from .bv_brc import fetch_genomes_bv_brc +from .bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \ + fetch_genome_features_bv_brc, fetch_taxonomy_bv_brc from .subsample import subsample_fasta from .trim_alignment import trim_alignment from .merge import merge_taxa @@ 
# BV-BRC data types that can be requested with fetch_metadata_bv_brc.
datatypes_metadata = [
    "antibiotics",
    "enzyme_class_ref",
    "epitope",
    "epitope_assay",
    "experiment",
    "bioset",
    "bioset_result",
    "gene_ontology_ref",
    "genome",
    "strain",
    "genome_amr",
    "feature_sequence",
    "genome_feature",
    "genome_sequence",
    "id_ref",
    "misc_niaid_sgc",
    "pathway",
    "pathway_ref",
    "ppi",
    "protein_family_ref",
    "sequence_feature",
    "sequence_feature_vt",
    "sp_gene",
    "sp_gene_ref",
    "spike_lineage",
    "spike_variant",
    "structured_assertion",
    "subsystem",
    "subsystem_ref",
    "taxonomy",
    "protein_structure",
    "protein_feature",
    "surveillance",
    "serology",
]

# Genome sequences: either an RQL query or a list of genome IDs.
plugin.methods.register_function(
    function=fetch_genomes_bv_brc,
    inputs={},
    parameters={
        'rql_query': Str,
        'genome_ids': List[Str],
    },
    outputs=[('genomes', GenomeData[DNASequence])],
    input_descriptions={},
    parameter_descriptions={
        'rql_query': 'Query in RQL format. Check '
                     'https://www.bv-brc.org/api/doc/genome_sequence '
                     'for documentation.',
        'genome_ids': 'List of genome IDs from BV-BRC.',
    },
    output_descriptions={
        'genomes': 'Genome sequences from BV-BRC.',
    },
    name='Fetch genome sequences from BV-BRC.',
    description='Fetch genome sequences from BV-BRC.',
)

# Metadata of any BV-BRC data type listed in datatypes_metadata.
plugin.methods.register_function(
    function=fetch_metadata_bv_brc,
    inputs={},
    parameters={
        'data_type': Str % Choices(datatypes_metadata),
        'rql_query': Str,
    },
    outputs=[('metadata', ImmutableMetadata)],
    input_descriptions={},
    parameter_descriptions={
        'data_type': 'BV-BRC data type. Check '
                     'https://www.bv-brc.org/api/doc/ for documentation.',
        'rql_query': 'Query in RQL format. Check '
                     'https://www.bv-brc.org/api/doc/ for documentation.',
    },
    output_descriptions={
        'metadata': 'Metadata from BV-BRC.',
    },
    name='Fetch BV-BRC metadata.',
    description='Fetch BV-BRC metadata for a specific data type with an '
                'RQL query.',
)

# Taxonomy: either an RQL query or a list of taxon IDs.
plugin.methods.register_function(
    function=fetch_taxonomy_bv_brc,
    inputs={},
    parameters={
        'rql_query': Str,
        'ranks': List[Str % Choices(_allowed_ranks)],
        'taxon_ids': List[Str],
    },
    outputs=[('taxonomy', FeatureData[Taxonomy])],
    input_descriptions={},
    parameter_descriptions={
        'rql_query': 'Query in RQL format. Check '
                     'https://www.bv-brc.org/api/doc/taxonomy '
                     'for documentation.',
        'ranks': 'List of taxonomic ranks for building a taxonomy from '
                 "BV-BRC taxonomy data. [default: '" +
                 "', '".join(_default_ranks) + "']",
        'taxon_ids': 'List of taxon IDs from BV-BRC.',
    },
    output_descriptions={
        'taxonomy': 'Taxonomy data.',
    },
    name='Fetch taxonomy data from BV-BRC.',
    description='Fetch taxonomy data from BV-BRC.',
)
Check ' + 'https://www.bv-brc.org/api/doc/genome_feature ' + 'for documentation.', + 'feature_ids': 'List of feature IDs from BV-BRC.', + }, + output_descriptions={ + 'genes': 'genes', + 'proteins': 'proteins' + +}, + name='Fetch genome features from BV-BRC.', + description='Fetch DNA and protein sequences of genome features from BV-BRC.', +) + # Registrations plugin.register_semantic_types(SILVATaxonomy, SILVATaxidMap) plugin.register_semantic_type_to_format( diff --git a/rescript/testing.py b/rescript/testing.py new file mode 100644 index 0000000..d4a63be --- /dev/null +++ b/rescript/testing.py @@ -0,0 +1,15 @@ +import requests + + +response = requests.get("https://www.bv-brc.org/api/genome_sequence/?in(genome_id,(224308.43))") + +# Raise an error if the request was not successful +response.raise_for_status() + +# Load the response data as JSON +data = response.json() + +# Count the number of entries in the JSON dictionary +num_entries = len(data) + +print(num_entries) \ No newline at end of file diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py index bdbf843..9edfd5c 100644 --- a/rescript/tests/test_bv_brc.py +++ b/rescript/tests/test_bv_brc.py @@ -5,15 +5,453 @@ # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- +import unittest +from typing import Any +from unittest.mock import Mock, patch, mock_open, MagicMock + +import pandas as pd +from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat, \ + TSVTaxonomyDirectoryFormat +from q2_types.genome_data import GenomeSequencesDirectoryFormat from qiime2.plugin.testing import TestPluginBase -from rescript.bv_brc import fetch_genomes_bv_brc +from rescript.bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \ + fetch_genome_features_bv_brc, fetch_taxonomy_bv_brc, id_list_handling, \ + error_handling, download_data, json_to_fasta, transform_taxonomy_df, \ + parse_lineage_names_with_ranks + + +class TestIDListHandling(TestPluginBase): + package = 'rescript.tests' + + def test_error_both_parameters_given(self): + with self.assertRaisesRegex(ValueError, + "Parameters rql_query and ids can't be used " + "simultaneously."): + id_list_handling(rql_query="some_query", + ids=[1, 2, 3], + parameter_name="ids", + data_field="id") + + def test_error_neither_parameter_given(self): + with self.assertRaisesRegex(ValueError, + "At least one of the parameters rql_query and ids " + "has to be given."): + id_list_handling(rql_query="", + ids=[], + parameter_name="ids", + data_field="id") + + def test_correct_rql_query_generation(self): + result = id_list_handling( + rql_query="", + ids=[1, 2, 3], + parameter_name="ids", + data_field="id") + expected_query = "in(id,(1,2,3))" + self.assertEqual(result, expected_query) + + +class TestErrorHandling(TestPluginBase): + package = 'rescript.tests' + + def setUp(self): + super().setUp() + self.response = Mock() + + def test_no_data_found(self): + self.response.text = "[]" + + with self.assertRaisesRegex(ValueError, "No data"): + error_handling(self.response, data_type="genome") + + def test_database_error_occurred_undefined_field_object(self): + self.response.text = ('A Database Error Occured: {"code": 
400, ' + '"msg": "undefined field object in RQL"}') + + with self.assertRaisesRegex(ValueError, "undefined field object"): + error_handling(self.response, data_type="genome") + + def test_database_error_occurred_undefined_field(self): + self.response.text = ('A Database Error Occured: {"code": 400, ' + '"msg": "undefined field"}') + + with self.assertRaisesRegex(ValueError, "undefined field"): + error_handling(self.response, data_type="genome") + + def test_database_error_occurred_general_error(self): + self.response.text = ('A Database Error Occured: {"code": 500, "msg": ' + '"Internal Server Error"}') + + with self.assertRaisesRegex(ValueError, "Internal Server Error"): + error_handling(self.response, data_type="genome") + + def test_unhandled_response(self): + self.response.text = "Unexpected error message" + + with self.assertRaisesRegex(ValueError, "Unexpected error"): + error_handling(self.response, data_type="genome") + + +class TestDownloadData(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.requests.get') + @patch('rescript.bv_brc.error_handling') + def test_download_data_success(self, mock_error_handling, mock_requests_get): + # Mock the requests.get response for a successful request + mock_response = Mock() + mock_response.status_code = 200 + mock_requests_get.return_value = mock_response + + url = "http://example.com/data" + data_type = "some_type" + + result = download_data(url, data_type) + + mock_requests_get.assert_called_once_with(url) + self.assertEqual(result, mock_response) + + @patch('rescript.bv_brc.requests.get') + @patch('rescript.bv_brc.error_handling') + def test_download_data_error_400(self, mock_error_handling, mock_requests_get): + # Mock the requests.get response for a 400 Bad Request + mock_response = Mock() + mock_response.status_code = 400 + mock_requests_get.return_value = mock_response + + url = "http://example.com/data" + data_type = "some_type" + + download_data(url, data_type) + + 
mock_requests_get.assert_called_once_with(url) + mock_error_handling.assert_called_once_with(mock_response, data_type) + + @patch('rescript.bv_brc.requests.get') + @patch('rescript.bv_brc.error_handling') + def test_download_data_other_error(self, mock_error_handling, mock_requests_get): + # Mock the requests.get response for any other error + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Server Error" + mock_requests_get.return_value = mock_response + + url = "http://example.com/data" + data_type = "some_type" + + with self.assertRaisesRegex(ValueError, "Server Error"): + download_data(url, data_type) + + mock_requests_get.assert_called_once_with(url) + mock_error_handling.assert_not_called() + + +class TestJsonToFasta(TestPluginBase): + package = 'rescript.tests' + + def setUp(self): + super().setUp() + + self.json_input_1 = [ + { + "genome_id": "genome1", + "accession": "acc1", + "description": "desc1", + "genome_name": "genome_name1", + "sequence": "ATGC" + } + ] + + self.json_input_2 = [ + { + "genome_id": "genome2", + "accession": "acc2", + "description": "desc2", + "genome_name": "genome_name2", + "sequence": "CGTA" + } + ] + + @patch('rescript.bv_brc.open', new_callable=mock_open) + def test_json_to_fasta_single_genome(self, mock_file): + json_to_fasta(self.json_input_1, "/fake/dir") + + # Expected FASTA content + expected_fasta = ">accn|acc1 desc1 [genome_name1 | genome1]\nATGC" + + # Check if the file was created with the correct path and content + mock_file.assert_called_once_with("/fake/dir/genome1.fasta", 'w') + mock_file().write.assert_called_once_with(expected_fasta) + + @patch('rescript.bv_brc.open', new_callable=mock_open) + def test_json_to_fasta_multiple_genomes(self, mock_file): + json_to_fasta(self.json_input_1 + self.json_input_2, "/fake/dir") + + # Expected FASTA content + expected_fasta_genome1 = ">accn|acc1 desc1 [genome_name1 | genome1]\nATGC" + expected_fasta_genome2 = ">accn|acc2 desc2 [genome_name2 | 
genome2]\nCGTA" + + # Check if the files were created with the correct path and content + mock_file().write.assert_any_call(expected_fasta_genome1) + mock_file().write.assert_any_call(expected_fasta_genome2) + + @patch('rescript.bv_brc.open', new_callable=mock_open) + def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file): + + json_to_fasta(self.json_input_1 + self.json_input_1, "/fake/dir") + + # Expected FASTA content + expected_fasta = (">accn|acc1 desc1 [genome_name1 | genome1]\nATGC\n" + ">accn|acc1 desc1 [genome_name1 | genome1]\nATGC") + + # Check if the file was created with the correct path and content + mock_file.assert_called_once_with("/fake/dir/genome1.fasta", 'w') + mock_file().write.assert_called_once_with(expected_fasta) -class TestPipelines(TestPluginBase): +class TestFetchGenomeFeaturesBVBR(TestPluginBase): package = 'rescript.tests' - def test_fetch_genomes_bv_brc(self): - query = "?eq(genome_id,224308.43)" - query2 = "?eq(taxon_id,224308)" - fetch_genomes_bv_brc(query2) \ No newline at end of file + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.id_list_handling') + @patch.object(MixedCaseDNAFASTAFormat, 'open') + @patch.object(ProteinFASTAFormat, 'open') + def test_fetch_genome_features_bv_brc( + self, mock_protein_open, mock_genes_open, mock_id_list_handling, + mock_download_data + ): + # Mock the id_list_handling function + mock_id_list_handling.return_value = "in(feature_id, (feature1,feature2))" + + # Mock the responses from download_data + mock_genes_response = MagicMock() + mock_genes_response.text = ">gene1\nATGC\n>gene2\nATGC" + mock_proteins_response = MagicMock() + mock_proteins_response.text = ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK" + mock_download_data.side_effect = [mock_genes_response, + mock_proteins_response] + + # Mock file write actions + mock_genes_file = MagicMock() + mock_protein_file = MagicMock() + mock_genes_open.return_value.__enter__.return_value = mock_genes_file + 
mock_protein_open.return_value.__enter__.return_value = mock_protein_file + + # Call the function + genes, proteins = fetch_genome_features_bv_brc( + rql_query="in(feature_id, (feature1,feature2))", + feature_ids=["feature1", "feature2"] + ) + + # Assertions + mock_id_list_handling.assert_called_once_with( + rql_query="in(feature_id, (feature1,feature2))", + ids=["feature1", "feature2"], + parameter_name="feature_ids", + data_field="feature_id" + ) + + mock_download_data.assert_any_call( + url="https://www.bv-brc.org/api/genome_feature/?in(feature_id, " + "(feature1,feature2))&http_accept=application/dna+fasta", + data_type="genome_feature" + ) + + mock_download_data.assert_any_call( + url="https://www.bv-brc.org/api/genome_feature/?in(feature_id, " + "(feature1,feature2))&http_accept=application/protein+fasta", + data_type="genome_feature" + ) + + # Check that the correct data is written to the correct files + mock_genes_file.write.assert_called_once_with(">gene1\nATGC\n>gene2\nATGC") + mock_protein_file.write.assert_called_once_with( + ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK") + + self.assertIsInstance(genes, MixedCaseDNAFASTAFormat) + self.assertIsInstance(proteins, ProteinFASTAFormat) + + +class TestFetchGenomesBVBRC(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.json_to_fasta') + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.id_list_handling') + def test_fetch_genomes_bv_brc( + self, mock_id_list_handling, mock_download_data, mock_json_to_fasta + ): + # Mock the id_list_handling function + mock_id_list_handling.return_value = "genome_id=in(genome1,genome2)" + + # Mock the download_data response + mock_response = MagicMock() + mock_response.json.return_value = {'genomes': ['genome_data']} + mock_download_data.return_value = mock_response + + # Call the function + genomes = fetch_genomes_bv_brc( + rql_query="genome_id=in(genome1,genome2)", + genome_ids=["genome1", "genome2"] + ) + + # Assertions + 
mock_id_list_handling.assert_called_once_with( + rql_query="genome_id=in(genome1,genome2)", + ids=["genome1", "genome2"], + parameter_name="genome_ids", + data_field="genome_id" + ) + + mock_download_data.assert_called_once_with( + url="https://www.bv-brc.org/api/genome_sequence/" + "?genome_id=in(genome1,genome2)", + data_type="genome_sequence" + ) + + mock_json_to_fasta.assert_called_once_with( + {'genomes': ['genome_data']}, + str(genomes) + ) + + self.assertIsInstance(genomes, GenomeSequencesDirectoryFormat) + + +class TestFetchMetadataBVBR(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.qiime2.Metadata') + @patch('rescript.bv_brc.pd.read_csv') + @patch('rescript.bv_brc.download_data') + def test_fetch_metadata_bv_brc(self, mock_download_data, mock_read_csv, mock_metadata): + # Mock the download_data response + mock_response = MagicMock() + mock_response.text = "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4" + mock_download_data.return_value = mock_response + + # Mock the pandas read_csv return value + mock_df = pd.DataFrame({ + 'column1': ['data1', 'data3'], + 'column2': ['data2', 'data4'] + }, index=pd.Index(['1', '2'], name='id')) + mock_read_csv.return_value = mock_df + + # Mock qiime2.Metadata creation + mock_metadata_instance = MagicMock() + mock_metadata.return_value = mock_metadata_instance + + # Call the function + result = fetch_metadata_bv_brc( + data_type="genome", + rql_query="genome_id=in(1,2)" + ) + + # Assertions + mock_download_data.assert_called_once_with( + url="https://www.bv-brc.org/api/genome/?genome_id=in(1,2)&http_accept=text/tsv", + data_type="genome" + ) + + mock_read_csv.assert_called_once() + args, kwargs = mock_read_csv.call_args + self.assertEqual(kwargs['sep'], '\t') + + self.assertEqual(args[0].getvalue(), "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4") + + mock_metadata.assert_called_once_with(mock_df) + self.assertEqual(result, mock_metadata_instance) + + +class 
TestFetchTaxonomyBVBR(TestPluginBase): + package = 'rescript.tests' + + @patch('pandas.DataFrame.to_csv') + @patch('rescript.bv_brc.transform_taxonomy_df') + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.pd.read_csv') + @patch('rescript.bv_brc.id_list_handling') + def test_fetch_taxonomy_bv_brc( + self, mock_id_list_handling, mock_read_csv, mock_download_data, mock_transform_taxonomy_df, mock_to_csv + ): + # Mock the id_list_handling function + mock_id_list_handling.return_value = "taxon_id=in(taxon1,taxon2)" + + # Mock the download_data response + mock_response = MagicMock() + mock_response.text = "id\trank1\trank2\n1\tdata1\tdata2\n2\tdata3\tdata4" + mock_download_data.return_value = mock_response + + + # Prepare mocks for file output + with patch('builtins.open', unittest.mock.mock_open()) as mock_file: + directory = fetch_taxonomy_bv_brc( + rql_query="taxon_id=in(taxon1,taxon2)", + ranks=['rank1', 'rank2'], + taxon_ids=["taxon1", "taxon2"] + ) + + # Assertions + mock_id_list_handling.assert_called_once_with( + rql_query="taxon_id=in(taxon1,taxon2)", + ids=["taxon1", "taxon2"], + parameter_name="taxon_ids", + data_field="taxon_id" + ) + + mock_download_data.assert_called_once_with( + url="https://www.bv-brc.org/api/taxonomy/?taxon_id=in(taxon1,taxon2)&http_accept=text/tsv", + data_type="taxonomy" + ) + + self.assertIsInstance(directory, TSVTaxonomyDirectoryFormat) + + @patch('rescript.bv_brc.parse_lineage_names_with_ranks') + def test_transform_taxonomy_df(self, mock_parse_lineage_names_with_ranks): + # Mock the parse_lineage_names_with_ranks function + mock_parse_lineage_names_with_ranks.side_effect = \ + lambda lineage_names, lineage_ranks, ranks: "Mocked Taxon" + + # Create a sample DataFrame + df = pd.DataFrame({ + 'taxon_id': ['taxon1', 'taxon2'], + 'lineage_names': ['name1;name2', 'name3;name4'], + 'lineage_ranks': ['rank1;rank2', 'rank3;rank4'] + }) + + ranks = ['rank1', 'rank2', 'rank3'] + + # Call the function + result_df = 
transform_taxonomy_df(df, ranks) + + # Expected DataFrame after transformation + expected_df = pd.DataFrame({ + 'Feature ID': ['taxon1', 'taxon2'], + 'Taxon': ['Mocked Taxon', 'Mocked Taxon'] + }).set_index('Feature ID') + + # Assert that the result matches the expected DataFrame + pd.testing.assert_frame_equal(result_df, expected_df) + + def test_parse_with_missing_ranks(self): + lineage_names = "Bacteria;Proteobacteria;Enterobacteriaceae" + lineage_ranks = "kingdom;phylum;family" + ranks = ['kingdom', 'phylum', 'class', 'order', 'genus', 'species'] + + result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks) + expected = "k__Bacteria; p__Proteobacteria; c__; o__; g__; s__" + + self.assertEqual(result, expected) + + def test_parse_with_no_ranks_provided(self): + lineage_names = ("Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;" + "Enterobacteriaceae;Escherichia;coli") + lineage_ranks = "kingdom;phylum;class;order;family;genus;species" + ranks = None # Should fall back to _default_ranks + + result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks) + expected = ("k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; " + "o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; " + "s__coli") + + self.assertEqual(result, expected)