From a641c6d1c021780d30aa1e0907d2353eb8ea67f5 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 7 Aug 2024 13:15:38 +0200 Subject: [PATCH] working fetching genome sequence and metadata genome --- rescript/bv_brc.py | 60 +++++++++++++++++++++++++++++++++++ rescript/plugin_setup.py | 17 ++++++++++ rescript/tests/test_bv_brc.py | 19 +++++++++++ 3 files changed, 96 insertions(+) create mode 100644 rescript/bv_brc.py create mode 100644 rescript/tests/test_bv_brc.py diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py new file mode 100644 index 0000000..07df950 --- /dev/null +++ b/rescript/bv_brc.py @@ -0,0 +1,60 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2019-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from io import StringIO + +import qiime2 +import pandas as pd +import requests +from q2_types.feature_data import MixedCaseDNAFASTAFormat + + +def json_to_fasta(json: dict): + fasta_output = [] + for entry in json: + header = (f">accn|{entry['sequence_id']} {entry['description']} " + f"[{entry['genome_name']} | {entry['genome_id']}]") + fasta_output.append(f"{header}\n{entry['sequence']}") + return "\n".join(fasta_output) + + +def fetch_genomes_bv_brc(rql_query: str) -> (MixedCaseDNAFASTAFormat, qiime2.Metadata): + genomes = MixedCaseDNAFASTAFormat() + + # Make the GET request for metadata + url_metadata = f"https://www.bv-brc.org/api/genome/{rql_query}&http_accept=text/tsv" + response_metadata = requests.get(url_metadata) + + if response_metadata.status_code == 200: + # Convert TSV data to dataframe + tsv_data = StringIO(response_metadata.text) + metadata = pd.read_csv(tsv_data, sep='\t', index_col="genome_id") + + metadata.index.name = "id" + metadata.index = metadata.index.astype(str) + + # Extract all genome_ids out of dataframe + genome_ids = metadata.index.tolist() + else: + raise ValueError("Error") + + # Make the GET request for sequences + url_sequences = (f"https://www.bv-brc.org/api/genome_sequence/" + f"?in(genome_id,({','.join(genome_ids)}))") + response_sequences = requests.get(url_sequences) + + if response_sequences.status_code == 200: + # Convert JSON to FASTA + fasta = json_to_fasta(response_sequences.json()) + + # Write FASTA format to file + with genomes.open() as file: + file.write(fasta) + else: + raise ValueError("Error") + + return genomes, qiime2.Metadata(metadata) diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 1164d2a..8e7bbc4 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -9,11 +9,13 @@ import importlib from q2_types.genome_data import GenomeData, Loci, Proteins +from q2_types.metadata import ImmutableMetadata from qiime2.core.type import TypeMatch from qiime2.plugin import (Str, Plugin, Choices, List, Citations, Range, Int, Float, Visualization, Bool, TypeMap, Metadata, MetadataColumn, Categorical) +from .bv_brc import fetch_genomes_bv_brc from .subsample import subsample_fasta from .trim_alignment import trim_alignment from .merge import merge_taxa @@ -1228,6 +1230,21 @@ ] ) +plugin.methods.register_function( + function=fetch_genomes_bv_brc, + inputs={}, + parameters={'rql_query': Str}, + outputs=[('genomes', FeatureData[Sequence]), + ('metadata', ImmutableMetadata)], + input_descriptions={}, + parameter_descriptions={'rql_query': 'query'}, + output_descriptions={ + 'genomes': 'genomes', + 'metadata': 'metadata'}, + name='fetch genomes', + description="fetch genomes", +) + # Registrations plugin.register_semantic_types(SILVATaxonomy, SILVATaxidMap) plugin.register_semantic_type_to_format( diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py new file mode 100644 index 0000000..bdbf843 --- /dev/null +++ b/rescript/tests/test_bv_brc.py @@ -0,0 +1,19 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2019-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from qiime2.plugin.testing import TestPluginBase + +from rescript.bv_brc import fetch_genomes_bv_brc + + +class TestPipelines(TestPluginBase): + package = 'rescript.tests' + + def test_fetch_genomes_bv_brc(self): + query = "?eq(genome_id,224308.43)" + query2 = "?eq(taxon_id,224308)" + fetch_genomes_bv_brc(query2) \ No newline at end of file