diff --git a/deepsearch/chemistry/__init__.py b/deepsearch/chemistry/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/deepsearch/chemistry/queries/__init__.py b/deepsearch/chemistry/queries/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/deepsearch/chemistry/queries/molecules.py b/deepsearch/chemistry/queries/molecules.py
new file mode 100644
index 00000000..21cc3e04
--- /dev/null
+++ b/deepsearch/chemistry/queries/molecules.py
@@ -0,0 +1,168 @@
+from enum import Enum
+from typing import List, Union
+
+from pydantic import BaseModel
+
+from deepsearch.cps.client.queries import Query
+
+from ..resources import ChemVecDbResource, KnowledgeDbResource
+
+
+# Search modes accepted by `MoleculeQuery`.
+class MolQueryType(str, Enum):
+    SIMILARITY = "similarity"
+    SUBSTRUCTURE = "substructure"
+
+
+# Languages in which the `query` string of `MoleculeQuery` can be written.
+class MolQueryLang(str, Enum):
+    SMILES = "smiles"
+    SMARTS = "smarts"
+
+
+# Identifier schemes that a `MolId` can carry.
+class MolIdType(str, Enum):
+    SMILES = "smiles"
+    SMARTS = "smarts"
+    INCHI = "inchi"
+    INCHIKEY = "inchikey"
+
+
+# Molecule identifier: the scheme in `type`, the identifier text in `value`.
+class MolId(BaseModel):
+    type: MolIdType
+    value: str
+
+
+# Vector database collection searched for each query type.
+CHEMVECDB_COLLECTIONS = {
+    MolQueryType.SIMILARITY: "patcid_tanimoto",
+    MolQueryType.SUBSTRUCTURE: "patcid_substructure",
+}
+
+
+def MoleculeQuery(
+    query: str,
+    query_type: MolQueryType,
+    query_lang: MolQueryLang = MolQueryLang.SMILES,
+    num_items: int = 10,
+) -> Query:
+    """
+    Build a query that searches the Deep Search vector database for
+    molecules by similarity or by substructure.
+
+    Three tasks are chained: a `ChemVec` search, a `Projection` of its
+    `compounds` output onto the `["$$map", "id"]` field path, and a
+    `DbSubject` lookup of the projected nodes in the knowledge database.
+    The looked-up subjects are exposed as the `molecules` output of the
+    response.
+
+    `num_items` is used both as the `topk` of the search and as the `limit`
+    of the lookup.
+    """
+
+    workflow = Query()
+
+    # The collection to search is selected by the query type.
+    search_params = {
+        "query_type": query_type,
+        "query_lang": query_lang,
+        "coll_name": CHEMVECDB_COLLECTIONS[query_type],
+        "query": query,
+        "topk": num_items,
+    }
+    search = workflow.add(
+        "ChemVec",
+        task_id="vec_search",
+        parameters=search_params,
+        coordinates=ChemVecDbResource(),
+    )
+
+    id_projection = {"nodes": {"field_path": ["$$map", "id"]}}
+    projection = workflow.add(
+        "Projection",
+        task_id="projection",
+        parameters={"projections": id_projection},
+        inputs={"nodes": search.output("compounds")},
+    )
+
+    lookup_params = {
+        "identifiers": {
+            "persistent_identifiers": {"#Input": {"db_lookup": "nodes"}},
+        },
+        "limit": num_items,
+    }
+    lookup = workflow.add(
+        "DbSubject",
+        task_id="db_lookup",
+        parameters=lookup_params,
+        inputs={"nodes": projection.output("nodes")},
+        coordinates=KnowledgeDbResource(),
+    )
+    lookup.output("subjects").output_as("molecules")
+
+    return workflow
+
+
+def MoleculesInPatentsQuery(
+    patents: Union[str, List[str]],
+    num_items: int = 10,
+) -> Query:
+    """
+    Build a query listing the molecules contained in one or more patents.
+
+    `patents` is a single patent id or a list of them; a lone string is
+    treated as a one-element list. `num_items` is passed as the lookup
+    `limit`. The looked-up subjects are exposed as the `molecules` output
+    of the response.
+    """
+
+    patent_ids = [patents] if isinstance(patents, str) else patents
+
+    workflow = Query()
+    patent_refs = [{"type": "patentid", "value": pid} for pid in patent_ids]
+    lookup = workflow.add(
+        "DbSubject",
+        task_id="db_lookup",
+        parameters={
+            "references": {"identifiers": patent_refs},
+            "limit": num_items,
+        },
+        coordinates=KnowledgeDbResource(),
+    )
+    lookup.output("subjects").output_as("molecules")
+
+    return workflow
+
+
+def PatentsWithMoleculesQuery(
+    molecules: List[MolId],
+    num_items: int = 10,
+) -> Query:
+    """
+    Build a query listing the patents that contain any of the given
+    molecules.
+
+    Each `MolId` contributes one subject identifier made of its type and
+    value. `num_items` is passed as the lookup `limit`. The looked-up
+    documents are exposed as the `patents` output of the response.
+    """
+
+    workflow = Query()
+
+    subject_ids = []
+    for mol in molecules:
+        subject_ids.append({"type": mol.type.value, "value": mol.value})
+
+    lookup = workflow.add(
+        "DbDocument",
+        task_id="db_lookup",
+        parameters={
+            "subjects": {"identifiers": subject_ids},
+            "limit": num_items,
+        },
+        coordinates=KnowledgeDbResource(),
+    )
+    lookup.output("documents").output_as("patents")
+
+    return workflow
diff --git a/deepsearch/chemistry/resources.py b/deepsearch/chemistry/resources.py
new file mode 100644
index 00000000..51819afa
--- /dev/null
+++ b/deepsearch/chemistry/resources.py
@@ -0,0 +1,14 @@
+class ChemVecDbResource:
+    """Resource descriptor for the `chem_vecdb` vector database instance."""
+
+    def to_resource(self):
+        """Return the descriptor as a plain dict."""
+        return dict(type="dsvecdb", instance_id="chem_vecdb")
+
+
+class KnowledgeDbResource:
+    """Resource descriptor for the `knowledge_db` database instance."""
+
+    def to_resource(self):
+        """Return the descriptor as a plain dict."""
+        return dict(type="db", instance_id="knowledge_db")