From 4154d3b25914a84bf2f6cd71d4173b624f4f8199 Mon Sep 17 00:00:00 2001
From: ilyasosman
Date: Wed, 12 Jun 2024 22:00:36 +0300
Subject: [PATCH 1/2] Add Oracle DB as vector store

---
 application/core/settings.py              |  22 +++--
 application/requirements.txt              |   1 +
 application/vectorstore/oracledb.py       | 101 ++++++++++++++++++++++
 application/vectorstore/vector_creator.py |   2 +
 4 files changed, 121 insertions(+), 5 deletions(-)
 create mode 100644 application/vectorstore/oracledb.py

diff --git a/application/core/settings.py b/application/core/settings.py
index 6ae5475ce..2eeab7114 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -4,12 +4,16 @@
 from pydantic_settings import BaseSettings
 
-current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+current_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 
 
 class Settings(BaseSettings):
     LLM_NAME: str = "docsgpt"
-    MODEL_NAME: Optional[str] = None  # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo
+    MODEL_NAME: Optional[str] = (
+        None  # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo
+    )
     EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2"
     CELERY_BROKER_URL: str = "redis://localhost:6379/0"
     CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
@@ -19,16 +23,20 @@ class Settings(BaseSettings):
     MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
     UPLOAD_FOLDER: str = "inputs"
     VECTOR_STORE: str = "faiss"  # "faiss" or "elasticsearch" or "qdrant"
-    RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
+    RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"]  # also brave_search
 
     API_URL: str = "http://localhost:7091"  # backend url for celery worker
 
     API_KEY: Optional[str] = None  # LLM api key
-    EMBEDDINGS_KEY: Optional[str] = None  # api key for embeddings (if using openai, just copy API_KEY)
+    EMBEDDINGS_KEY: Optional[str] = (
+        None  # api key for embeddings (if using openai, just copy API_KEY)
+    )
     OPENAI_API_BASE: Optional[str] = None  # azure openai api base url
     OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
     AZURE_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for answering
-    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for embeddings
+    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = (
+        None  # azure deployment name for embeddings
+    )
 
     # elasticsearch
     ELASTIC_CLOUD_ID: Optional[str] = None  # cloud id for elasticsearch
@@ -61,6 +69,10 @@ class Settings(BaseSettings):
     QDRANT_PATH: Optional[str] = None
     QDRANT_DISTANCE_FUNC: str = "Cosine"
 
+    # Oracle DB config
+    # ORACLE_URI="your_username/your_password@localhost:1521/docsgpt"
+    ORACLE_URI: Optional[str] = None  # Oracle DB connection string
+
     BRAVE_SEARCH_API_KEY: Optional[str] = None
 
     FLASK_DEBUG_MODE: bool = False
diff --git a/application/requirements.txt b/application/requirements.txt
index b072885d3..b637bd8ef 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -16,6 +16,7 @@ javalang==0.13.0
 langchain==0.1.4
 langchain-openai==0.0.5
 openapi3_parser==1.1.16
+oracledb==2.2.1
 pandas==2.2.0
 pydantic_settings==2.1.0
 pymongo==4.6.3
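The new ORACLE_URI option above is an ordinary pydantic-settings field, so it is read from the process environment (or a .env file) like the existing options. A minimal configuration sketch follows, assuming application.core.settings exposes a module-level settings object, as the new oracledb store imports below expect; the username, password and service name are placeholders, not values from this patch.

# Hypothetical configuration sketch -- not part of the patch. It assumes the
# Settings fields are populated from the environment by pydantic-settings and
# that application.core.settings exposes a module-level `settings` instance.
import os

os.environ["VECTOR_STORE"] = "oracledb"  # select the backend added by this patch
os.environ["ORACLE_URI"] = "your_username/your_password@localhost:1521/docsgpt"

from application.core.settings import settings

print(settings.VECTOR_STORE)  # -> "oracledb"
print(settings.ORACLE_URI)    # -> the connection string set above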
diff --git a/application/vectorstore/oracledb.py b/application/vectorstore/oracledb.py
new file mode 100644
index 000000000..fac642a5b
--- /dev/null
+++ b/application/vectorstore/oracledb.py
@@ -0,0 +1,101 @@
+# oracle_db.py
+from application.vectorstore.base import BaseVectorStore
+from application.core.settings import settings
+from application.vectorstore.document_class import Document
+
+
+class OracleDBVectorStore(BaseVectorStore):
+    def __init__(
+        self,
+        embeddings_key: str = "embeddings",
+        table: str = "documents",
+        text_key: str = "text",
+        embedding_key: str = "embedding",
+        database: str = "docsgpt",
+    ):
+        self._table = table
+        self._text_key = text_key
+        self._embedding_key = embedding_key
+        self._embeddings_key = embeddings_key
+        self._oracle_uri = settings.ORACLE_URI
+        self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
+
+        try:
+            import oracledb
+        except ImportError:
+            raise ImportError(
+                "Could not import oracledb python package. "
+                "Please install it with `pip install oracledb`."
+            )
+
+        self._connection = oracledb.connect(self._oracle_uri)
+        self._cursor = self._connection.cursor()
+
+    def search(self, question, k=2, *args, **kwargs):
+        query_vector = self._embedding.embed_query(question)
+
+        query = f"""
+            SELECT {self._text_key}, {self._embedding_key}, METADATA
+            FROM {self._table}
+            ORDER BY SDO_GEOM.SDO_DISTANCE(SDO_GEOMETRY({query_vector}),
+                                           SDO_GEOMETRY({self._embedding_key})) ASC
+            FETCH FIRST {k} ROWS ONLY
+        """
+
+        self._cursor.execute(query)
+        results = []
+        for row in self._cursor.fetchall():
+            text, embedding, metadata = row
+            results.append(Document(text, metadata))
+        return results
+
+    def _insert_texts(self, texts, metadatas):
+        if not texts:
+            return []
+
+        embeddings = self._embedding.embed_documents(texts)
+        to_insert = [
+            (t, embedding, m) for t, m, embedding in zip(texts, metadatas, embeddings)
+        ]
+
+        query = f"""
+            INSERT INTO {self._table} ({self._text_key}, {self._embedding_key}, METADATA)
+            VALUES (:1, :2, :3)
+        """
+
+        self._cursor.executemany(query, to_insert)
+        self._connection.commit()
+        return [i[0] for i in self._cursor.fetchall()]
+
+    def add_texts(
+        self,
+        texts,
+        metadatas=None,
+        ids=None,
+        refresh_indices=True,
+        create_index_if_not_exists=True,
+        bulk_kwargs=None,
+        **kwargs,
+    ):
+        batch_size = 100
+        _metadatas = metadatas or ({} for _ in texts)
+        texts_batch = []
+        metadatas_batch = []
+        result_ids = []
+
+        for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
+            texts_batch.append(text)
+            metadatas_batch.append(metadata)
+            if (i + 1) % batch_size == 0:
+                result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+                texts_batch = []
+                metadatas_batch = []
+
+        if texts_batch:
+            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+        return result_ids
+
+    def delete_index(self, *args, **kwargs):
+        query = f"DELETE FROM {self._table} WHERE 1=1"
+        self._cursor.execute(query)
+        self._connection.commit()
diff --git a/application/vectorstore/vector_creator.py b/application/vectorstore/vector_creator.py
index 27b386450..6a6007a6c 100644
--- a/application/vectorstore/vector_creator.py
+++ b/application/vectorstore/vector_creator.py
@@ -2,6 +2,7 @@
 from application.vectorstore.elasticsearch import ElasticsearchStore
 from application.vectorstore.mongodb import MongoDBVectorStore
 from application.vectorstore.qdrant import QdrantStore
+from application.vectorstore.oracledb import OracleDBVectorStore
 
 
 class VectorCreator:
@@ -10,6 +11,7 @@ class VectorCreator:
         "elasticsearch": ElasticsearchStore,
         "mongodb": MongoDBVectorStore,
         "qdrant": QdrantStore,
+        "oracledb": OracleDBVectorStore,
     }
 
     @classmethod
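Taken together, oracledb.py and the vector_creator.py registration above make "oracledb" a selectable backend. The store reads and writes a table (default name "documents") holding text, embedding and metadata columns; the patch does not create that table or document its schema. A minimal usage sketch under those assumptions follows; it needs a reachable Oracle instance at ORACLE_URI and whatever embedding model EMBEDDINGS_NAME points at.

# Usage sketch -- not part of the patch. Assumes settings.ORACLE_URI points at
# a reachable Oracle database that already contains the table the store queries
# (default name "documents", with text, embedding and metadata columns).
from application.vectorstore.oracledb import OracleDBVectorStore

store = OracleDBVectorStore()  # connects using settings.ORACLE_URI

# add_texts() embeds the texts and hands them to _insert_texts() in batches of 100
store.add_texts(
    ["DocsGPT can now keep document embeddings in Oracle DB."],
    metadatas=[{"source": "example.md"}],
)

# search() embeds the question and asks Oracle for the k closest rows
for doc in store.search("Where are the embeddings stored?", k=2):
    print(doc)

From b07c1670f1acd5e92d93afdbe73d7264158453f0 Mon Sep 17 00:00:00 2001
From: Ilyas Osman
Date: Wed, 12 Jun 2024 22:26:13 +0300
Subject: [PATCH 2/2] Update settings.py
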
---
 application/core/settings.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/application/core/settings.py b/application/core/settings.py
index 2eeab7114..bdb60b8c2 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -4,16 +4,12 @@
 from pydantic_settings import BaseSettings
 
-current_dir = os.path.dirname(
-    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-)
+current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
 class Settings(BaseSettings):
     LLM_NAME: str = "docsgpt"
-    MODEL_NAME: Optional[str] = (
-        None  # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo
-    )
+    MODEL_NAME: Optional[str] = None  # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo
     EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2"
     CELERY_BROKER_URL: str = "redis://localhost:6379/0"
     CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
@@ -23,20 +19,16 @@ class Settings(BaseSettings):
     MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
     UPLOAD_FOLDER: str = "inputs"
     VECTOR_STORE: str = "faiss"  # "faiss" or "elasticsearch" or "qdrant"
-    RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"]  # also brave_search
+    RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
 
     API_URL: str = "http://localhost:7091"  # backend url for celery worker
 
     API_KEY: Optional[str] = None  # LLM api key
-    EMBEDDINGS_KEY: Optional[str] = (
-        None  # api key for embeddings (if using openai, just copy API_KEY)
-    )
+    EMBEDDINGS_KEY: Optional[str] = None  # api key for embeddings (if using openai, just copy API_KEY)
     OPENAI_API_BASE: Optional[str] = None  # azure openai api base url
     OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
     AZURE_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for answering
-    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = (
-        None  # azure deployment name for embeddings
-    )
+    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for embeddings
 
     # elasticsearch
     ELASTIC_CLOUD_ID: Optional[str] = None  # cloud id for elasticsearch
@@ -69,7 +61,6 @@ class Settings(BaseSettings):
     QDRANT_PATH: Optional[str] = None
     QDRANT_DISTANCE_FUNC: str = "Cosine"
 
-    # Oracle DB config
     # ORACLE_URI="your_username/your_password@localhost:1521/docsgpt"
     ORACLE_URI: Optional[str] = None  # Oracle DB connection string
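With the follow-up patch applied, ORACLE_URI is the only Oracle-specific setting left in settings.py. Selecting the new backend end to end would then look roughly like the sketch below; the create_vectorstore call is an assumption about the VectorCreator factory, whose full definition is not shown in these diffs.

# End-to-end selection sketch -- not part of the patch. It assumes VectorCreator
# exposes a create_vectorstore(type, ...) classmethod that instantiates the class
# registered under the given key, and that an Oracle instance is reachable at
# ORACLE_URI (OracleDBVectorStore connects in its constructor).
from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator

settings.VECTOR_STORE = "oracledb"  # normally set through the environment, not in code
settings.ORACLE_URI = "your_username/your_password@localhost:1521/docsgpt"

store = VectorCreator.create_vectorstore(settings.VECTOR_STORE)
print(type(store).__name__)  # -> OracleDBVectorStore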