diff --git a/.backend_env.example b/.backend_env.example index 6deab2b04614..02dfeca4402f 100644 --- a/.backend_env.example +++ b/.backend_env.example @@ -10,6 +10,9 @@ GOOGLE_CLOUD_PROJECT= CELERY_BROKER_URL=redis://redis:6379/0 CELEBRY_BROKER_QUEUE_NAME=quivr-preview.fifo +#LOCAL +#OLLAMA_API_BASE_URL=http://host.docker.internal:11434 # local all in one remove comment to use local llm with Ollama + #RESEND diff --git a/backend/llm/api_brain_qa.py b/backend/llm/api_brain_qa.py index e3fdb5103bd3..d2352d844b76 100644 --- a/backend/llm/api_brain_qa.py +++ b/backend/llm/api_brain_qa.py @@ -3,6 +3,7 @@ from uuid import UUID from fastapi import HTTPException +from logger import get_logger from litellm import completion from models.chats import ChatQuestion from models.databases.supabase.chats import CreateChatHistory @@ -17,6 +18,7 @@ get_api_brain_definition_as_json_schema, ) +logger = get_logger(__name__) class APIBrainQA( QABaseBrainPicking, @@ -53,7 +55,6 @@ async def make_completion( brain_id: UUID, ): yield "🧠🧠" - response = completion( model=self.model, temperature=self.temperature, @@ -73,8 +74,7 @@ async def make_completion( if finish_reason == "stop": break - - if "function_call" in chunk.choices[0].delta: + if "function_call" in chunk.choices[0].delta and chunk.choices[0].delta["function_call"]: if "name" in chunk.choices[0].delta["function_call"]: function_call["name"] = chunk.choices[0].delta["function_call"][ "name" @@ -86,10 +86,12 @@ async def make_completion( elif finish_reason == "function_call": try: + logger.info(f"Function call: {function_call}") arguments = json.loads(function_call["arguments"]) + except Exception: arguments = {} - yield f"🧠🧠" + yield f"🧠🧠" try: api_call_response = call_brain_api( @@ -106,7 +108,7 @@ async def make_completion( messages.append( { "role": "function", - "name": function_call["name"], + "name": str(brain_id), "content": api_call_response, } ) diff --git a/backend/llm/qa_base.py b/backend/llm/qa_base.py index 
376abd63a755..e99c992599ef 100644 --- a/backend/llm/qa_base.py +++ b/backend/llm/qa_base.py @@ -7,6 +7,7 @@ from langchain.chains import ConversationalRetrievalChain, LLMChain from langchain.chains.question_answering import load_qa_chain from langchain.chat_models import ChatLiteLLM +from langchain.embeddings.ollama import OllamaEmbeddings from langchain.embeddings.openai import OpenAIEmbeddings from langchain.llms.base import BaseLLM from langchain.prompts.chat import ( @@ -84,8 +85,13 @@ def _determine_callback_array( ] @property - def embeddings(self) -> OpenAIEmbeddings: - return OpenAIEmbeddings() # pyright: ignore reportPrivateUsage=none + def embeddings(self): + if self.brain_settings.ollama_api_base_url: + return OllamaEmbeddings( + base_url=self.brain_settings.ollama_api_base_url + ) # pyright: ignore reportPrivateUsage=none + else: + return OpenAIEmbeddings() supabase_client: Optional[Client] = None vector_store: Optional[CustomSupabaseVectorStore] = None @@ -143,6 +149,11 @@ def _create_llm( :param callbacks: Callbacks to be used for streaming :return: Language model instance """ + api_base = None + if self.brain_settings.ollama_api_base_url and model.startswith("ollama"): + api_base = self.brain_settings.ollama_api_base_url + + return ChatLiteLLM( temperature=temperature, max_tokens=self.max_tokens, @@ -150,6 +161,7 @@ def _create_llm( streaming=streaming, verbose=False, callbacks=callbacks, + api_base= api_base ) # pyright: ignore reportPrivateUsage=none def _create_prompt_template(self): diff --git a/backend/llm/qa_headless.py b/backend/llm/qa_headless.py index 89f09b675fc1..fb2e924063e6 100644 --- a/backend/llm/qa_headless.py +++ b/backend/llm/qa_headless.py @@ -7,6 +7,7 @@ from langchain.chains import LLMChain from langchain.chat_models import ChatLiteLLM from langchain.chat_models.base import BaseChatModel +from models import BrainSettings # Importing settings related to the 'brain' from langchain.prompts.chat import ChatPromptTemplate, 
HumanMessagePromptTemplate from logger import get_logger from models.chats import ChatQuestion @@ -30,6 +31,7 @@ class HeadlessQA(BaseModel): + brain_settings = BrainSettings() model: str temperature: float = 0.0 max_tokens: int = 2000 @@ -78,13 +80,18 @@ def _create_llm( :param callbacks: Callbacks to be used for streaming :return: Language model instance """ + api_base = None + if self.brain_settings.ollama_api_base_url and model.startswith("ollama"): + api_base = self.brain_settings.ollama_api_base_url + return ChatLiteLLM( - temperature=0.1, + temperature=temperature, model=model, streaming=streaming, verbose=True, callbacks=callbacks, max_tokens=self.max_tokens, + api_base=api_base, ) def _create_prompt_template(self): diff --git a/backend/models/settings.py b/backend/models/settings.py index ad73206b5e10..dad83dd3bb05 100644 --- a/backend/models/settings.py +++ b/backend/models/settings.py @@ -1,9 +1,13 @@ -from langchain.embeddings.openai import OpenAIEmbeddings from models.databases.supabase.supabase import SupabaseDB from pydantic import BaseSettings from supabase.client import Client, create_client from vectorstore.supabase import SupabaseVectorStore +from langchain.embeddings.ollama import OllamaEmbeddings +from langchain.embeddings.openai import OpenAIEmbeddings + +from logger import get_logger +logger = get_logger(__name__) class BrainRateLimiting(BaseSettings): max_brain_per_user: int = 5 @@ -15,6 +19,7 @@ class BrainSettings(BaseSettings): supabase_service_key: str resend_api_key: str = "null" resend_email_address: str = "brain@mail.quivr.app" + ollama_api_base_url: str = None class ContactsSettings(BaseSettings): @@ -39,11 +44,14 @@ def get_supabase_db() -> SupabaseDB: return SupabaseDB(supabase_client) -def get_embeddings() -> OpenAIEmbeddings: +def get_embeddings(): settings = BrainSettings() # pyright: ignore reportPrivateUsage=none - embeddings = OpenAIEmbeddings( - openai_api_key=settings.openai_api_key - ) # pyright: ignore 
reportPrivateUsage=none + if settings.ollama_api_base_url: + embeddings = OllamaEmbeddings( + base_url=settings.ollama_api_base_url, + ) # pyright: ignore reportPrivateUsage=none + else: + embeddings = OpenAIEmbeddings() # pyright: ignore reportPrivateUsage=none return embeddings diff --git a/backend/requirements.txt b/backend/requirements.txt index eb4730a74e63..0cd93f5ad00e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,8 +1,8 @@ # pymupdf==1.22.3 -langchain==0.0.332 -litellm==0.13.2 +langchain==0.0.341 +litellm==1.7.7 # Markdown==3.4.4 -openai==0.27.8 +openai==1.1.1 GitPython==3.1.36 pdf2image==1.16.3 pypdf==3.9.0 @@ -36,3 +36,4 @@ python-dotenv pytest-mock pytest-celery pytesseract==0.3.10 +async_generator diff --git a/docs/docs/Developers/selfHosted/run_fully_local.md b/docs/docs/Developers/selfHosted/run_fully_local.md index e6d9cd3d19f6..dd3a8797f5ba 100644 --- a/docs/docs/Developers/selfHosted/run_fully_local.md +++ b/docs/docs/Developers/selfHosted/run_fully_local.md @@ -1,9 +1,9 @@ --- sidebar_position: 2 -title: Run Quivr fully locally +title: Run Quivr locally with Ollama --- -# Using Quivr fully locally +# Using Quivr fully locally with Ollama ## Headers @@ -15,13 +15,7 @@ The following is a guide to set up everything for using Quivr locally: - [Embeddings](#embeddings) - [LLM for inference](#llm) -It is a first, working setup, but a lot of work has to be done to e.g. find the appropriate settings for the model. - -Importantly, this will currently only work on tag v0.0.46. - -The guide was put together in collaboration with members of the Quivr Discord, **Using Quivr fully locally** thread. That is a good place to discuss it. - -This worked for me, but I sometimes got strange results (the output contains repeating answers/questions). Maybe because `stopping_criteria=stopping_criteria` must be uncommented in `transformers.pipeline`. Will update this page as I continue learning. 
+The guide was put together in collaboration with members of the Quivr Discord, **Using Quivr fully locally** thread. That is a good place to discuss it. https://discord.com/invite/HUpRgp2HG8 @@ -37,224 +31,107 @@ Troubleshooting: -## Local embeddings - -First, let's get local embeddings to work with GPT4All. Instead of relying on OpenAI for generating embeddings of both the prompt and the documents we upload, we will use a local LLM for this. - -Remove any existing data from the postgres database: - -- `supabase/docker $ docker compose down -v` -- `supabase/docker $ rm -rf volumes/db/data/` -- `supabase/docker $ docker compose up -d` - -Change the vector dimensions in the necessary Quivr SQL files: - -- Replace all occurrences of 1536 by 768, in Quivr's `scripts\tables.sql` -- Run tables.sql in the Supabase web ui SQL editor: http://localhost:8000 - -Change the Quivr code to use local LLM (GPT4All) and local embeddings: - -- add code to `backend\core\llm\private_gpt4all.py` - -```python - from langchain.embeddings import HuggingFaceEmbeddings - ... 
- def embeddings(self) -> HuggingFaceEmbeddings: - emb = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2", - model_kwargs={'device': 'cuda'}, - encode_kwargs={'normalize_embeddings': False} - ) - return emb -``` - -Note that there may be better models out there for generating the embeddings: https://huggingface.co/spaces/mteb/leaderboard - -Update Quivr `backend/core/.env`'s Private LLM Variables: - -``` - #Private LLM Variables - PRIVATE=True - MODEL_PATH=./local_models/ggml-gpt4all-j-v1.3-groovy.bin -``` - -Download GPT4All model: - -- `$ cd backend/core/local_models/` -- `wget https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin` - -Ensure the Quivr backend docker container has CUDA and the GPT4All package: +## Ollama -``` - FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel - #FROM python:3.11-bullseye - - ARG DEBIAN_FRONTEND=noninteractive - ENV DEBIAN_FRONTEND=noninteractive - - RUN pip install gpt4all -``` +Ollama is a tool that allows you to run LLMs locally. We are using it to run Llama2, MistralAI and others locally. -Modify the docker-compose yml file (for backend container). The following example is for using 2 GPUs: +### Install Ollama -``` - ... - network_mode: host - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 2 - capabilities: [gpu] -``` +Install Ollama from their [website](https://ollama.ai/). -Install nvidia container toolkit on the host, https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html: +Then run the following command to run Ollama in the background: +```bash +ollama run llama2 ``` -$ wget https://nvidia.github.io/nvidia-docker/gpgkey --no-check-certificate -$ sudo apt-key add gpgkey -$ distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) -$ curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list -$ sudo apt-get update - -$ sudo apt-get install -y nvidia-container-toolkit - -$ nvidia-ctk --version -$ sudo systemctl restart docker +### Update Quivr to use Ollama + +In order to have Quivr use Ollama we need to update the tables in Supabase to support the embedding format that Ollama uses. Ollama uses by default llama 2 that produces 4096 dimensional embeddings while OpenAI API produces 1536 dimensional embeddings. + + +Go to supabase and delete your table vectors and create a new table vectors with the following schema: + +```sql +CREATE TABLE IF NOT EXISTS vectors ( + id UUID DEFAULT uuid_generate_v4() PRIMARY KEY, + content TEXT, + file_sha1 TEXT, + metadata JSONB, + embedding VECTOR(4096) +); +``` + +Then run the following command to update the table: + +```sql +CREATE OR REPLACE FUNCTION match_vectors(query_embedding VECTOR(4096), match_count INT, p_brain_id UUID) +RETURNS TABLE( + id UUID, + brain_id UUID, + content TEXT, + metadata JSONB, + embedding VECTOR(4096), + similarity FLOAT +) LANGUAGE plpgsql AS $$ +#variable_conflict use_column +BEGIN + RETURN QUERY + SELECT + vectors.id, + brains_vectors.brain_id, + vectors.content, + vectors.metadata, + vectors.embedding, + 1 - (vectors.embedding <=> query_embedding) AS similarity + FROM + vectors + INNER JOIN + brains_vectors ON vectors.id = brains_vectors.vector_id + WHERE brains_vectors.brain_id = p_brain_id + ORDER BY + vectors.embedding <=> query_embedding + LIMIT match_count; +END; +$$; +``` + +This will update the match_vectors function to use the new embedding format. + + +## Add Ollama Model to Quivr + +Now that you have your model running locally, you need to add it to Quivr. + +In order to allow the user to choose between the OpenAI API and Ollama, we need to add a new model to the Quivr backend. 
+ +Go to supabase and in the table `user_settings` either add by default or to your user the following value to the `models` column: + +```json +[ + "gpt-3.5-turbo-1106", + "ollama/llama2" +] ``` -At this moment, if we try to upload a pdf, we get an error: +This will add the Ollama model to the list of models that the user can choose from. -``` -backend-core | 1989-01-01 21:51:41,211 [ERROR] utils.vectors: Error creating vector for document {'code': '22000', 'details': None, 'hint': None, 'message': 'expected 768 dimensions, not 1536'} -``` +By adding this as default, it means that all new users will have this model by default. If you want to add it to your user only, you can add it to the `models` column in the `user_settings` table. If you set it as the default, you need to drop the entire table for the change to take effect, using the following command: -This can be remedied by using local embeddings for document embeddings. In backend/core/utils/vectors.py, replace: - -```python - # def create_vector(self, doc, user_openai_api_key=None): - # logger.info("Creating vector for document") - # logger.info(f"Document: {doc}") - # if user_openai_api_key: - # self.commons["documents_vector_store"]._embedding = OpenAIEmbeddings( - # openai_api_key=user_openai_api_key - # ) # pyright: ignore reportPrivateUsage=none - # try: - # sids = self.commons["documents_vector_store"].add_documents([doc]) - # if sids and len(sids) > 0: - # return sids - - # except Exception as e: - # logger.error(f"Error creating vector for document {e}") - - def create_vector(self, doc, user_openai_api_key=None): - logger.info("Creating vector for document") - logger.info(f"Document: {doc}") - self.commons["documents_vector_store"]._embedding = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2", - model_kwargs={'device': 'cuda'}, - encode_kwargs={'normalize_embeddings': False} - ) # pyright: ignore reportPrivateUsage=none - logger.info('||| creating embedding') - try: - 
sids = self.commons["documents_vector_store"].add_documents([doc]) - if sids and len(sids) > 0: - return sids - - except Exception as e: - logger.error(f"Error creating vector for document {e}") +```sql +DROP TABLE user_settings; ``` - -## Local LLM +## Env Variables -The final step is to use a local model from HuggingFace for inference. (The HF token is optional, only required for certain models on HF.) -Update the Quivr backend dockerfile: +In order to have Quivr use Ollama we need to update the env variables. -``` - ENV HUGGINGFACEHUB_API_TOKEN=hf_XXX +Go to `backend/.env` and add the following env variables: - RUN pip install accelerate +```bash +OLLAMA_API_BASE_URL=http://host.docker.internal:11434 ``` -Update the `private_gpt4all.py` file as follows: - -```python - import langchain - langchain.debug = True - langchain.verbose = True - - import os - import transformers - from langchain.llms import HuggingFacePipeline - from langchain.embeddings import HuggingFaceEmbeddings - ... - - model_id = "stabilityai/StableBeluga-13B" - ... - - def _create_llm( - self, - model, - streaming=False, - callbacks=None, - ) -> BaseLLM: - """ - Override the _create_llm method to enforce the use of a private model. - :param model: Language model name to be used. 
- :param streaming: Whether to enable streaming of the model - :param callbacks: Callbacks to be used for streaming - :return: Language model instance - """ - - model_path = self.model_path - - logger.info("Using private model: %s", model) - logger.info("Streaming is set to %s", streaming) - logger.info("--- model %s",model) - - logger.info("--- model path %s",model_path) - - model_id = "stabilityai/StableBeluga-13B" - - llm = transformers.AutoModelForCausalLM.from_pretrained( - model_id, - use_cache=True, - load_in_4bit=True, - device_map='auto', - #use_auth_token=hf_auth - ) - logger.info('<<< transformers.AutoModelForCausalLM.from_pretrained') - - llm.eval() - logger.info('<<< eval') - - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_id, - use_auth_token=hf_auth - ) - logger.info('<<< transformers.AutoTokenizer.from_pretrained') - - generate_text = transformers.pipeline( - model=llm, tokenizer=tokenizer, - return_full_text=True, # langchain expects the full text - task='text-generation', - # we pass model parameters here too - #stopping_criteria=stopping_criteria, # without this model rambles during chat - temperature=0.5, # 'randomness' of outputs, 0.0 is the min and 1.0 the max - max_new_tokens=512, # mex number of tokens to generate in the output - repetition_penalty=1.1 # without this output begins repeating - ) - logger.info('<<< generate_text = transformers.pipeline(') - - result = HuggingFacePipeline(pipeline=generate_text) - - logger.info('<<< generate_text = transformers.pipeline(') - - logger.info("<<< created llm HuggingFace") - return result -``` +Then open Quivr and you are good to go. 
\ No newline at end of file diff --git a/frontend/sentry.client.config.ts b/frontend/sentry.client.config.ts index bb2cef54f314..c322e02daaff 100644 --- a/frontend/sentry.client.config.ts +++ b/frontend/sentry.client.config.ts @@ -13,12 +13,12 @@ if (SENTRY_DSN) { dsn: SENTRY_DSN, // Adjust this value in production, or use tracesSampler for greater control - tracesSampleRate: 1, + sampleRate: 0.1, // Setting this option to true will print useful information to the console while you're setting up Sentry. debug: false, - replaysOnErrorSampleRate: 1.0, + replaysOnErrorSampleRate: 0.1, // This sets the sample rate to be 10%. You may want this to be 100% while // in development and sample at a lower rate in production diff --git a/frontend/sentry.edge.config.ts b/frontend/sentry.edge.config.ts index 0805f5be2a9d..c8eee2c86d7d 100644 --- a/frontend/sentry.edge.config.ts +++ b/frontend/sentry.edge.config.ts @@ -9,7 +9,9 @@ if (SENTRY_DSN) { dsn: SENTRY_DSN, // Adjust this value in production, or use tracesSampler for greater control - tracesSampleRate: 1, + tracesSampleRate: 0.05, + sampleRate: 0.05, + // Setting this option to true will print useful information to the console while you're setting up Sentry. debug: false, diff --git a/frontend/sentry.server.config.ts b/frontend/sentry.server.config.ts index 0805f5be2a9d..8fe24ef11ac4 100644 --- a/frontend/sentry.server.config.ts +++ b/frontend/sentry.server.config.ts @@ -9,7 +9,9 @@ if (SENTRY_DSN) { dsn: SENTRY_DSN, // Adjust this value in production, or use tracesSampler for greater control - tracesSampleRate: 1, + tracesSampleRate: 0.1, + sampleRate: 0.1, + // Setting this option to true will print useful information to the console while you're setting up Sentry. debug: false,