Merge branch 'main' into feat/remove_Pydantic_V1
jacopo-chevallard committed Dec 16, 2024
2 parents 7e34d67 + 09b4811 commit d51a0c1
Showing 8 changed files with 156 additions and 50 deletions.
2 changes: 1 addition & 1 deletion .release-please-manifest.json
@@ -1,3 +1,3 @@
{
"core": "0.0.25"
"core": "0.0.27"
}
25 changes: 25 additions & 0 deletions core/CHANGELOG.md
@@ -1,5 +1,30 @@
# Changelog

## [0.0.27](https://github.com/QuivrHQ/quivr/compare/core-0.0.26...core-0.0.27) (2024-12-16)


### Features

* ensuring that max_context_tokens is never larger than what is supported by the models ([#3519](https://github.com/QuivrHQ/quivr/issues/3519)) ([d6e0ed4](https://github.com/QuivrHQ/quivr/commit/d6e0ed44df0ee7edafea85f704a15fd99969bafd))
* send all to megaparse_sdk ([#3521](https://github.com/QuivrHQ/quivr/issues/3521)) ([e48044d](https://github.com/QuivrHQ/quivr/commit/e48044d36ffda613f65da24641ed8da290195177))


### Bug Fixes

* fixing errors arising when the user input contains no tasks ([#3525](https://github.com/QuivrHQ/quivr/issues/3525)) ([e28f7bc](https://github.com/QuivrHQ/quivr/commit/e28f7bcb9ab9534bc011664525ae1f9c2cf6393e))

## [0.0.26](https://github.com/QuivrHQ/quivr/compare/core-0.0.25...core-0.0.26) (2024-12-10)


### Features

* first version (V0) of the Workflow Management System ([#3493](https://github.com/QuivrHQ/quivr/issues/3493)) ([6450a49](https://github.com/QuivrHQ/quivr/commit/6450a494e3efa8e8c267ca49aa0a7ec682586b4e))


### Bug Fixes

* dealing with empty tool_calls ([#3514](https://github.com/QuivrHQ/quivr/issues/3514)) ([e2f6389](https://github.com/QuivrHQ/quivr/commit/e2f6389189d911a382b2236ab39f28a1270528ac))

## [0.0.25](https://github.com/QuivrHQ/quivr/compare/core-0.0.24...core-0.0.25) (2024-11-28)


8 changes: 3 additions & 5 deletions core/pyproject.toml
@@ -1,10 +1,8 @@
[project]
name = "quivr-core"
version = "0.0.25"
version = "0.0.27"
description = "Quivr core RAG package"
authors = [
{ name = "Stan Girard", email = "[email protected]" }
]
authors = [{ name = "Stan Girard", email = "[email protected]" }]
dependencies = [
"pydantic>=2.8.2",
"langchain-core>=0.3,<0.4",
@@ -23,7 +21,7 @@ dependencies = [
"faiss-cpu>=1.8.0.post1",
"rapidfuzz>=3.10.1",
"markupsafe>=2.1.5",
"megaparse-sdk==0.1.7",
"megaparse-sdk>=0.1.9",
"langchain-mistralai>=0.2.3",
]
readme = "README.md"
3 changes: 1 addition & 2 deletions core/quivr_core/processor/implementations/megaparse_processor.py
@@ -31,6 +31,7 @@ class MegaparseProcessor(ProcessorBase):
"""

supported_extensions = [
FileExtension.txt,
FileExtension.pdf,
FileExtension.docx,
FileExtension.doc,
@@ -42,11 +43,9 @@ class MegaparseProcessor(ProcessorBase):
FileExtension.bib,
FileExtension.odt,
FileExtension.html,
FileExtension.py,
FileExtension.markdown,
FileExtension.md,
FileExtension.mdx,
FileExtension.ipynb,
]

def __init__(
10 changes: 9 additions & 1 deletion core/quivr_core/processor/registry.py
@@ -124,13 +124,21 @@ def defaults_to_proc_entries(
_append_proc_mapping(
mapping=base_processors,
file_exts=[
FileExtension.txt,
FileExtension.pdf,
FileExtension.xls,
FileExtension.docx,
FileExtension.doc,
FileExtension.pptx,
FileExtension.xls,
FileExtension.xlsx,
FileExtension.csv,
FileExtension.epub,
FileExtension.bib,
FileExtension.odt,
FileExtension.html,
FileExtension.markdown,
FileExtension.md,
FileExtension.mdx,
],
cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
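For orientation, here is a minimal, self-contained sketch of what such an extension-to-processor default mapping expresses; the `ProcEntry` dataclass and `append_proc_mapping` helper below are simplified stand-ins, not the actual quivr-core registry API.

```python
# Simplified, hypothetical sketch of an extension-to-processor default mapping;
# quivr-core's real registry types are not shown in this diff.
from dataclasses import dataclass


@dataclass
class ProcEntry:
    cls_mod: str  # dotted path to the processor class, imported lazily
    errtxt: str   # message raised if the optional dependency is missing


def append_proc_mapping(
    mapping: dict[str, list[ProcEntry]],
    file_exts: list[str],
    cls_mod: str,
    errtxt: str,
) -> None:
    # Register the same processor entry for every listed extension.
    for ext in file_exts:
        mapping.setdefault(ext, []).append(ProcEntry(cls_mod=cls_mod, errtxt=errtxt))


base_processors: dict[str, list[ProcEntry]] = {}
append_proc_mapping(
    mapping=base_processors,
    file_exts=[".txt", ".pdf", ".md"],
    cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
    errtxt="can't import MegaparseProcessor",
)
print(base_processors[".txt"][0].cls_mod)
```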
133 changes: 100 additions & 33 deletions core/quivr_core/rag/entities/config.py
@@ -75,89 +75,139 @@ class DefaultModelSuppliers(str, Enum):


class LLMConfig(QuivrBaseConfig):
context: int | None = None
max_context_tokens: int | None = None
max_output_tokens: int | None = None
tokenizer_hub: str | None = None


class LLMModelConfig:
_model_defaults: Dict[DefaultModelSuppliers, Dict[str, LLMConfig]] = {
DefaultModelSuppliers.OPENAI: {
"gpt-4o": LLMConfig(context=128000, tokenizer_hub="Xenova/gpt-4o"),
"gpt-4o-mini": LLMConfig(context=128000, tokenizer_hub="Xenova/gpt-4o"),
"gpt-4-turbo": LLMConfig(context=128000, tokenizer_hub="Xenova/gpt-4"),
"gpt-4": LLMConfig(context=8192, tokenizer_hub="Xenova/gpt-4"),
"gpt-4o": LLMConfig(
max_context_tokens=128000,
max_output_tokens=16384,
tokenizer_hub="Xenova/gpt-4o",
),
"gpt-4o-mini": LLMConfig(
max_context_tokens=128000,
max_output_tokens=16384,
tokenizer_hub="Xenova/gpt-4o",
),
"gpt-4-turbo": LLMConfig(
max_context_tokens=128000,
max_output_tokens=4096,
tokenizer_hub="Xenova/gpt-4",
),
"gpt-4": LLMConfig(
max_context_tokens=8192,
max_output_tokens=8192,
tokenizer_hub="Xenova/gpt-4",
),
"gpt-3.5-turbo": LLMConfig(
context=16385, tokenizer_hub="Xenova/gpt-3.5-turbo"
max_context_tokens=16385,
max_output_tokens=4096,
tokenizer_hub="Xenova/gpt-3.5-turbo",
),
"text-embedding-3-large": LLMConfig(
context=8191, tokenizer_hub="Xenova/text-embedding-ada-002"
max_context_tokens=8191, tokenizer_hub="Xenova/text-embedding-ada-002"
),
"text-embedding-3-small": LLMConfig(
context=8191, tokenizer_hub="Xenova/text-embedding-ada-002"
max_context_tokens=8191, tokenizer_hub="Xenova/text-embedding-ada-002"
),
"text-embedding-ada-002": LLMConfig(
context=8191, tokenizer_hub="Xenova/text-embedding-ada-002"
max_context_tokens=8191, tokenizer_hub="Xenova/text-embedding-ada-002"
),
},
DefaultModelSuppliers.ANTHROPIC: {
"claude-3-5-sonnet": LLMConfig(
context=200000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=200000,
max_output_tokens=8192,
tokenizer_hub="Xenova/claude-tokenizer",
),
"claude-3-opus": LLMConfig(
context=200000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=200000,
max_output_tokens=4096,
tokenizer_hub="Xenova/claude-tokenizer",
),
"claude-3-sonnet": LLMConfig(
context=200000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=200000,
max_output_tokens=4096,
tokenizer_hub="Xenova/claude-tokenizer",
),
"claude-3-haiku": LLMConfig(
context=200000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=200000,
max_output_tokens=4096,
tokenizer_hub="Xenova/claude-tokenizer",
),
"claude-2-1": LLMConfig(
context=200000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=200000,
max_output_tokens=4096,
tokenizer_hub="Xenova/claude-tokenizer",
),
"claude-2-0": LLMConfig(
context=100000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=100000,
max_output_tokens=4096,
tokenizer_hub="Xenova/claude-tokenizer",
),
"claude-instant-1-2": LLMConfig(
context=100000, tokenizer_hub="Xenova/claude-tokenizer"
max_context_tokens=100000,
max_output_tokens=4096,
tokenizer_hub="Xenova/claude-tokenizer",
),
},
# Unclear for LLAMA models...
# see https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct/discussions/6
DefaultModelSuppliers.META: {
"llama-3.1": LLMConfig(
context=128000, tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer"
max_context_tokens=128000,
max_output_tokens=4096,
tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer",
),
"llama-3": LLMConfig(
context=8192, tokenizer_hub="Xenova/llama3-tokenizer-new"
max_context_tokens=8192,
max_output_tokens=2048,
tokenizer_hub="Xenova/llama3-tokenizer-new",
),
"llama-2": LLMConfig(context=4096, tokenizer_hub="Xenova/llama2-tokenizer"),
"code-llama": LLMConfig(
context=16384, tokenizer_hub="Xenova/llama-code-tokenizer"
max_context_tokens=16384, tokenizer_hub="Xenova/llama-code-tokenizer"
),
},
DefaultModelSuppliers.GROQ: {
"llama-3.1": LLMConfig(
context=128000, tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer"
"llama-3.3-70b": LLMConfig(
max_context_tokens=128000,
max_output_tokens=32768,
tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer",
),
"llama-3.1-70b": LLMConfig(
max_context_tokens=128000,
max_output_tokens=32768,
tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer",
),
"llama-3": LLMConfig(
context=8192, tokenizer_hub="Xenova/llama3-tokenizer-new"
max_context_tokens=8192, tokenizer_hub="Xenova/llama3-tokenizer-new"
),
"llama-2": LLMConfig(context=4096, tokenizer_hub="Xenova/llama2-tokenizer"),
"code-llama": LLMConfig(
context=16384, tokenizer_hub="Xenova/llama-code-tokenizer"
max_context_tokens=16384, tokenizer_hub="Xenova/llama-code-tokenizer"
),
},
DefaultModelSuppliers.MISTRAL: {
"mistral-large": LLMConfig(
context=128000, tokenizer_hub="Xenova/mistral-tokenizer-v3"
max_context_tokens=128000,
max_output_tokens=4096,
tokenizer_hub="Xenova/mistral-tokenizer-v3",
),
"mistral-small": LLMConfig(
context=128000, tokenizer_hub="Xenova/mistral-tokenizer-v3"
max_context_tokens=128000,
max_output_tokens=4096,
tokenizer_hub="Xenova/mistral-tokenizer-v3",
),
"mistral-nemo": LLMConfig(
context=128000, tokenizer_hub="Xenova/Mistral-Nemo-Instruct-Tokenizer"
max_context_tokens=128000,
max_output_tokens=4096,
tokenizer_hub="Xenova/Mistral-Nemo-Instruct-Tokenizer",
),
"codestral": LLMConfig(
context=32000, tokenizer_hub="Xenova/mistral-tokenizer-v3"
max_context_tokens=32000, tokenizer_hub="Xenova/mistral-tokenizer-v3"
),
},
}
@@ -193,13 +243,12 @@ def get_llm_model_config(
class LLMEndpointConfig(QuivrBaseConfig):
supplier: DefaultModelSuppliers = DefaultModelSuppliers.OPENAI
model: str = "gpt-4o"
context_length: int | None = None
tokenizer_hub: str | None = None
llm_base_url: str | None = None
env_variable_name: str | None = None
llm_api_key: str | None = None
max_context_tokens: int = 2000
max_output_tokens: int = 2000
max_context_tokens: int = 10000
max_output_tokens: int = 4000
temperature: float = 0.7
streaming: bool = True
prompt: CustomPromptsModel | None = None
@@ -240,7 +289,25 @@ def set_llm_model_config(self):
self.supplier, self.model
)
if llm_model_config:
self.context_length = llm_model_config.context
if llm_model_config.max_context_tokens:
_max_context_tokens = (
llm_model_config.max_context_tokens
- llm_model_config.max_output_tokens
if llm_model_config.max_output_tokens
else llm_model_config.max_context_tokens
)
if self.max_context_tokens > _max_context_tokens:
logger.warning(
f"Lowering max_context_tokens from {self.max_context_tokens} to {_max_context_tokens}"
)
self.max_context_tokens = _max_context_tokens
if llm_model_config.max_output_tokens:
if self.max_output_tokens > llm_model_config.max_output_tokens:
logger.warning(
f"Lowering max_output_tokens from {self.max_output_tokens} to {llm_model_config.max_output_tokens}"
)
self.max_output_tokens = llm_model_config.max_output_tokens

self.tokenizer_hub = llm_model_config.tokenizer_hub

def set_llm_model(self, model: str):
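The hunk above caps the user-requested token budgets using the per-model defaults. As a rough illustration of the arithmetic, here is a standalone sketch (the `clamp_token_limits` helper is hypothetical, not the quivr-core API; the values are gpt-4o's defaults of 128000 context and 16384 output tokens from the table above):

```python
# Standalone sketch of the clamping logic added in set_llm_model_config above;
# the helper name and signature are hypothetical, the arithmetic mirrors the diff.
def clamp_token_limits(
    requested_context: int,
    requested_output: int,
    model_max_context: int | None,
    model_max_output: int | None,
) -> tuple[int, int]:
    """Lower the requested limits so they never exceed what the model supports."""
    if model_max_context:
        # Leave room in the context window for the model's own output tokens.
        ceiling = (
            model_max_context - model_max_output
            if model_max_output
            else model_max_context
        )
        requested_context = min(requested_context, ceiling)
    if model_max_output:
        requested_output = min(requested_output, model_max_output)
    return requested_context, requested_output


# gpt-4o defaults: 128000 context, 16384 output.
print(clamp_token_limits(200000, 32000, 128000, 16384))  # -> (111616, 16384)
```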
8 changes: 4 additions & 4 deletions core/quivr_core/rag/quivr_rag_langgraph.py
@@ -71,11 +71,11 @@ class SplittedInput(BaseModel):
class TasksCompletion(BaseModel):
is_task_completable_reasoning: Optional[str] = Field(
default=None,
description="The reasoning that leads to identifying whether the user task or question can be completed using the provided context and chat history.",
description="The reasoning that leads to identifying whether the user task or question can be completed using the provided context and chat history BEFORE any tool is used.",
)

is_task_completable: bool = Field(
description="Whether the user task or question can be completed using the provided context and chat history.",
description="Whether the user task or question can be completed using the provided context and chat history BEFORE any tool is used.",
)

tool_reasoning: Optional[str] = Field(
@@ -667,7 +667,7 @@ async def dynamic_retrieve(self, state: AgentState) -> AgentState:
MAX_ITERATIONS = 3

tasks = state["tasks"]
if not tasks.has_tasks():
if not tasks or not tasks.has_tasks():
return {**state}

k = self.retrieval_config.k
@@ -1031,7 +1031,7 @@ def _build_rag_prompt_inputs(
return {
"context": combine_documents(docs) if docs else "None",
"question": user_question,
"rephrased_task": state["tasks"].definitions,
"rephrased_task": state["tasks"].definitions if state["tasks"] else "None",
"custom_instructions": prompt if prompt else "None",
"files": files if files else "None",
"chat_history": state["chat_history"].to_list(),
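Both hunks above address the case where the graph state carries no tasks (the 0.0.27 fix for user input that contains no tasks): `state["tasks"]` can be empty or None, so it is now checked before `.has_tasks()` or `.definitions` is accessed. A minimal sketch of the guard pattern follows; the `Tasks` stand-in is hypothetical, not the actual quivr-core class.

```python
# Hypothetical stand-in illustrating why the extra truthiness check matters.
class Tasks:
    def __init__(self, definitions: list[str]):
        self.definitions = definitions

    def has_tasks(self) -> bool:
        return bool(self.definitions)


for tasks in (None, Tasks([]), Tasks(["summarize the document"])):
    # The old check, `if not tasks.has_tasks():`, raised AttributeError when tasks was None.
    if not tasks or not tasks.has_tasks():
        print("no tasks; returning state unchanged")
    else:
        print("retrieving documents for:", tasks.definitions)
```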
17 changes: 13 additions & 4 deletions core/quivr_core/rag/utils.py
@@ -96,13 +96,18 @@ def parse_chunk_response(
"""
rolling_msg += raw_chunk

if not supports_func_calling or not rolling_msg.tool_calls:
tool_calls = rolling_msg.tool_calls

if not supports_func_calling or not tool_calls:
new_content = raw_chunk.content # Just the new chunk's content
full_content = rolling_msg.content # The full accumulated content
return rolling_msg, new_content, full_content

current_answers = get_answers_from_tool_calls(rolling_msg.tool_calls)
current_answers = get_answers_from_tool_calls(tool_calls)
full_answer = "\n\n".join(current_answers)
if not full_answer:
full_answer = previous_content

new_content = full_answer[len(previous_content) :]

return rolling_msg, new_content, full_answer
@@ -111,8 +116,12 @@ def get_answers_from_tool_calls(tool_calls):
def get_answers_from_tool_calls(tool_calls):
answers = []
for tool_call in tool_calls:
if tool_call.get("name") == "cited_answer" and "args" in tool_call:
answers.append(tool_call["args"].get("answer", ""))
if tool_call.get("name") == "cited_answer":
args = tool_call.get("args", {})
if isinstance(args, dict):
answers.append(args.get("answer", ""))
else:
logger.warning(f"Expected dict for tool_call args, got {type(args)}")
return answers


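The hardened `get_answers_from_tool_calls` now tolerates tool calls whose `args` field is missing or not a dict instead of assuming a well-formed payload. A small sketch of the behaviour with assumed input shapes (the tool-call dicts below are illustrative, not captured from a real model response):

```python
# Illustrative tool-call payloads; shapes are assumed, not taken from a real LLM response.
tool_calls = [
    {"name": "cited_answer", "args": {"answer": "Paris"}},  # well-formed
    {"name": "cited_answer", "args": "not-a-dict"},         # malformed args, now skipped
    {"name": "other_tool", "args": {"answer": "ignored"}},  # different tool, ignored
]

answers = []
for tool_call in tool_calls:
    if tool_call.get("name") == "cited_answer":
        args = tool_call.get("args", {})
        if isinstance(args, dict):
            answers.append(args.get("answer", ""))

print(answers)  # -> ['Paris']
```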
