docs(core): init (#3365)
# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: aminediro <[email protected]>
Co-authored-by: Jacopo Chevallard <[email protected]>
Co-authored-by: chloedia <[email protected]>
Co-authored-by: AmineDiro <[email protected]>
5 people authored Oct 11, 2024
1 parent 6c2858f commit bb572a2
Showing 19 changed files with 743 additions and 43 deletions.
24 changes: 24 additions & 0 deletions backend/core/quivr_core/base_config.py
@@ -5,10 +5,34 @@


class QuivrBaseConfig(BaseModel):
"""
Base configuration class for Quivr.
This class extends Pydantic's BaseModel and provides a foundation for
configuration management in quivr-core.
Attributes:
model_config (ConfigDict): Configuration for the Pydantic model.
It's set to forbid extra attributes, ensuring strict adherence
to the defined schema.
Class Methods:
from_yaml: Create an instance of the class from a YAML file.
"""

model_config = ConfigDict(extra="forbid")

@classmethod
def from_yaml(cls, file_path: str | Path):
"""
Create an instance of the class from a YAML file.
Args:
file_path (str | Path): The path to the YAML file.
Returns:
QuivrBaseConfig: An instance of the class initialized with the data from the YAML file.
"""
# Load the YAML file
with open(file_path, "r") as stream:
config_data = yaml.safe_load(stream)
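Below is a minimal usage sketch of `from_yaml`; the config subclass, its fields, and the YAML contents are hypothetical and only illustrate the pattern:

```python
from pathlib import Path

from quivr_core.base_config import QuivrBaseConfig


class MyRetrievalConfig(QuivrBaseConfig):
    # Hypothetical fields, for illustration only.
    max_results: int = 5
    rerank: bool = False


# Assuming config.yaml contains:
#   max_results: 10
#   rerank: true
config = MyRetrievalConfig.from_yaml(Path("config.yaml"))
print(config.max_results)  # -> 10

# Because extra attributes are forbidden, an unknown key in the YAML file
# raises a pydantic ValidationError instead of being silently accepted.
```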
183 changes: 183 additions & 0 deletions backend/core/quivr_core/brain/brain.py
@@ -46,6 +46,24 @@
async def process_files(
storage: StorageBase, skip_file_error: bool, **processor_kwargs: dict[str, Any]
) -> list[Document]:
"""
Process files in storage.
This function takes a StorageBase and returns a list of LangChain documents.
Args:
storage (StorageBase): The storage containing the files to process.
skip_file_error (bool): Whether to skip files that cannot be processed.
processor_kwargs (dict[str, Any]): Additional arguments for the processor.
Returns:
list[Document]: List of processed documents in the Langchain Document format.
Raises:
ValueError: If a file cannot be processed and skip_file_error is False.
Exception: If no processor is found for a file of a specific type and skip_file_error is False.
"""

knowledge = []
for file in await storage.get_files():
try:
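As a hedged sketch of how `process_files` might be called on its own (the `LocalStorage` import path and constructor are assumptions; in practice the storage must already contain uploaded files with registered processors for their types):

```python
import asyncio

# Import paths below are assumptions for illustration; adapt to your setup.
from quivr_core.brain.brain import process_files
from quivr_core.storage.local_storage import LocalStorage


async def main():
    storage = LocalStorage()  # assumed to already hold the uploaded files
    documents = await process_files(storage=storage, skip_file_error=True)
    for doc in documents:
        # Each item is a LangChain Document with page_content and metadata.
        print(doc.metadata, doc.page_content[:80])


asyncio.run(main())
```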
@@ -71,6 +89,36 @@ async def process_files(


class Brain:
"""
A class representing a Brain.
This class allows for the creation of a Brain, which is a collection of knowledge one wants to retrieve information from.
A Brain is set up to:
* Store files in the storage of your choice (local, S3, etc.)
* Process the files in the storage to extract text and metadata in a wide range of formats.
* Store the processed files in the vector store of your choice (FAISS, PGVector, etc.) - defaults to FAISS.
* Create an index of the processed files.
* Use the *Quivr* workflow for retrieval augmented generation.
A Brain is able to:
* Search for information in the vector store.
* Answer questions about the knowledge in the Brain.
* Stream the answer to the question.
Attributes:
name (str): The name of the brain.
id (UUID): The unique identifier of the brain.
storage (StorageBase): The storage used to store the files.
llm (LLMEndpoint): The language model used to generate the answer.
vector_db (VectorStore): The vector store used to store the processed files.
embedder (Embeddings): The embeddings used to create the index of the processed files.
"""

def __init__(
self,
*,
@@ -106,6 +154,22 @@ def print_info(self):

@classmethod
def load(cls, folder_path: str | Path) -> Self:
"""
Load a brain from a folder path.
Args:
folder_path (str | Path): The path to the folder containing the brain.
Returns:
Brain: The brain loaded from the folder path.
Example:
```python
brain_loaded = Brain.load("path/to/brain")
brain_loaded.print_info()
```
"""
if isinstance(folder_path, str):
folder_path = Path(folder_path)
if not folder_path.exists():
@@ -154,6 +218,20 @@ def load(cls, folder_path: str | Path) -> Self:
)

async def save(self, folder_path: str | Path):
"""
Save the brain to a folder path.
Args:
folder_path (str | Path): The path to the folder where the brain will be saved.
Returns:
str: The path to the folder where the brain was saved.
Example:
```python
await brain.save("path/to/brain")
```
"""
if isinstance(folder_path, str):
folder_path = Path(folder_path)
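A hedged persistence round trip combining `save` and `load`, assuming a brain has already been built (for example with `afrom_files`) and the top-level `Brain` import is available:

```python
import asyncio

from quivr_core import Brain  # import path assumed; adapt if needed


async def main():
    brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf"])
    await brain.save("path/to/brain")

    # Later, restore the brain from disk without reprocessing the files.
    brain_loaded = Brain.load("path/to/brain")
    brain_loaded.print_info()


asyncio.run(main())
```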

@@ -247,6 +325,28 @@ async def afrom_files(
skip_file_error: bool = False,
processor_kwargs: dict[str, Any] | None = None,
):
"""
Create a brain from a list of file paths.
Args:
name (str): The name of the brain.
file_paths (list[str | Path]): The list of file paths to add to the brain.
vector_db (VectorStore | None): The vector store used to store the processed files.
storage (StorageBase): The storage used to store the files.
llm (LLMEndpoint | None): The language model used to generate the answer.
embedder (Embeddings | None): The embeddings used to create the index of the processed files.
skip_file_error (bool): Whether to skip files that cannot be processed.
processor_kwargs (dict[str, Any] | None): Additional arguments for the processor.
Returns:
Brain: The brain created from the file paths.
Example:
```python
brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
brain.print_info()
```
"""
if llm is None:
llm = default_llm()

@@ -327,6 +427,28 @@ async def afrom_langchain_documents(
llm: LLMEndpoint | None = None,
embedder: Embeddings | None = None,
) -> Self:
"""
Create a brain from a list of langchain documents.
Args:
name (str): The name of the brain.
langchain_documents (list[Document]): The list of langchain documents to add to the brain.
vector_db (VectorStore | None): The vector store used to store the processed files.
storage (StorageBase): The storage used to store the files.
llm (LLMEndpoint | None): The language model used to generate the answer.
embedder (Embeddings | None): The embeddings used to create the index of the processed files.
Returns:
Brain: The brain created from the langchain documents.
Example:
```python
from langchain_core.documents import Document
documents = [Document(page_content="Hello, world!")]
brain = await Brain.afrom_langchain_documents(name="My Brain", langchain_documents=documents)
brain.print_info()
```
"""
if llm is None:
llm = default_llm()

@@ -357,6 +479,26 @@ async def asearch(
filter: Callable | Dict[str, Any] | None = None,
fetch_n_neighbors: int = 20,
) -> list[SearchResult]:
"""
Search for relevant documents in the brain based on a query.
Args:
query (str | Document): The query to search for.
n_results (int): The number of results to return.
filter (Callable | Dict[str, Any] | None): The filter to apply to the search.
fetch_n_neighbors (int): The number of neighbors to fetch.
Returns:
list[SearchResult]: The list of retrieved chunks.
Example:
```python
brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
results = await brain.asearch("Why does everybody love Quivr?")
for result in results:
print(result.chunk.page_content)
```
"""
if not self.vector_db:
raise ValueError("No vector db configured for this brain")

@@ -383,6 +525,26 @@ def ask(
list_files: list[QuivrKnowledge] | None = None,
chat_history: ChatHistory | None = None,
) -> ParsedRAGResponse:
"""
Ask a question to the brain and get a generated answer.
Args:
question (str): The question to ask.
retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs).
rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use.
list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline.
chat_history (ChatHistory | None): The chat history to use.
Returns:
ParsedRAGResponse: The generated answer.
Example:
```python
brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
answer = brain.ask("What is the meaning of life?")
print(answer.answer)
```
"""
llm = self.llm

# If you passed a different llm model we'll override the brain one
@@ -420,6 +582,27 @@ async def ask_streaming(
list_files: list[QuivrKnowledge] | None = None,
chat_history: ChatHistory | None = None,
) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]:
"""
Ask a question to the brain and get a streamed generated answer.
Args:
question (str): The question to ask.
retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs).
rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use.
list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline.
chat_history (ChatHistory | None): The chat history to use.
Returns:
AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: The streamed generated answer.
Example:
```python
brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
async for chunk in brain.ask_streaming("What is the meaning of life?"):
print(chunk.answer)
```
"""
llm = self.llm

# If you passed a different llm model we'll override the brain one
44 changes: 37 additions & 7 deletions backend/core/quivr_core/chat.py
@@ -10,21 +10,35 @@

class ChatHistory:
"""
ChatHistory is a class that maintains a record of chat conversations. Each message
in the history is represented by an instance of the `ChatMessage` class, and the
chat history is stored internally as a list of these `ChatMessage` objects.
The class provides methods to retrieve, append, iterate, and manipulate the chat
history, as well as utilities to convert the messages into specific formats
and support deep copying.
"""

def __init__(self, chat_id: UUID, brain_id: UUID | None) -> None:
"""Init a new ChatHistory object.
Args:
chat_id (UUID): A unique identifier for the chat session.
brain_id (UUID | None): An optional identifier for the brain associated with the chat.
"""
self.id = chat_id
self.brain_id = brain_id
# TODO(@aminediro): maybe use a deque() instead ?
self._msgs: list[ChatMessage] = []

def get_chat_history(self, newest_first: bool = False):
"""Returns a ChatMessage list sorted by time
"""
Retrieves the chat history, optionally sorted in reverse chronological order.
Args:
newest_first (bool, optional): If True, returns the messages in reverse order (newest first). Defaults to False.
Returns:
List[ChatMessage]: A sorted list of chat messages.
"""
history = sorted(self._msgs, key=lambda msg: msg.message_time)
if newest_first:
@@ -38,7 +52,11 @@ def append(
self, langchain_msg: AIMessage | HumanMessage, metadata: dict[str, Any] = {}
):
"""
Appends a new message to the chat history.
Args:
langchain_msg (AIMessage | HumanMessage): The message content (either an AI or Human message).
metadata (dict[str, Any], optional): Additional metadata related to the message. Defaults to an empty dictionary.
"""
chat_msg = ChatMessage(
chat_id=self.id,
@@ -52,7 +70,13 @@ def append(

def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
"""
Iterates over the chat history in pairs, returning a HumanMessage followed by an AIMessage.
Yields:
Tuple[HumanMessage, AIMessage]: Pairs of human and AI messages.
Raises:
AssertionError: If the messages in the pair are not in the expected order (i.e., a HumanMessage followed by an AIMessage).
"""
# Reverse the chat_history, newest first
it = iter(self.get_chat_history(newest_first=True))
@@ -66,7 +90,13 @@ def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
yield (human_message.msg, ai_message.msg)

def to_list(self) -> List[HumanMessage | AIMessage]:
"""Format the chat history into a list of HumanMessage and AIMessage"""
"""
Converts the chat history into a list of raw HumanMessage or AIMessage objects.
Returns:
list[HumanMessage | AIMessage]: A list of messages in their raw form, without metadata.
"""

return [_msg.msg for _msg in self._msgs]

def __deepcopy__(self, memo):
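To make the `ChatHistory` API above concrete, here is a hedged usage sketch; the `quivr_core.chat` import path matches the file shown in this diff, while the message contents are invented for illustration:

```python
from uuid import uuid4

from langchain_core.messages import AIMessage, HumanMessage

from quivr_core.chat import ChatHistory

history = ChatHistory(chat_id=uuid4(), brain_id=None)
history.append(HumanMessage(content="What is Quivr?"))
history.append(AIMessage(content="Quivr is an open-source RAG framework."))

# Oldest first by default; pass newest_first=True to reverse the order.
for chat_msg in history.get_chat_history():
    print(type(chat_msg.msg).__name__, chat_msg.msg.content)

# Iterate over (HumanMessage, AIMessage) pairs, e.g. to rebuild a prompt.
for human, ai in history.iter_pairs():
    print(human.content, "->", ai.content)
```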

