docs(core): init (#3365)
# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: aminediro <[email protected]>
Co-authored-by: Jacopo Chevallard <[email protected]>
Co-authored-by: chloedia <[email protected]>
Co-authored-by: AmineDiro <[email protected]>
5 people authored Oct 11, 2024
1 parent 6c2858f commit bb572a2
Showing 19 changed files with 743 additions and 43 deletions.
24 changes: 24 additions & 0 deletions backend/core/quivr_core/base_config.py
@@ -5,10 +5,34 @@


class QuivrBaseConfig(BaseModel):
"""
Base configuration class for Quivr.
This class extends Pydantic's BaseModel and provides a foundation for
configuration management in quivr-core.
Attributes:
model_config (ConfigDict): Configuration for the Pydantic model.
It's set to forbid extra attributes, ensuring strict adherence
to the defined schema.
Class Methods:
from_yaml: Create an instance of the class from a YAML file.
"""

model_config = ConfigDict(extra="forbid")

@classmethod
def from_yaml(cls, file_path: str | Path):
"""
Create an instance of the class from a YAML file.
Args:
file_path (str | Path): The path to the YAML file.
Returns:
QuivrBaseConfig: An instance of the class initialized with the data from the YAML file.
"""
# Load the YAML file
with open(file_path, "r") as stream:
config_data = yaml.safe_load(stream)
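Below is a minimal usage sketch of `from_yaml`; the config subclass, its fields, and the YAML contents are hypothetical and only illustrate the pattern:

```python
from pathlib import Path

from quivr_core.base_config import QuivrBaseConfig


class MyRetrievalConfig(QuivrBaseConfig):
    # Hypothetical fields, for illustration only.
    max_results: int = 5
    rerank: bool = False


# Assuming config.yaml contains:
#   max_results: 10
#   rerank: true
config = MyRetrievalConfig.from_yaml(Path("config.yaml"))
print(config.max_results)  # -> 10

# Because extra attributes are forbidden, an unknown key in the YAML file
# raises a pydantic ValidationError instead of being silently accepted.
```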
183 changes: 183 additions & 0 deletions backend/core/quivr_core/brain/brain.py
@@ -46,6 +46,24 @@
async def process_files(
storage: StorageBase, skip_file_error: bool, **processor_kwargs: dict[str, Any]
) -> list[Document]:
"""
Process files in storage.
This function takes a StorageBase and returns a list of LangChain documents.
Args:
storage (StorageBase): The storage containing the files to process.
skip_file_error (bool): Whether to skip files that cannot be processed.
processor_kwargs (dict[str, Any]): Additional arguments for the processor.
Returns:
list[Document]: List of processed documents in the Langchain Document format.
Raises:
ValueError: If a file cannot be processed and skip_file_error is False.
Exception: If no processor is found for a file of a specific type and skip_file_error is False.
"""

knowledge = []
for file in await storage.get_files():
try:
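As a hedged sketch of how `process_files` might be called on its own (the `LocalStorage` import path and constructor are assumptions; in practice the storage must already contain uploaded files with registered processors for their types):

```python
import asyncio

# Import paths below are assumptions for illustration; adapt to your setup.
from quivr_core.brain.brain import process_files
from quivr_core.storage.local_storage import LocalStorage


async def main():
    storage = LocalStorage()  # assumed to already hold the uploaded files
    documents = await process_files(storage=storage, skip_file_error=True)
    for doc in documents:
        # Each item is a LangChain Document with page_content and metadata.
        print(doc.metadata, doc.page_content[:80])


asyncio.run(main())
```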
@@ -71,6 +89,36 @@ async def process_files(


class Brain:
"""
A class representing a Brain.
This class allows for the creation of a Brain, which is a collection of knowledge one wants to retrieve information from.
A Brain is set up to:
* Store files in the storage of your choice (local, S3, etc.)
* Process the files in the storage to extract text and metadata in a wide range of formats.
* Store the processed files in the vector store of your choice (FAISS, PGVector, etc.) - defaults to FAISS.
* Create an index of the processed files.
* Use the *Quivr* workflow for retrieval augmented generation.
A Brain is able to:
* Search for information in the vector store.
* Answer questions about the knowledge in the Brain.
* Stream the answer to the question.
Attributes:
name (str): The name of the brain.
id (UUID): The unique identifier of the brain.
storage (StorageBase): The storage used to store the files.
llm (LLMEndpoint): The language model used to generate the answer.
vector_db (VectorStore): The vector store used to store the processed files.
embedder (Embeddings): The embeddings used to create the index of the processed files.
"""

def __init__(
self,
*,
@@ -106,6 +154,22 @@ def print_info(self):

@classmethod
def load(cls, folder_path: str | Path) -> Self:
"""
Load a brain from a folder path.
Args:
folder_path (str | Path): The path to the folder containing the brain.
Returns:
Brain: The brain loaded from the folder path.
Example:
```python
brain_loaded = Brain.load("path/to/brain")
brain_loaded.print_info()
```
"""
if isinstance(folder_path, str):
folder_path = Path(folder_path)
if not folder_path.exists():
@@ -154,6 +218,20 @@ def load(cls, folder_path: str | Path) -> Self:
)

async def save(self, folder_path: str | Path):
"""
Save the brain to a folder path.
Args:
folder_path (str | Path): The path to the folder where the brain will be saved.
Returns:
str: The path to the folder where the brain was saved.
Example:
```python
await brain.save("path/to/brain")
```
"""
if isinstance(folder_path, str):
folder_path = Path(folder_path)
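A hedged persistence round trip combining `save` and `load`, assuming a brain has already been built (for example with `afrom_files`) and the top-level `Brain` import is available:

```python
import asyncio

from quivr_core import Brain  # import path assumed; adapt if needed


async def main():
    brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf"])
    await brain.save("path/to/brain")

    # Later, restore the brain from disk without reprocessing the files.
    brain_loaded = Brain.load("path/to/brain")
    brain_loaded.print_info()


asyncio.run(main())
```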

@@ -247,6 +325,28 @@ async def afrom_files(
skip_file_error: bool = False,
processor_kwargs: dict[str, Any] | None = None,
):
"""
Create a brain from a list of file paths.
Args:
name (str): The name of the brain.
file_paths (list[str | Path]): The list of file paths to add to the brain.
vector_db (VectorStore | None): The vector store used to store the processed files.
storage (StorageBase): The storage used to store the files.
llm (LLMEndpoint | None): The language model used to generate the answer.
embedder (Embeddings | None): The embeddings used to create the index of the processed files.
skip_file_error (bool): Whether to skip files that cannot be processed.
processor_kwargs (dict[str, Any] | None): Additional arguments for the processor.
Returns:
Brain: The brain created from the file paths.
Example:
```python
brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
brain.print_info()
```
"""
if llm is None:
llm = default_llm()

@@ -327,6 +427,28 @@ async def afrom_langchain_documents(
llm: LLMEndpoint | None = None,
embedder: Embeddings | None = None,
) -> Self:
"""
Create a brain from a list of langchain documents.
Args:
name (str): The name of the brain.
langchain_documents (list[Document]): The list of langchain documents to add to the brain.
vector_db (VectorStore | None): The vector store used to store the processed files.
storage (StorageBase): The storage used to store the files.
llm (LLMEndpoint | None): The language model used to generate the answer.
embedder (Embeddings | None): The embeddings used to create the index of the processed files.
Returns:
Brain: The brain created from the langchain documents.
Example:
```python
from langchain_core.documents import Document
documents = [Document(page_content="Hello, world!")]
brain = await Brain.afrom_langchain_documents(name="My Brain", langchain_documents=documents)
brain.print_info()
```
"""
if llm is None:
llm = default_llm()

@@ -357,6 +479,26 @@ async def asearch(
filter: Callable | Dict[str, Any] | None = None,
fetch_n_neighbors: int = 20,
) -> list[SearchResult]:
"""
Search for relevant documents in the brain based on a query.
Args:
query (str | Document): The query to search for.
n_results (int): The number of results to return.
filter (Callable | Dict[str, Any] | None): The filter to apply to the search.
fetch_n_neighbors (int): The number of neighbors to fetch.
Returns:
list[SearchResult]: The list of retrieved chunks.
Example:
```python
brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
results = await brain.asearch("Why does everybody love Quivr?")
for result in results:
print(result.chunk.page_content)
```
"""
if not self.vector_db:
raise ValueError("No vector db configured for this brain")

@@ -383,6 +525,26 @@ def ask(
list_files: list[QuivrKnowledge] | None = None,
chat_history: ChatHistory | None = None,
) -> ParsedRAGResponse:
"""
Ask a question to the brain and get a generated answer.
Args:
question (str): The question to ask.
retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs).
rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use.
list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline.
chat_history (ChatHistory | None): The chat history to use.
Returns:
ParsedRAGResponse: The generated answer.
Example:
```python
brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
answer = brain.ask("What is the meaning of life?")
print(answer.answer)
```
"""
llm = self.llm

# If you passed a different llm model we'll override the brain one
@@ -420,6 +582,27 @@ async def ask_streaming(
list_files: list[QuivrKnowledge] | None = None,
chat_history: ChatHistory | None = None,
) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]:
"""
Ask a question to the brain and get a streamed generated answer.
Args:
question (str): The question to ask.
retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs).
rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use.
list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline.
chat_history (ChatHistory | None): The chat history to use.
Returns:
AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: The streamed generated answer.
Example:
```python
brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
async for chunk in brain.ask_streaming("What is the meaning of life?"):
print(chunk.answer)
```
"""
llm = self.llm

# If you passed a different llm model we'll override the brain one
44 changes: 37 additions & 7 deletions backend/core/quivr_core/chat.py
@@ -10,21 +10,35 @@

class ChatHistory:
"""
ChatHistory is a class that maintains a record of chat conversations. Each message
in the history is represented by an instance of the `ChatMessage` class, and the
chat history is stored internally as a list of these `ChatMessage` objects.
The class provides methods to retrieve, append, iterate, and manipulate the chat
history, as well as utilities to convert the messages into specific formats
and support deep copying.
"""

def __init__(self, chat_id: UUID, brain_id: UUID | None) -> None:
"""Init a new ChatHistory object.
Args:
chat_id (UUID): A unique identifier for the chat session.
brain_id (UUID | None): An optional identifier for the brain associated with the chat.
"""
self.id = chat_id
self.brain_id = brain_id
# TODO(@aminediro): maybe use a deque() instead ?
self._msgs: list[ChatMessage] = []

def get_chat_history(self, newest_first: bool = False):
"""Returns a ChatMessage list sorted by time
"""
Retrieves the chat history, optionally sorted in reverse chronological order.
Args:
newest_first (bool, optional): If True, returns the messages in reverse order (newest first). Defaults to False.
Returns:
List[ChatMessage]: A sorted list of chat messages.
"""
history = sorted(self._msgs, key=lambda msg: msg.message_time)
if newest_first:
@@ -38,7 +52,11 @@ def append(
self, langchain_msg: AIMessage | HumanMessage, metadata: dict[str, Any] = {}
):
"""
Appends a new message to the chat history.
Args:
langchain_msg (AIMessage | HumanMessage): The message content (either an AI or Human message).
metadata (dict[str, Any], optional): Additional metadata related to the message. Defaults to an empty dictionary.
"""
chat_msg = ChatMessage(
chat_id=self.id,
@@ -52,7 +70,13 @@ def append(

def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
"""
Iterates over the chat history in pairs, returning a HumanMessage followed by an AIMessage.
Yields:
Tuple[HumanMessage, AIMessage]: Pairs of human and AI messages.
Raises:
AssertionError: If the messages in the pair are not in the expected order (i.e., a HumanMessage followed by an AIMessage).
"""
# Reverse the chat_history, newest first
it = iter(self.get_chat_history(newest_first=True))
@@ -66,7 +90,13 @@ def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
yield (human_message.msg, ai_message.msg)

def to_list(self) -> List[HumanMessage | AIMessage]:
"""Format the chat history into a list of HumanMessage and AIMessage"""
"""
Converts the chat history into a list of raw HumanMessage or AIMessage objects.
Returns:
list[HumanMessage | AIMessage]: A list of messages in their raw form, without metadata.
"""

return [_msg.msg for _msg in self._msgs]

def __deepcopy__(self, memo):
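To make the `ChatHistory` API above concrete, here is a hedged usage sketch; the `quivr_core.chat` import path matches the file shown in this diff, while the message contents are invented for illustration:

```python
from uuid import uuid4

from langchain_core.messages import AIMessage, HumanMessage

from quivr_core.chat import ChatHistory

history = ChatHistory(chat_id=uuid4(), brain_id=None)
history.append(HumanMessage(content="What is Quivr?"))
history.append(AIMessage(content="Quivr is an open-source RAG framework."))

# Oldest first by default; pass newest_first=True to reverse the order.
for chat_msg in history.get_chat_history():
    print(type(chat_msg.msg).__name__, chat_msg.msg.content)

# Iterate over (HumanMessage, AIMessage) pairs, e.g. to rebuild a prompt.
for human, ai in history.iter_pairs():
    print(human.content, "->", ai.content)
```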

