Commit 00ff342: Add pdf ingestion
johnshaughnessy committed Apr 15, 2024 (1 parent: b485337)
Showing 6 changed files with 58 additions and 11 deletions.
13 changes: 12 additions & 1 deletion memory_cache_hub/api/v1/ingest.py
@@ -22,10 +22,21 @@ def ingest_project_files(
     project_files = list_project_file_uploads(root_directory, project.name)
     chroma_collection = chroma_collection_for_project(chroma_client, chroma_embedding_function, project.name)
     # Delete the collection because we are going to re-ingest all the files
-    chroma_client.delete_collection(chroma_collection.name)
+    # chroma_client.delete_collection(chroma_collection.name)
     chroma_collection = chroma_collection_for_project(chroma_client, chroma_embedding_function, project.name)
     # Prepend root_directory to each project file's path
     file_paths = [os.path.join(root_directory, project_file) for project_file in project_files]
+    # Filter the file_paths such that only the files that have not been ingested are included
+    filtered_file_paths = []
+    for file_path in file_paths:
+        query_results = chroma_collection.query(query_texts=[""], where={"source_file_path": file_path})
+        if len(query_results["ids"][0]) == 0:
+            print(f"ADDING FILE {file_path}")
+            filtered_file_paths.append(file_path)
+
+
+    file_paths = filtered_file_paths
+
     fragments = fragments_from_files(file_paths, 1000, 200, chroma_embedding_function)
     if len(fragments) == 0:
         return {"status": "ok", "message": "No fragments found in the project files"}
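The filter above leans on a Chroma metadata query: if a lookup scoped to a file's source_file_path returns no ids, that file has never been ingested and gets re-added. A minimal sketch of the check, assuming fragments carry a source_file_path metadata key (as fragments_from_files stores) and using a placeholder collection name:

import chromadb

# Sketch only: in the hub, the client and collection come from
# chroma_collection_for_project; "demo_project" is a placeholder.
client = chromadb.Client()
collection = client.get_or_create_collection("demo_project")

def is_already_ingested(collection, file_path: str) -> bool:
    # The empty query text is just a carrier for the metadata filter;
    # any returned id means at least one fragment came from this file.
    results = collection.query(query_texts=[""], where={"source_file_path": file_path})
    return len(results["ids"][0]) > 0

Chroma's collection.get(where=...) would express the same lookup without embedding an empty query string; the commit mirrors the query path the endpoint already uses.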
19 changes: 16 additions & 3 deletions memory_cache_hub/api/v1/rag.py
@@ -23,6 +23,8 @@ def rag_ask(
     chroma_embedding_function = Depends(get_embedding_function),
     db=Depends(get_db)
 ):
+    print("GOT RAG ASK REQUEST:")
+    print(body)
     prompt = body.prompt
     project = db_get_project(db, body.project_id)
     chroma_collection = chroma_collection_for_project(chroma_client, chroma_embedding_function, project.name)
@@ -31,12 +33,13 @@ def rag_ask(
     big_content = ""
     big_content += f"Consider the following context:\n"
     for i, result in enumerate(query_results['metadatas'][0]):
-        if i == 2:
+        if i == 5:
             break
         file_path = result['source_file_path']
         big_content += f"----File: {file_path}\n"
         big_content += f"{query_results['documents'][0][i]}\n"

+    big_content += f"-----\n"
     big_content += f"Based on the context above, answer the following question:\n"
     big_content += f"{prompt}\n"

@@ -55,12 +58,22 @@ def rag_ask(
print("\n--------\n")
print(big_content)
print("\n--------\n")
reply = openai_compatible_completions(complete_url, complete_model, messages)
try:
print(complete_url)
reply = openai_compatible_completions(complete_url, complete_model, messages)

except Exception as e:
return RagAskResponse(
status="error",
message=str(e)
)

print(reply)
print("\n-------\n")

return RagAskResponse(
response=reply,
status="ok",
response=reply
)

@router.post("/vector_db_query", status_code=200, tags=["rag"])
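openai_compatible_completions is the hub's own wrapper, so the new try/except catches anything from a refused connection to a bad HTTP status and returns it as a structured error rather than an unhandled 500. As a rough sketch of what such a wrapper typically does against an OpenAI-compatible server (the endpoint shape and payload below are assumptions, not code from this repo):

import requests

def openai_compatible_completions_sketch(complete_url, complete_model, messages):
    # POST to an OpenAI-style chat completions endpoint; any network or
    # HTTP failure raises, landing in the caller's except branch.
    resp = requests.post(
        complete_url,
        json={"model": complete_model, "messages": messages},
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]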
4 changes: 3 additions & 1 deletion memory_cache_hub/api/v1/types.py
@@ -53,7 +53,9 @@ class RagAskRequest(BaseModel):
     prompt: str

 class RagAskResponse(BaseModel):
-    response: str
+    status: str
+    message: Optional[str] = None
+    response: Optional[str] = None

 class DownloadLlamafileByNameRequest(BaseModel):
     llamafile_filename: str
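With status required and the other two fields optional, a single model now covers both outcomes of the rag_ask endpoint. A quick illustration:

from typing import Optional
from pydantic import BaseModel

class RagAskResponse(BaseModel):
    status: str
    message: Optional[str] = None
    response: Optional[str] = None

ok = RagAskResponse(status="ok", response="The ingest ran cleanly.")
err = RagAskResponse(status="error", message="connection refused")
print(ok.response, err.message)  # a success carries response; an error carries message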
31 changes: 25 additions & 6 deletions memory_cache_hub/core/rag.py
@@ -7,9 +7,12 @@
 from typing import List
 from memory_cache_hub.core.chromadb import SentenceTransformerEmbeddingFunction
 from memory_cache_hub.core.types import FragmentMetadata, Fragment
+from pypdf import PdfReader

 import os
 import sys
+
+
 def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
     """Split text into chunks of the specified size with the specified overlap."""
     chunks = []
@@ -31,13 +34,29 @@ def fragments_from_files(file_paths: List[str],
         if file_path.endswith(".svg"):
             print(f"Skipping SVG file {file_path}.")
             continue
-        try:
-            file_content = open(file_path, encoding="utf-8").read()
-        except UnicodeDecodeError:
-            print(f"Skipping file {file_path} due to UnicodeDecodeError")
-            continue
-
-        chunks = split_text(file_content, chunk_size, chunk_overlap)
+        if file_path.endswith(".pdf"):
+            print(f"Processing PDF file {file_path}")
+            # Extract text from PDF files
+            reader = PdfReader(file_path)
+            number_of_pages = len(reader.pages)
+            chunks = []
+            for page_index in range(number_of_pages):
+                try:
+                    text = reader.pages[page_index].extract_text()
+                    page_chunks = split_text(text, chunk_size, chunk_overlap)
+                    chunks.extend(page_chunks)
+                except Exception as e:
+                    print(f"Skipping page {page_index} of file {file_path} due to error: {e}")
+                    continue
+        else:
+            try:
+                file_content = open(file_path, encoding="utf-8").read()
+                chunks = split_text(file_content, chunk_size, chunk_overlap)
+            except UnicodeDecodeError:
+                print(f"Skipping file {file_path} due to UnicodeDecodeError")
+                continue

         for i, chunk in enumerate(chunks):
             fragment = Fragment(
                 fragment_id=md5_hash(chunk),
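pypdf extracts text page by page, and extract_text() can fail on a single malformed page, which is why each page gets its own try/except rather than one guard around the whole file. A standalone sketch of the same loop, assuming a local example.pdf:

from pypdf import PdfReader

reader = PdfReader("example.pdf")  # placeholder path
for page_index, page in enumerate(reader.pages):
    try:
        text = page.extract_text()
    except Exception as e:
        print(f"Skipping page {page_index} due to error: {e}")
        continue
    print(f"Page {page_index}: {len(text)} characters extracted")

Each page's text then goes through the same split_text chunking as plain-text files (the ingest endpoint passes chunk_size=1000, chunk_overlap=200), so PDF fragments stay comparable in size.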
1 change: 1 addition & 0 deletions memory_cache_hub/llamafile/run_handle.py
@@ -47,6 +47,7 @@ async def _run(self):
         enable_gpu = False
         if enable_gpu:
             args += ["--ngl", "999"]
+        print("STARTING llamafile with args: ", args, flush=True)
         self.process = await asyncio.create_subprocess_exec(full_file_path,
                                                             *args,
                                                             stdout=asyncio.subprocess.PIPE,
1 change: 1 addition & 0 deletions requirements.txt
@@ -6,6 +6,7 @@ chromadb
 fastapi
 pathspec
 pydantic
+pypdf
 python-multipart
 sentence_transformers
 sqlmodel