Commit 00ff342: Add pdf ingestion
johnshaughnessy committed Apr 15, 2024 (1 parent: b485337)
Showing 6 changed files with 58 additions and 11 deletions.
13 changes: 12 additions & 1 deletion memory_cache_hub/api/v1/ingest.py
@@ -22,10 +22,21 @@ def ingest_project_files(
     project_files = list_project_file_uploads(root_directory, project.name)
     chroma_collection = chroma_collection_for_project(chroma_client, chroma_embedding_function, project.name)
     # Delete the collection because we are going to re-ingest all the files
-    chroma_client.delete_collection(chroma_collection.name)
+    # chroma_client.delete_collection(chroma_collection.name)
     chroma_collection = chroma_collection_for_project(chroma_client, chroma_embedding_function, project.name)
     # Prepend root_directory to each project file's path
     file_paths = [os.path.join(root_directory, project_file) for project_file in project_files]
+    # Filter the file_paths such that only the files that have not been ingested are included
+    filtered_file_paths = []
+    for file_path in file_paths:
+        query_results = chroma_collection.query(query_texts=[""], where={"source_file_path": file_path})
+        if len(query_results["ids"][0]) == 0:
+            print(f"ADDING FILE {file_path}")
+            filtered_file_paths.append(file_path)
+
+
+    file_paths = filtered_file_paths
+
     fragments = fragments_from_files(file_paths, 1000, 200, chroma_embedding_function)
     if len(fragments) == 0:
         return {"status": "ok", "message": "No fragments found in the project files"}
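The filter above leans on a Chroma metadata query: if a lookup scoped to a file's source_file_path returns no ids, that file has never been ingested and gets re-added. A minimal sketch of the check, assuming fragments carry a source_file_path metadata key (as fragments_from_files stores) and using a placeholder collection name:

import chromadb

# Sketch only: in the hub, the client and collection come from
# chroma_collection_for_project; "demo_project" is a placeholder.
client = chromadb.Client()
collection = client.get_or_create_collection("demo_project")

def is_already_ingested(collection, file_path: str) -> bool:
    # The empty query text is just a carrier for the metadata filter;
    # any returned id means at least one fragment came from this file.
    results = collection.query(query_texts=[""], where={"source_file_path": file_path})
    return len(results["ids"][0]) > 0

Chroma's collection.get(where=...) would express the same lookup without embedding an empty query string; the commit mirrors the query path the endpoint already uses.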
19 changes: 16 additions & 3 deletions memory_cache_hub/api/v1/rag.py
@@ -23,6 +23,8 @@ def rag_ask(
     chroma_embedding_function = Depends(get_embedding_function),
     db=Depends(get_db)
 ):
+    print("GOT RAG ASK REQUEST:")
+    print(body)
     prompt = body.prompt
     project = db_get_project(db, body.project_id)
     chroma_collection = chroma_collection_for_project(chroma_client, chroma_embedding_function, project.name)
@@ -31,12 +33,13 @@ def rag_ask(
     big_content = ""
     big_content += f"Consider the following context:\n"
     for i, result in enumerate(query_results['metadatas'][0]):
-        if i == 2:
+        if i == 5:
             break
         file_path = result['source_file_path']
         big_content += f"----File: {file_path}\n"
         big_content += f"{query_results['documents'][0][i]}\n"

+    big_content += f"-----\n"
     big_content += f"Based on the context above, answer the following question:\n"
     big_content += f"{prompt}\n"

@@ -55,12 +58,22 @@ def rag_ask(
print("\n--------\n")
print(big_content)
print("\n--------\n")
reply = openai_compatible_completions(complete_url, complete_model, messages)
try:
print(complete_url)
reply = openai_compatible_completions(complete_url, complete_model, messages)

except Exception as e:
return RagAskResponse(
status="error",
message=str(e)
)

print(reply)
print("\n-------\n")

return RagAskResponse(
response=reply,
status="ok",
response=reply
)

@router.post("/vector_db_query", status_code=200, tags=["rag"])
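openai_compatible_completions is the hub's own wrapper, so the new try/except catches anything from a refused connection to a bad HTTP status and returns it as a structured error rather than an unhandled 500. As a rough sketch of what such a wrapper typically does against an OpenAI-compatible server (the endpoint shape and payload below are assumptions, not code from this repo):

import requests

def openai_compatible_completions_sketch(complete_url, complete_model, messages):
    # POST to an OpenAI-style chat completions endpoint; any network or
    # HTTP failure raises, landing in the caller's except branch.
    resp = requests.post(
        complete_url,
        json={"model": complete_model, "messages": messages},
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]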
4 changes: 3 additions & 1 deletion memory_cache_hub/api/v1/types.py
@@ -53,7 +53,9 @@ class RagAskRequest(BaseModel):
     prompt: str

 class RagAskResponse(BaseModel):
-    response: str
+    status: str
+    message: Optional[str] = None
+    response: Optional[str] = None

 class DownloadLlamafileByNameRequest(BaseModel):
     llamafile_filename: str
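With status required and the other two fields optional, a single model now covers both outcomes of the rag_ask endpoint. A quick illustration:

from typing import Optional
from pydantic import BaseModel

class RagAskResponse(BaseModel):
    status: str
    message: Optional[str] = None
    response: Optional[str] = None

ok = RagAskResponse(status="ok", response="The ingest ran cleanly.")
err = RagAskResponse(status="error", message="connection refused")
print(ok.response, err.message)  # a success carries response; an error carries message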
31 changes: 25 additions & 6 deletions memory_cache_hub/core/rag.py
@@ -7,9 +7,12 @@
 from typing import List
 from memory_cache_hub.core.chromadb import SentenceTransformerEmbeddingFunction
 from memory_cache_hub.core.types import FragmentMetadata, Fragment
+from pypdf import PdfReader

 import os
 import sys
+
+
 def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
     """Split text into chunks of the specified size with the specified overlap."""
     chunks = []
@@ -31,13 +34,29 @@ def fragments_from_files(file_paths: List[str],
         if file_path.endswith(".svg"):
             print(f"Skipping SVG file {file_path}.")
             continue
-        try:
-            file_content = open(file_path, encoding="utf-8").read()
-        except UnicodeDecodeError:
-            print(f"Skipping file {file_path} due to UnicodeDecodeError")
-            continue
-
-        chunks = split_text(file_content, chunk_size, chunk_overlap)
+        if file_path.endswith(".pdf"):
+            print(f"Processing PDF file {file_path}")
+            # Extract text from PDF files
+            reader = PdfReader(file_path)
+            number_of_pages = len(reader.pages)
+            chunks = []
+            for page_index in range(number_of_pages):
+                try:
+                    text = reader.pages[page_index].extract_text()
+                    page_chunks = split_text(text, chunk_size, chunk_overlap)
+                    chunks.extend(page_chunks)
+                except Exception as e:
+                    print(f"Skipping page {page_index} of file {file_path} due to error: {e}")
+                    continue
+        else:
+            try:
+                file_content = open(file_path, encoding="utf-8").read()
+                chunks = split_text(file_content, chunk_size, chunk_overlap)
+            except UnicodeDecodeError:
+                print(f"Skipping file {file_path} due to UnicodeDecodeError")
+                continue

         for i, chunk in enumerate(chunks):
             fragment = Fragment(
                 fragment_id=md5_hash(chunk),
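pypdf extracts text page by page, and extract_text() can fail on a single malformed page, which is why each page gets its own try/except rather than one guard around the whole file. A standalone sketch of the same loop, assuming a local example.pdf:

from pypdf import PdfReader

reader = PdfReader("example.pdf")  # placeholder path
for page_index, page in enumerate(reader.pages):
    try:
        text = page.extract_text()
    except Exception as e:
        print(f"Skipping page {page_index} due to error: {e}")
        continue
    print(f"Page {page_index}: {len(text)} characters extracted")

Each page's text then goes through the same split_text chunking as plain-text files (the ingest endpoint passes chunk_size=1000, chunk_overlap=200), so PDF fragments stay comparable in size.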
1 change: 1 addition & 0 deletions memory_cache_hub/llamafile/run_handle.py
@@ -47,6 +47,7 @@ async def _run(self):
         enable_gpu = False
         if enable_gpu:
             args += ["--ngl", "999"]
+        print("STARTING llamafile with args: ", args, flush=True)
         self.process = await asyncio.create_subprocess_exec(full_file_path,
                                                             *args,
                                                             stdout=asyncio.subprocess.PIPE,
1 change: 1 addition & 0 deletions requirements.txt
@@ -6,6 +6,7 @@ chromadb
 fastapi
 pathspec
 pydantic
+pypdf
 python-multipart
 sentence_transformers
 sqlmodel