[dev] refactor, create CustomImagePagePdfReader

zylon-ai · Sep 6, 2024 · 800127f · 800127f
1 parent be204cf
commit 800127f
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 18 deletions.
diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py
@@ -8,6 +8,8 @@
 from llama_index.core.readers.json import JSONReader
 from llama_index.core.schema import Document
 
+from scripts.readers import CustomImagePagePdfReader
+
 logger = logging.getLogger(__name__)
 
 LLMSHERPA_API_URL = (
@@ -129,24 +131,14 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
                 "No text extracted from PDF, trying to extract images from PDF"
             )
             try:
-                import pdf2image
-                import pytesseract
-
-                images = pdf2image.convert_from_path(file_data)
-                documents = []
-
-                for i, image in tqdm.tqdm(enumerate(images)):
-                    text = pytesseract.image_to_string(image, lang="rus")
-                    doc = StringIterableReader().load_data(
-                        [text],
-                    )
-                    # )[0]
-                    # doc.metadata["page_label"] = str(i + 1)
-
-                    documents.extend(doc)
+                pdf_reader = CustomImagePagePdfReader(lang="rus")
+                documents = pdf_reader.load_data(file_data.as_posix())
             except Exception as e:
                 logger.error(f"Error extracting images from PDF: {e}")
-                raise ValueError(f"No text extracted from PDF={file_name}")
+                raise ValueError(f"No text extracted from PDF: {file_name}")
+
+        if len(documents) == 0:
+            logger.warning(f"No documents extracted from file: {file_name}")
 
         return documents
 
@@ -158,7 +150,6 @@ def _exclude_metadata(documents: list[Document]) -> None:
             # We don't want the Embeddings search to receive this metadata
             document.excluded_embed_metadata_keys = ["doc_id"]
             # We don't want the LLM to receive these metadata in the context
-            # ToDo currently remove file_name
             document.excluded_llm_metadata_keys = [
                 # "file_name",
                 "doc_id",

diff --git a/private_gpt/ui/ui.py b/private_gpt/ui/ui.py
@@ -1,4 +1,5 @@
 """This file should be imported if and only if you want to run the UI locally."""
+
 import base64
 import logging
 import time
@@ -69,7 +70,8 @@ def curate_sources(sources: list[Chunk]) -> list["Source"]:
             file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-"
             page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-"
 
-            source = Source(file=file_name, page=page_label, text=chunk.text)
+            logger.debug("Source: %s %s", file_name, page_label)
+            source = Source(file=file_name, page=str(page_label), text=chunk.text)
             curated_sources.append(source)
             curated_sources = list(
                 dict.fromkeys(curated_sources).keys()

diff --git a/scripts/readers.py b/scripts/readers.py
@@ -0,0 +1,43 @@
+import logging
+from typing import Any, Dict, List, Optional
+import tqdm
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+logger = logging.getLogger(__name__)
+
+
+class CustomImagePagePdfReader(BaseReader):
+    """Read image-only PDFs by running OCR on each rendered page.
+
+    Pages are converted to images with ``pdf2image`` and the text of each
+    image is extracted with ``pytesseract``; every page becomes one
+    ``Document``.
+    """
+
+    def __init__(self, *args: Any, lang: str = "rus", **kwargs: Any) -> None:
+        """Create the reader.
+
+        Args:
+            *args: Forwarded unchanged to ``BaseReader.__init__``.
+            lang: Language code passed to ``pytesseract.image_to_string``.
+            **kwargs: Forwarded unchanged to ``BaseReader.__init__``.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.lang = lang
+
+    def load_data(
+        self, pdf_path: str, extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Convert the PDF at ``pdf_path`` to images and OCR every page.
+
+        Args:
+            pdf_path: Path of the PDF file to read.
+            extra_info: Optional metadata merged into every produced
+                document. Its keys take precedence over the reader's own
+                ``chunk_type`` / ``page_label`` entries. The dict itself is
+                never mutated.
+
+        Returns:
+            One ``Document`` per page, in page order. Each carries
+            ``chunk_type="image"`` and a 1-based integer ``page_label`` in
+            its metadata (unless overridden through ``extra_info``).
+
+        Raises:
+            ImportError: If ``pdf2image`` or ``pytesseract`` is not installed.
+        """
+        # Optional heavy dependencies are imported lazily so that importing
+        # this module does not require them.
+        try:
+            import pdf2image
+        except ImportError as err:
+            raise ImportError(
+                "You need to install `pdf2image` to use this reader"
+            ) from err
+
+        try:
+            import pytesseract
+        except ImportError as err:
+            raise ImportError(
+                "You need to install `pytesseract` to use this reader"
+            ) from err
+
+        images = pdf2image.convert_from_path(pdf_path)
+        documents = []
+
+        # `total` lets tqdm show real progress; enumerate() alone has no length.
+        for i, image in tqdm.tqdm(enumerate(images), total=len(images)):
+            text = pytesseract.image_to_string(image, lang=self.lang)
+            # Build a fresh dict per page so documents never share metadata.
+            metadata = {"chunk_type": "image", "page_label": i + 1}
+            if extra_info is not None:
+                metadata.update(extra_info)
+            doc = Document(
+                text=text,
+                extra_info=metadata,
+            )
+            documents.append(doc)
+
+        return documents