From 800127f3ba6fb507fe5e14c9b21b04cd50f1ad34 Mon Sep 17 00:00:00 2001
From: zoazhyga
Date: Fri, 6 Sep 2024 10:24:20 +0200
Subject: [PATCH] [dev] refactor, create CustomImagePagePdfReader

---
Review note: scripts/readers.py was revised during review, so the blob id on
its "index" line and the commit id above still refer to the original submission.

 .../components/ingest/ingest_helper.py | 25 ++----
 private_gpt/ui/ui.py                   |  4 +-
 scripts/readers.py                     | 77 +++++++++++++++++++
 3 files changed, 88 insertions(+), 18 deletions(-)
 create mode 100644 scripts/readers.py

diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py
index 45adbba88..7e5bcd930 100644
--- a/private_gpt/components/ingest/ingest_helper.py
+++ b/private_gpt/components/ingest/ingest_helper.py
@@ -8,6 +8,8 @@
 from llama_index.core.readers.json import JSONReader
 from llama_index.core.schema import Document
 
+from scripts.readers import CustomImagePagePdfReader
+
 logger = logging.getLogger(__name__)
 
 LLMSHERPA_API_URL = (
@@ -129,24 +131,14 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
                 "No text extracted from PDF, trying to extract images from PDF"
             )
             try:
-                import pdf2image
-                import pytesseract
-
-                images = pdf2image.convert_from_path(file_data)
-                documents = []
-
-                for i, image in tqdm.tqdm(enumerate(images)):
-                    text = pytesseract.image_to_string(image, lang="rus")
-                    doc = StringIterableReader().load_data(
-                        [text],
-                    )
-                    # )[0]
-                    # doc.metadata["page_label"] = str(i + 1)
-
-                    documents.extend(doc)
+                pdf_reader = CustomImagePagePdfReader(lang="rus")
+                documents = pdf_reader.load_data(file_data.as_posix())
             except Exception as e:
                 logger.error(f"Error extracting images from PDF: {e}")
-                raise ValueError(f"No text extracted from PDF={file_name}")
+                raise ValueError(f"No text extracted from PDF: {file_name}")
+
+        if len(documents) == 0:
+            logger.warning(f"No documents extracted from file: {file_name}")
 
         return documents
 
@@ -158,7 +150,6 @@ def _exclude_metadata(documents: list[Document]) -> None:
             # We don't want the Embeddings search to receive this metadata
             document.excluded_embed_metadata_keys = ["doc_id"]
             # We don't want the LLM to receive these metadata in the context
-            # ToDo currently remove file_name
             document.excluded_llm_metadata_keys = [
                 # "file_name",
                 "doc_id",
diff --git a/private_gpt/ui/ui.py b/private_gpt/ui/ui.py
index 2c1dcd3e2..f273059bc 100644
--- a/private_gpt/ui/ui.py
+++ b/private_gpt/ui/ui.py
@@ -1,4 +1,5 @@
 """This file should be imported if and only if you want to run the UI locally."""
+
 import base64
 import logging
 import time
@@ -69,7 +70,8 @@ def curate_sources(sources: list[Chunk]) -> list["Source"]:
             file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-"
             page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-"
 
-            source = Source(file=file_name, page=page_label, text=chunk.text)
+            logger.debug("Source: %s %s", file_name, page_label)
+            source = Source(file=file_name, page=str(page_label), text=chunk.text)
             curated_sources.append(source)
             curated_sources = list(
                 dict.fromkeys(curated_sources).keys()
diff --git a/scripts/readers.py b/scripts/readers.py
new file mode 100644
index 000000000..9843d7fe1
--- /dev/null
+++ b/scripts/readers.py
@@ -0,0 +1,77 @@
+"""Custom llama-index readers used during ingestion."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+import tqdm
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class CustomImagePagePdfReader(BaseReader):
+    """OCR reader for PDFs: one ``Document`` per page image.
+
+    Pages are converted to images with ``pdf2image`` and the text of each
+    image is extracted with ``pytesseract``.
+    """
+
+    def __init__(self, *args: Any, lang: str = "rus", **kwargs: Any) -> None:
+        """Create the reader.
+
+        Args:
+            *args: Forwarded to ``BaseReader``.
+            lang: Language passed to ``pytesseract.image_to_string``.
+            **kwargs: Forwarded to ``BaseReader``.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.lang = lang
+
+    def load_data(
+        self, pdf_path: str, extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Run OCR on every page of the PDF at ``pdf_path``.
+
+        Args:
+            pdf_path: Path of the PDF file to read.
+            extra_info: Optional metadata copied into every document; the
+                reader's own ``chunk_type``/``page_label`` keys win on conflict.
+
+        Returns:
+            One ``Document`` per page, in page order, with ``chunk_type`` set
+            to ``"image"`` and ``page_label`` set to the 1-based page number.
+
+        Raises:
+            ImportError: If ``pdf2image`` or ``pytesseract`` is not installed.
+        """
+        # Imported lazily so the OCR stack is only required when this runs.
+        try:
+            import pdf2image
+        except ImportError as err:
+            raise ImportError(
+                "You need to install `pdf2image` to use this reader"
+            ) from err
+
+        try:
+            import pytesseract
+        except ImportError as err:
+            raise ImportError(
+                "You need to install `pytesseract` to use this reader"
+            ) from err
+
+        images = pdf2image.convert_from_path(pdf_path)
+        documents = []
+
+        # Wrap the page list itself (not ``enumerate``) so tqdm knows the total.
+        for page_number, image in enumerate(tqdm.tqdm(images), start=1):
+            text = pytesseract.image_to_string(image, lang=self.lang)
+            metadata = {
+                **(extra_info or {}),
+                "chunk_type": "image",
+                "page_label": page_number,
+            }
+            documents.append(Document(text=text, extra_info=metadata))
+
+        return documents