Skip to content

Commit

Permalink
[dev] refactor, create CustomImagePagePdfReader
Browse files Browse the repository at this point in the history
  • Loading branch information
zoazhyga committed Sep 6, 2024
1 parent be204cf commit 800127f
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 18 deletions.
25 changes: 8 additions & 17 deletions private_gpt/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from llama_index.core.readers.json import JSONReader
from llama_index.core.schema import Document

from scripts.readers import CustomImagePagePdfReader

logger = logging.getLogger(__name__)

LLMSHERPA_API_URL = (
Expand Down Expand Up @@ -129,24 +131,14 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
"No text extracted from PDF, trying to extract images from PDF"
)
try:
import pdf2image
import pytesseract

images = pdf2image.convert_from_path(file_data)
documents = []

for i, image in tqdm.tqdm(enumerate(images)):
text = pytesseract.image_to_string(image, lang="rus")
doc = StringIterableReader().load_data(
[text],
)
# )[0]
# doc.metadata["page_label"] = str(i + 1)

documents.extend(doc)
pdf_reader = CustomImagePagePdfReader(lang="rus")
documents = pdf_reader.load_data(file_data.as_posix())
except Exception as e:
logger.error(f"Error extracting images from PDF: {e}")
raise ValueError(f"No text extracted from PDF={file_name}")
raise ValueError(f"No text extracted from PDF: {file_name}")

if len(documents) == 0:
logger.warning(f"No documents extracted from file: {file_name}")

return documents

Expand All @@ -158,7 +150,6 @@ def _exclude_metadata(documents: list[Document]) -> None:
# We don't want the Embeddings search to receive this metadata
document.excluded_embed_metadata_keys = ["doc_id"]
# We don't want the LLM to receive these metadata in the context
# ToDo currently remove file_name
document.excluded_llm_metadata_keys = [
# "file_name",
"doc_id",
Expand Down
4 changes: 3 additions & 1 deletion private_gpt/ui/ui.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This file should be imported if and only if you want to run the UI locally."""

import base64
import logging
import time
Expand Down Expand Up @@ -69,7 +70,8 @@ def curate_sources(sources: list[Chunk]) -> list["Source"]:
file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-"
page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-"

source = Source(file=file_name, page=page_label, text=chunk.text)
logger.debug("Source: %s %s", file_name, page_label)
source = Source(file=file_name, page=str(page_label), text=chunk.text)
curated_sources.append(source)
curated_sources = list(
dict.fromkeys(curated_sources).keys()
Expand Down
43 changes: 43 additions & 0 deletions scripts/readers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import logging
from typing import Any, Dict, List, Optional
import tqdm

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


logger = logging.getLogger(__name__)


class CustomImagePagePdfReader(BaseReader):
    """Read an image-only (scanned) PDF by running OCR on every page.

    Each page is rasterized with ``pdf2image`` and the resulting image is
    passed to ``pytesseract``; one ``Document`` is produced per page.
    """

    def __init__(self, *args: Any, lang: str = "rus", **kwargs: Any) -> None:
        """Create the reader.

        Args:
            *args: Forwarded unchanged to ``BaseReader.__init__``.
            lang: Language code handed to ``pytesseract.image_to_string``.
            **kwargs: Forwarded unchanged to ``BaseReader.__init__``.
        """
        super().__init__(*args, **kwargs)

        self.lang = lang

    def load_data(
        self, pdf_path: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Convert every page of the PDF to an image and OCR it.

        Args:
            pdf_path: Path of the PDF file, passed to
                ``pdf2image.convert_from_path``.
            extra_info: Optional metadata merged into every produced
                document. Keys given here override the per-page defaults
                (``chunk_type`` and ``page_label``).

        Returns:
            One ``Document`` per page, in page order. ``page_label`` is the
            1-based page number.

        Raises:
            ImportError: If ``pdf2image`` or ``pytesseract`` is not installed.
        """
        # Both OCR dependencies are optional, so they are imported lazily and
        # the original import failure is kept as the cause of the error.
        try:
            import pdf2image
        except ImportError as err:
            raise ImportError(
                "You need to install `pdf2image` to use this reader"
            ) from err

        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "You need to install `pytesseract` to use this reader"
            ) from err

        images = pdf2image.convert_from_path(pdf_path)
        documents = []

        # Wrap the list itself (not the enumerate iterator) so tqdm knows the
        # total number of pages and can render a real progress bar.
        for page_number, image in enumerate(tqdm.tqdm(images), start=1):
            text = pytesseract.image_to_string(image, lang=self.lang)
            metadata = {"chunk_type": "image", "page_label": page_number}
            if extra_info is not None:
                # Previously ignored: propagate caller-supplied metadata.
                metadata.update(extra_info)
            documents.append(Document(text=text, extra_info=metadata))

        return documents

0 comments on commit 800127f

Please sign in to comment.