diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py index c5f2415685..6c8fc6cdc7 100644 --- a/haystack/components/converters/pdfminer.py +++ b/haystack/components/converters/pdfminer.py @@ -5,7 +5,7 @@ import io import os from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional, Union from haystack import Document, component, logging from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata @@ -98,23 +98,27 @@ def __init__( # pylint: disable=too-many-positional-arguments ) self.store_full_path = store_full_path - def _converter(self, extractor) -> str: + @staticmethod + def _converter(lt_page_objs: Iterator) -> str: """ Extracts text from PDF pages then converts the text into a single str - :param extractor: + :param lt_page_objs: Python generator that yields PDF pages. :returns: PDF text converted to single str """ pages = [] - for page in extractor: + for page in lt_page_objs: text = "" for container in page: # Keep text only if isinstance(container, LTTextContainer): - text += container.get_text() + container_text = container.get_text() + if container_text: + text += "\n\n" + text += container_text pages.append(text) # Add a page delimiter @@ -156,8 +160,8 @@ def run( logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e) continue try: - pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params) - text = self._converter(pdf_reader) + pages = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params) + text = self._converter(pages) except Exception as e: logger.warning( "Could not read {source} and convert it to Document, skipping. 
{error}", source=source, error=e diff --git a/releasenotes/notes/fixing-PDFMiner-for-passage-detection-62cf5c3e9758bcf9.yaml b/releasenotes/notes/fixing-PDFMiner-for-passage-detection-62cf5c3e9758bcf9.yaml new file mode 100644 index 0000000000..5b791e9749 --- /dev/null +++ b/releasenotes/notes/fixing-PDFMiner-for-passage-detection-62cf5c3e9758bcf9.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Updated `PDFMinerToDocument` convert function to add double new lines between container_text so that passages can later be detected by `DocumentSplitter`. diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py index 92aeb2dcd1..4691a2a1a2 100644 --- a/test/components/converters/test_pdfminer_to_document.py +++ b/test/components/converters/test_pdfminer_to_document.py @@ -6,6 +6,7 @@ import pytest from haystack import Document +from haystack.components.preprocessors import DocumentSplitter from haystack.dataclasses import ByteStream from haystack.components.converters.pdfminer import PDFMinerToDocument @@ -155,3 +156,32 @@ def test_run_empty_document(self, caplog, test_files_path): # Check that not only content is used when the returned document is initialized and doc id is generated assert results["documents"][0].meta["file_path"] == "non_text_searchable.pdf" assert results["documents"][0].id != Document(content="").id + + def test_run_detect_pages_and_split_by_passage(self, test_files_path): + converter = PDFMinerToDocument() + sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"] + pdf_doc = converter.run(sources=sources) + splitter = DocumentSplitter(split_length=1, split_by="page") + docs = splitter.run(pdf_doc["documents"]) + assert len(docs["documents"]) == 4 + + def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path): + converter = PDFMinerToDocument() + sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"] + pdf_doc = converter.run(sources=sources) + splitter = 
DocumentSplitter(split_length=1, split_by="passage") + docs = splitter.run(pdf_doc["documents"]) + + assert len(docs["documents"]) == 29 + + expected = ( + "\nA wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively" + " \nedited and managed by its own audience directly using a web browser. A typical wiki \ncontains " + "multiple pages for the subjects or scope of the project and may be either open \nto the public or " + "limited to use within an organization for maintaining its internal knowledge \nbase. Wikis are " + "enabled by wiki software, otherwise known as wiki engines. A wiki engine, \nbeing a form of a " + "content management system, differs from other web-based systems \nsuch as blog software, in that " + "the content is created without any defined owner or leader, \nand wikis have little inherent " + "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n" + ) + assert docs["documents"][6].content == expected