Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: DOCXToDocument: add table extraction #8457

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
106 changes: 67 additions & 39 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
with LazyImport("Run 'pip install python-docx'") as docx_import:
import docx
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph


Expand Down Expand Up @@ -118,9 +119,9 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
file = docx.Document(io.BytesIO(bytestream.data))
paragraphs = self._extract_paragraphs_with_page_breaks(file.paragraphs)
text = "\n".join(paragraphs)
docx_document = docx.Document(io.BytesIO(bytestream.data))
elements = self._extract_elements(docx_document)
text = "\n".join(elements)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
Expand All @@ -129,52 +130,79 @@ def run(
)
continue

docx_metadata = self._get_docx_metadata(document=file)
docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}

def _extract_paragraphs_with_page_breaks(self, paragraphs: List["Paragraph"]) -> List[str]:
def _extract_elements(self, document: "DocxDocument") -> List[str]:
    """
    Extracts paragraphs and tables from a DOCX file, in document order.

    Page breaks (both soft and hard page breaks) are not automatically extracted by python-docx as '\f' chars.
    This means we need to add them in ourselves, as done here. This allows the correct page number
    to be associated with each document if the file contents are split, e.g. by DocumentSplitter.

    :param document: The DOCX Document object.
    :returns: List of strings (paragraph texts and Markdown table representations) with page breaks
        added as '\f' characters.
    """
    elements: List[str] = []
    # Iterate over the raw XML body children so that paragraphs and tables are
    # visited in their original order (document.paragraphs / document.tables
    # would lose the relative ordering between the two kinds).
    for element in document.element.body:
        # NOTE(review): element.tag is a namespace-qualified name such as '{ns}p';
        # assumes endswith() does not collide with other body-level tags — confirm.
        if element.tag.endswith("p"):
            paragraph = Paragraph(element, document)
            if paragraph.contains_page_break:
                para_text = self._process_paragraph_with_page_breaks(paragraph)
            else:
                para_text = paragraph.text
            elements.append(para_text)
        elif element.tag.endswith("tbl"):
            table = docx.table.Table(element, document)
            elements.append(self._table_to_markdown(table))

    return elements

def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
para_text = ""
# Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
for pb_index, page_break in enumerate(paragraph.rendered_page_breaks):
# Can only extract text from first paragraph page break, unfortunately
if pb_index == 0:
if page_break.preceding_paragraph_fragment:
para_text += page_break.preceding_paragraph_fragment.text
para_text += "\f"
if page_break.following_paragraph_fragment:
# following_paragraph_fragment contains all text for remainder of paragraph.
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
# those later page breaks so we have to add them at end of text in the `else` block below.
# This is not ideal, but this case should be very rare and this is likely good enough.
para_text += page_break.following_paragraph_fragment.text
else:
paragraph_texts.append(para.text)

return paragraph_texts
para_text += "\f"
return para_text

def _table_to_markdown(self, table: "Table") -> str:
markdown: List[str] = []
max_col_widths: List[int] = []

# Calculate max width for each column
for row in table.rows:
for i, cell in enumerate(row.cells):
cell_text = cell.text.strip()
if i >= len(max_col_widths):
max_col_widths.append(len(cell_text))
else:
max_col_widths[i] = max(max_col_widths[i], len(cell_text))

# Process rows
for i, row in enumerate(table.rows):
md_row = [cell.text.strip().ljust(max_col_widths[j]) for j, cell in enumerate(row.cells)]
markdown.append("| " + " | ".join(md_row) + " |")

# Add separator after header row
if i == 0:
separator = ["-" * max_col_widths[j] for j in range(len(row.cells))]
markdown.append("| " + " | ".join(separator) + " |")

return "\n".join(markdown)

def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
    The DOCXToDocument converter now extracts tables in addition to paragraph text. Tables are rendered as Markdown and appear in the document content in their original position relative to the surrounding paragraphs.
83 changes: 80 additions & 3 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import logging
import json
import logging

shadeMe marked this conversation as resolved.
Show resolved Hide resolved
import pytest

from haystack.dataclasses import ByteStream
from haystack import Document
from haystack.components.converters.docx import DOCXToDocument, DOCXMetadata
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.dataclasses import ByteStream


@pytest.fixture
Expand Down Expand Up @@ -48,6 +48,83 @@ def test_run(self, test_files_path, docx_converter):
),
}

def test_run_with_table(self, test_files_path, docx_converter):
    """
    Test that a DOCX file containing a table is converted with the table rendered
    as Markdown, placed in its original position between the surrounding paragraphs.
    """
    paths = [test_files_path / "docx" / "sample_docx.docx"]
    output = docx_converter.run(sources=paths)
    docs = output["documents"]
    assert len(docs) == 1
    # Known body text from the sample fixture, appearing before the table.
    assert "Donald Trump" in docs[0].content
    assert docs[0].meta.keys() == {"file_path", "docx"}
    assert docs[0].meta == {
        "file_path": str(paths[0]),
        "docx": DOCXMetadata(
            author="Saha, Anirban",
            category="",
            comments="",
            content_status="",
            created="2020-07-14T08:14:00+00:00",
            identifier="",
            keywords="",
            language="",
            last_modified_by="Saha, Anirban",
            last_printed=None,
            modified="2020-07-14T08:16:00+00:00",
            revision=1,
            subject="",
            title="",
            version="",
        ),
    }
    # Check that the table Markdown is present and that the natural order of the
    # document's elements (text, table, text) is preserved.
    content_parts = docs[0].content.split("\n\n")
    table_index = next(i for i, part in enumerate(content_parts) if "| This | Is | Just a |" in part)
    # check that natural order of the document is preserved
    assert any("Donald Trump" in part for part in content_parts[:table_index]), "Text before table not found"
    assert any(
        "Now we are in Page 2" in part for part in content_parts[table_index + 1 :]
    ), "Text after table not found"

def test_table_between_two_paragraphs(self, test_files_path, docx_converter):
    """Test that a table sandwiched between two paragraphs is extracted in between them."""
    paths = [test_files_path / "docx" / "sample_docx_3.docx"]
    output = docx_converter.run(sources=paths)

    content = output["documents"][0].content

    # Slice out the text between the table's introductory line and the next paragraph.
    intro = "Table: AI Use Cases in Different Industries"
    table_start = content.find(intro) + len(intro) + 1
    table_end = content.find("Paragraph 2:")
    table_lines = [line for line in content[table_start:table_end].split("\n") if line]

    expected_table_header = "| Industry | AI Use Case | Impact |"
    expected_last_row = "| Finance | Fraud detection and prevention | Reduced financial losses |"

    assert table_lines[0] == expected_table_header
    assert table_lines[-1] == expected_last_row

def test_table_content_correct_parsing(self, test_files_path, docx_converter):
    """Test that every row of an extracted table is rendered as the expected Markdown line."""
    paths = [test_files_path / "docx" / "sample_docx_3.docx"]
    output = docx_converter.run(sources=paths)
    content = output["documents"][0].content

    # Slice out the text between the table's introductory line and the next paragraph.
    intro = "Table: AI Use Cases in Different Industries"
    table_start = content.find(intro) + len(intro) + 1
    table_end = content.find("Paragraph 2:")
    table_lines = [line for line in content[table_start:table_end].split("\n") if line]

    # Header + separator + two data rows.
    assert len(table_lines) == 4

    expected_rows = [
        "| Industry | AI Use Case | Impact |",
        "| ---------- | ------------------------------ | ------------------------- |",
        "| Healthcare | Predictive diagnostics | Improved patient outcomes |",
        "| Finance | Fraud detection and prevention | Reduced financial losses |",
    ]
    assert table_lines == expected_rows

def test_run_with_additional_meta(self, test_files_path, docx_converter):
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
Expand Down
Binary file added test/test_files/docx/sample_docx_3.docx
Binary file not shown.
Loading