Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: DOCXToDocument: add table extraction #8457

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
106 changes: 67 additions & 39 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
with LazyImport("Run 'pip install python-docx'") as docx_import:
import docx
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph


Expand Down Expand Up @@ -118,9 +119,9 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
file = docx.Document(io.BytesIO(bytestream.data))
paragraphs = self._extract_paragraphs_with_page_breaks(file.paragraphs)
text = "\n".join(paragraphs)
docx_document = docx.Document(io.BytesIO(bytestream.data))
elements = self._extract_elements(docx_document)
text = "\n".join(elements)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
Expand All @@ -129,52 +130,79 @@ def run(
)
continue

docx_metadata = self._get_docx_metadata(document=file)
docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}

def _extract_paragraphs_with_page_breaks(self, paragraphs: List["Paragraph"]) -> List[str]:
def _extract_elements(self, document: "DocxDocument") -> List[str]:
    """
    Extracts paragraphs and tables from a DOCX file, in document order.

    Page breaks (both soft and hard page breaks) are not automatically extracted by python-docx as '\f' chars.
    This means we need to add them in ourselves, as done here. This allows the correct page number
    to be associated with each document if the file contents are split, e.g. by DocumentSplitter.

    :param document: The DOCX Document object.
    :returns: List of strings (paragraph texts and Markdown table representations) with page breaks
        added as '\f' characters.
    """
    elements: List[str] = []
    # Iterate over the raw XML body children so that paragraphs and tables are
    # visited in their original order (document.paragraphs / document.tables
    # would lose the relative ordering between the two kinds).
    for element in document.element.body:
        # NOTE(review): element.tag is a namespace-qualified name such as '{ns}p';
        # assumes endswith() does not collide with other body-level tags — confirm.
        if element.tag.endswith("p"):
            paragraph = Paragraph(element, document)
            if paragraph.contains_page_break:
                para_text = self._process_paragraph_with_page_breaks(paragraph)
            else:
                para_text = paragraph.text
            elements.append(para_text)
        elif element.tag.endswith("tbl"):
            table = docx.table.Table(element, document)
            elements.append(self._table_to_markdown(table))

    return elements

def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
para_text = ""
# Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
for pb_index, page_break in enumerate(paragraph.rendered_page_breaks):
# Can only extract text from first paragraph page break, unfortunately
if pb_index == 0:
if page_break.preceding_paragraph_fragment:
para_text += page_break.preceding_paragraph_fragment.text
para_text += "\f"
if page_break.following_paragraph_fragment:
# following_paragraph_fragment contains all text for remainder of paragraph.
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
# those later page breaks so we have to add them at end of text in the `else` block below.
# This is not ideal, but this case should be very rare and this is likely good enough.
para_text += page_break.following_paragraph_fragment.text
else:
paragraph_texts.append(para.text)

return paragraph_texts
para_text += "\f"
return para_text

def _table_to_markdown(self, table: "Table") -> str:
markdown: List[str] = []
max_col_widths: List[int] = []

# Calculate max width for each column
for row in table.rows:
for i, cell in enumerate(row.cells):
cell_text = cell.text.strip()
if i >= len(max_col_widths):
max_col_widths.append(len(cell_text))
else:
max_col_widths[i] = max(max_col_widths[i], len(cell_text))

# Process rows
for i, row in enumerate(table.rows):
md_row = [cell.text.strip().ljust(max_col_widths[j]) for j, cell in enumerate(row.cells)]
markdown.append("| " + " | ".join(md_row) + " |")

# Add separator after header row
if i == 0:
separator = ["-" * max_col_widths[j] for j in range(len(row.cells))]
markdown.append("| " + " | ".join(separator) + " |")

return "\n".join(markdown)

def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
    The DOCXToDocument converter now extracts tables in addition to paragraph text. Tables are rendered as Markdown and appear in the document content in their original position relative to the surrounding paragraphs.
83 changes: 80 additions & 3 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import logging
import json
import logging

shadeMe marked this conversation as resolved.
Show resolved Hide resolved
import pytest

from haystack.dataclasses import ByteStream
from haystack import Document
from haystack.components.converters.docx import DOCXToDocument, DOCXMetadata
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.dataclasses import ByteStream


@pytest.fixture
Expand Down Expand Up @@ -48,6 +48,83 @@ def test_run(self, test_files_path, docx_converter):
),
}

def test_run_with_table(self, test_files_path, docx_converter):
    """
    Test that a DOCX file containing a table is converted with the table rendered
    as Markdown, placed in its original position between the surrounding paragraphs.
    """
    paths = [test_files_path / "docx" / "sample_docx.docx"]
    output = docx_converter.run(sources=paths)
    docs = output["documents"]
    assert len(docs) == 1
    # Known body text from the sample fixture, appearing before the table.
    assert "Donald Trump" in docs[0].content
    assert docs[0].meta.keys() == {"file_path", "docx"}
    assert docs[0].meta == {
        "file_path": str(paths[0]),
        "docx": DOCXMetadata(
            author="Saha, Anirban",
            category="",
            comments="",
            content_status="",
            created="2020-07-14T08:14:00+00:00",
            identifier="",
            keywords="",
            language="",
            last_modified_by="Saha, Anirban",
            last_printed=None,
            modified="2020-07-14T08:16:00+00:00",
            revision=1,
            subject="",
            title="",
            version="",
        ),
    }
    # Check that the table Markdown is present and that the natural order of the
    # document's elements (text, table, text) is preserved.
    content_parts = docs[0].content.split("\n\n")
    table_index = next(i for i, part in enumerate(content_parts) if "| This | Is | Just a |" in part)
    # check that natural order of the document is preserved
    assert any("Donald Trump" in part for part in content_parts[:table_index]), "Text before table not found"
    assert any(
        "Now we are in Page 2" in part for part in content_parts[table_index + 1 :]
    ), "Text after table not found"

def test_table_between_two_paragraphs(self, test_files_path, docx_converter):
    """Test that a table sandwiched between two paragraphs is extracted in between them."""
    paths = [test_files_path / "docx" / "sample_docx_3.docx"]
    output = docx_converter.run(sources=paths)

    content = output["documents"][0].content

    # Slice out the text between the table's introductory line and the next paragraph.
    intro = "Table: AI Use Cases in Different Industries"
    table_start = content.find(intro) + len(intro) + 1
    table_end = content.find("Paragraph 2:")
    table_lines = [line for line in content[table_start:table_end].split("\n") if line]

    expected_table_header = "| Industry | AI Use Case | Impact |"
    expected_last_row = "| Finance | Fraud detection and prevention | Reduced financial losses |"

    assert table_lines[0] == expected_table_header
    assert table_lines[-1] == expected_last_row

def test_table_content_correct_parsing(self, test_files_path, docx_converter):
    """Test that every row of an extracted table is rendered as the expected Markdown line."""
    paths = [test_files_path / "docx" / "sample_docx_3.docx"]
    output = docx_converter.run(sources=paths)
    content = output["documents"][0].content

    # Slice out the text between the table's introductory line and the next paragraph.
    intro = "Table: AI Use Cases in Different Industries"
    table_start = content.find(intro) + len(intro) + 1
    table_end = content.find("Paragraph 2:")
    table_lines = [line for line in content[table_start:table_end].split("\n") if line]

    # Header + separator + two data rows.
    assert len(table_lines) == 4

    expected_rows = [
        "| Industry | AI Use Case | Impact |",
        "| ---------- | ------------------------------ | ------------------------- |",
        "| Healthcare | Predictive diagnostics | Improved patient outcomes |",
        "| Finance | Fraud detection and prevention | Reduced financial losses |",
    ]
    assert table_lines == expected_rows

def test_run_with_additional_meta(self, test_files_path, docx_converter):
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
Expand Down
Binary file added test/test_files/docx/sample_docx_3.docx
Binary file not shown.
Loading