diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index b42056e..32554a2 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -434,7 +434,7 @@ def get_map_to_page_dimensions(self): return pagedims - def export_to_markdown( + def export_to_markdown( # noqa: C901 self, delim: str = "\n\n", main_text_start: int = 0, @@ -445,8 +445,10 @@ def export_to_markdown( "paragraph", "caption", "table", + "figure", ], strict_text: bool = False, + image_placeholder: str = "<!-- image -->", ) -> str: r"""Serialize to Markdown. @@ -460,6 +462,12 @@ def export_to_markdown( Defaults to 0. main_text_end (Optional[int], optional): Main-text slicing stop index (exclusive). Defaults to None. + main_text_labels (list[str], optional): The labels to include in the + markdown. + strict_text (bool, optional): if true, the output will be only plain text + without any markdown styling. Defaults to False. + image_placeholder (str, optional): the placeholder to include to position + images in the markdown. Defaults to a markdown comment "<!-- image -->". Returns: str: The exported Markdown representation. @@ -539,6 +547,14 @@ def export_to_markdown( markdown_text = md_table + elif isinstance(item, Figure) and item_type in main_text_labels: + + markdown_text = "" + if not strict_text: + markdown_text = f"{image_placeholder}" + if item.text: + markdown_text += "\n" + item.text + if markdown_text: md_texts.append(markdown_text) diff --git a/test/data/doc/doc-export.md b/test/data/doc/doc-export.md index 8798f47..80ca3c0 100644 --- a/test/data/doc/doc-export.md +++ b/test/data/doc/doc-export.md @@ -16,6 +16,9 @@ In modern document understanding systems [1,15], table extraction is typically a Fig. 1. 
Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL). +<!-- image --> +Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL). + today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22]. Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. 
The tokens of the sequence typically belong to a markup language such as HTML, Latex or Markdown, which allow to describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTab-Net [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), popularized primarily the use of HTML as ground-truth representation format for TSR. @@ -44,6 +47,9 @@ ulary and can be interpreted as a table structure. For example, with the HTML to Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet. +<!-- image --> +Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet. + Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens (