Add unparsed_img text file format

naglis · Dec 22, 2024 · 1bbbf23 · 1bbbf23
1 parent 3fb41e4
commit 1bbbf23
Show file tree

Hide file tree

Showing 6 changed files with 233 additions and 22 deletions.
diff --git a/aeneas/tests/res/inputtext/sonnet_unparsed_img_id.xhtml b/aeneas/tests/res/inputtext/sonnet_unparsed_img_id.xhtml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
+ <head>
+  <meta charset="utf-8"/>
+  <meta name="viewport" content="width=768,height=1024"/>
+  <link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
+  <title>Sonnet I</title>
+ </head>
+ <body>
+  <div id="divTitle">
+   <h1><span id="f001">I</span></h1>
+  </div>
+  <div id="divSonnet"> 
+   <p>
+    <span id="f002">From fairest creatures we desire increase,</span><br/>
+   </p>
+   <figure>
+    <img id="f003" alt="This is the image description inside alt tag."/>
+   </figure>
+  </div>
+ </body>
+</html>
+
diff --git a/aeneas/tests/res/inputtext/sonnet_unparsed_img_no_alt.xhtml b/aeneas/tests/res/inputtext/sonnet_unparsed_img_no_alt.xhtml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
+ <head>
+  <meta charset="utf-8"/>
+  <meta name="viewport" content="width=768,height=1024"/>
+  <link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
+  <title>Sonnet I</title>
+ </head>
+ <body>
+  <div id="divTitle">
+   <h1><span id="f001">I</span></h1>
+  </div>
+  <div id="divSonnet"> 
+   <p>
+    <span id="f002">From fairest creatures we desire increase,</span><br/>
+   </p>
+   <figure>
+    <img id="f003"/>
+   </figure>
+  </div>
+ </body>
+</html>
+
diff --git a/aeneas/tests/res/inputtext/sonnet_unparsed_img_no_id.xhtml b/aeneas/tests/res/inputtext/sonnet_unparsed_img_no_id.xhtml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
+ <head>
+  <meta charset="utf-8"/>
+  <meta name="viewport" content="width=768,height=1024"/>
+  <link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
+  <title>Sonnet I</title>
+ </head>
+ <body>
+  <div id="divTitle">
+   <h1><span id="f001">I</span></h1>
+  </div>
+  <div id="divSonnet"> 
+   <p>
+    <span id="f002">From fairest creatures we desire increase,</span><br/>
+   </p>
+   <figure>
+    <img alt="This is the image description inside alt tag."/>
+   </figure>
+  </div>
+ </body>
+</html>
+
diff --git a/aeneas/tests/test_task.py b/aeneas/tests/test_task.py
@@ -38,6 +38,13 @@
 
 
 class TestTask(unittest.TestCase):
+    maxDiff = None
+
+    def assertTextFragmentsEqual(self, text_file, expected):
+        """
+        Assert that the fragments of ``text_file`` match ``expected``.
+
+        Each fragment is reduced to an ``(identifier, text)`` pair and
+        the resulting sequence is compared, in order, with ``expected``.
+
+        :param text_file: object exposing a ``fragments`` iterable whose
+                          items have ``identifier`` and ``text`` attributes
+        :param expected: the expected sequence of ``(identifier, text)`` pairs
+        """
+        self.assertSequenceEqual(
+            [(f.identifier, f.text) for f in text_file.fragments], expected
+        )
+
     def dummy_sync_map(self):
         sync_map = SyncMap()
         frag = TextFragment("f001", Language.ENG, ["Fragment 1"])
@@ -95,6 +102,7 @@ def set_text_file(
         task.text_file_path_absolute = gf.absolute_path(path, __file__)
         self.assertIsNotNone(task.text_file)
         self.assertEqual(len(task.text_file), expected)
+        return task.text_file
 
     def tc_from_string(self, config_string, properties):
         taskconf = TaskConfiguration(config_string)
@@ -216,6 +224,56 @@ def test_set_text_file_unparsed_id_class_empty(self):
             id_sort=IDSortingAlgorithm.NUMERIC,
         )
 
+    def test_set_text_file_unparsed_img_id_img_alt(self):
+        # The fixture's <img> has both a matching id and an alt attribute:
+        # it must yield a third fragment whose text is the alt value.
+        text_file = self.set_text_file(
+            "res/inputtext/sonnet_unparsed_img_id.xhtml",
+            TextFileFormat.UNPARSED_IMG,
+            3,
+            id_regex="f[0-9]+",
+            id_sort=IDSortingAlgorithm.NUMERIC,
+        )
+        self.assertTextFragmentsEqual(
+            text_file,
+            [
+                ("f001", "I"),
+                ("f002", "From fairest creatures we desire increase,"),
+                ("f003", "This is the image description inside alt tag."),
+            ],
+        )
+
+    def test_set_text_file_unparsed_img_no_id(self):
+        # The fixture's <img> has an alt attribute but no id:
+        # only the two <span> fragments are expected.
+        text_file = self.set_text_file(
+            "res/inputtext/sonnet_unparsed_img_no_id.xhtml",
+            TextFileFormat.UNPARSED_IMG,
+            2,
+            id_regex="f[0-9]+",
+            id_sort=IDSortingAlgorithm.NUMERIC,
+        )
+        self.assertTextFragmentsEqual(
+            text_file,
+            [
+                ("f001", "I"),
+                ("f002", "From fairest creatures we desire increase,"),
+            ],
+        )
+
+    def test_set_text_file_unparsed_img_no_alt(self):
+        # The fixture's <img> has a matching id but no alt attribute:
+        # a third fragment is still expected, with empty text.
+        text_file = self.set_text_file(
+            "res/inputtext/sonnet_unparsed_img_no_alt.xhtml",
+            TextFileFormat.UNPARSED_IMG,
+            3,
+            id_regex="f[0-9]+",
+            id_sort=IDSortingAlgorithm.NUMERIC,
+        )
+        self.assertTextFragmentsEqual(
+            text_file,
+            [
+                ("f001", "I"),
+                ("f002", "From fairest creatures we desire increase,"),
+                ("f003", ""),
+            ],
+        )
+
     def test_set_text_file_plain(self):
         self.set_text_file("res/inputtext/sonnet_plain.txt", TextFileFormat.PLAIN, 15)
 

diff --git a/aeneas/tests/test_textfile.py b/aeneas/tests/test_textfile.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-
 # aeneas is a Python/C library and a set of tools
 # to automagically synchronize audio and text (aka forced alignment)
 #
@@ -24,12 +22,14 @@
 
 from aeneas.idsortingalgorithm import IDSortingAlgorithm
 from aeneas.language import Language
-from aeneas.textfile import TextFile
-from aeneas.textfile import TextFileFormat
-from aeneas.textfile import TextFragment
-from aeneas.textfile import TextFilter
-from aeneas.textfile import TextFilterIgnoreRegex
-from aeneas.textfile import TextFilterTransliterate
+from aeneas.textfile import (
+    TextFile,
+    TextFileFormat,
+    TextFragment,
+    TextFilter,
+    TextFilterIgnoreRegex,
+    TextFilterTransliterate,
+)
 import aeneas.globalconstants as gc
 import aeneas.globalfunctions as gf
 
@@ -60,24 +60,25 @@ class TestTextFile(unittest.TestCase):
 
     def load(
         self,
-        input_file_path=PLAIN_FILE_PATH,
-        fmt=TextFileFormat.PLAIN,
-        expected_length=15,
-        parameters=None,
+        input_file_path: str = PLAIN_FILE_PATH,
+        fmt: str = TextFileFormat.PLAIN,
+        expected_length: int = 15,
+        parameters: dict | None = None,
     ):
         tfl = TextFile(gf.absolute_path(input_file_path, __file__), fmt, parameters)
         self.assertEqual(len(tfl), expected_length)
         return tfl
 
-    def load_and_sort_id(self, input_file_path, id_regex, id_sort, expected):
-        parameters = {}
-        parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX] = id_regex
-        parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT] = id_sort
+    def load_and_sort_id(
+        self, input_file_path: str, id_regex: str, id_sort: str, expected: list[str]
+    ):
+        parameters = {
+            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
+            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: id_sort,
+        }
         tfl = self.load(input_file_path, TextFileFormat.UNPARSED, 5, parameters)
-        i = 0
-        for e in expected:
+        for i, e in enumerate(expected):
             self.assertEqual(tfl.fragments[i].identifier, e)
-            i += 1
 
     def load_and_slice(self, expected, start=None, end=None):
         tfl = self.load()

diff --git a/aeneas/textfile.py b/aeneas/textfile.py
@@ -231,10 +231,68 @@ class TextFileFormat:
 
     """
 
+    UNPARSED_IMG = "unparsed_img"
+    """
+    The text file is a well-formed HTML/XHTML file,
+    where the text fragments have already been marked up.
+
+    This is the same as the ``unparsed`` format, but additionally
+    the ``alt`` text of the matched ``<img>`` elements is extracted.
+
+    The text fragments will be extracted by matching
+    the ``id`` and/or ``class`` attributes of each element
+    with the provided regular expressions::
+
+        <?xml version="1.0" encoding="UTF-8"?>
+        <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
+         <head>
+          <meta charset="utf-8"/>
+          <link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
+          <title>Sonnet I</title>
+         </head>
+         <body>
+          <div id="divTitle">
+           <h1><span class="ra" id="f001">I</span></h1>
+          </div>
+          <div id="divSonnet">
+           <p>
+            <span class="ra" id="f002">From fairest creatures we desire increase,</span><br/>
+            <span class="ra" id="f003">That thereby beauty’s rose might never die,</span><br/>
+            <span class="ra" id="f004">But as the riper should by time decease,</span><br/>
+            <span class="ra" id="f005">His tender heir might bear his memory:</span><br/>
+            <span class="ra" id="f006">But thou contracted to thine own bright eyes,</span><br/>
+            <span class="ra" id="f007">Feed’st thy light’s flame with self-substantial fuel,</span><br/>
+            <span class="ra" id="f008">Making a famine where abundance lies,</span><br/>
+            <span class="ra" id="f009">Thy self thy foe, to thy sweet self too cruel:</span><br/>
+            <span class="ra" id="f010">Thou that art now the world’s fresh ornament,</span><br/>
+            <span class="ra" id="f011">And only herald to the gaudy spring,</span><br/>
+            <span class="ra" id="f012">Within thine own bud buriest thy content,</span><br/>
+            <span class="ra" id="f013">And tender churl mak’st waste in niggarding:</span><br/>
+            <span class="ra" id="f014">Pity the world, or else this glutton be,</span><br/>
+            <span class="ra" id="f015">To eat the world’s due, by the grave and thee.</span>
+           </p>
+
+           <figure>
+            <img class="ra" id="f016" alt="This is an image description." src="img.png"/>
+           </figure>
+
+          </div>
+         </body>
+        </html>
+    """
+
     MULTILEVEL_VALUES = [MPLAIN, MUNPARSED]
     """ List of all multilevel formats """
 
-    ALLOWED_VALUES = [MPLAIN, MUNPARSED, PARSED, PLAIN, SUBTITLES, UNPARSED]
+    ALLOWED_VALUES = [
+        MPLAIN,
+        MUNPARSED,
+        PARSED,
+        PLAIN,
+        SUBTITLES,
+        UNPARSED,
+        UNPARSED_IMG,
+    ]
     """ List of all the allowed values """
 
 
@@ -671,6 +729,7 @@ def _read_from_file(self):
             TextFileFormat.PLAIN: self._read_plain,
             TextFileFormat.SUBTITLES: self._read_subtitles,
             TextFileFormat.UNPARSED: self._read_unparsed,
+            TextFileFormat.UNPARSED_IMG: self._read_unparsed_img,
         }
         map_read_function[self.file_format](lines)
 
@@ -911,11 +970,25 @@ def _read_plain(self, lines: typing.Sequence[str]):
             (id_format % idx, [line.strip()]) for idx, line in enumerate(lines, start=1)
         )
 
-    def _read_unparsed(self, lines: typing.Sequence[str]):
+    @staticmethod
+    def _get_node_text(node, *, read_img_alt: bool) -> str:
+        """
+        Return the text to associate with the given markup node.
+
+        The node's own text is returned when it is non-empty.
+        Otherwise, if ``read_img_alt`` is True and the node is an
+        ``<img>`` element, the value of its ``alt`` attribute is
+        returned when that attribute is present.
+        In all other cases an empty string is returned.
+
+        :param node: the node to read the text from; it must expose
+                     ``text``, ``name`` and ``attrs``
+                     (presumably a BeautifulSoup tag -- TODO confirm)
+        :param bool read_img_alt: if True, fall back to the ``alt`` attribute
+                                  of ``<img>`` elements
+        :rtype: str
+        """
+        if node.text:
+            return node.text
+        elif read_img_alt and node.name == "img":
+            alt = node.attrs.get("alt")
+            # A missing ``alt`` falls through to the empty-string default below.
+            if alt is not None:
+                return alt
+
+        return ""
+
+    def _read_unparsed(
+        self, lines: typing.Sequence[str], *, read_img_alt: bool = False
+    ):
         """
         Read text fragments from an unparsed format text file.
 
         :param list lines: the lines of the unparsed text file
+        :param bool read_img_alt: if True, use the ``alt`` attribute
+                                  of ``<img>`` elements as their text
         """
 
         def filter_attributes():
@@ -950,7 +1023,8 @@ def filter_attributes():
         for node in nodes:
             try:
                 f_id = node["id"]
-                f_text = node.text
+                f_text = self._get_node_text(node, read_img_alt=read_img_alt)
+
                 text_from_id[f_id] = f_text
                 ids.append(f_id)
             except KeyError:
@@ -970,6 +1044,15 @@ def filter_attributes():
         self.log("Appending fragments")
         self._create_text_fragments((key, [text_from_id[key]]) for key in sorted_ids)
 
+    def _read_unparsed_img(self, lines: typing.Sequence[str]):
+        """
+        Read text fragments from an unparsed format text file, additionally
+        extracting image descriptions.
+
+        This delegates to ``_read_unparsed`` with ``read_img_alt=True``
+        and returns whatever that call returns.
+
+        :param list lines: the lines of the unparsed text file
+        """
+        return self._read_unparsed(lines, read_img_alt=True)
+
     def _get_id_format(self):
         """Return the id regex from the parameters"""
         id_format = gf.safe_get(