Skip to content

Commit

Permalink
Add unparsed_img text file format
Browse files Browse the repository at this point in the history
  • Loading branch information
naglis committed Dec 22, 2024
1 parent 3fb41e4 commit 1bbbf23
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 22 deletions.
23 changes: 23 additions & 0 deletions aeneas/tests/res/inputtext/sonnet_unparsed_img_id.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=768,height=1024"/>
<link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
<title>Sonnet I</title>
</head>
<body>
<div id="divTitle">
<h1><span id="f001">I</span></h1>
</div>
<div id="divSonnet">
<p>
<span id="f002">From fairest creatures we desire increase,</span><br/>
</p>
<figure>
<img id="f003" alt="This is the image description inside alt tag."/>
</figure>
</div>
</body>
</html>

23 changes: 23 additions & 0 deletions aeneas/tests/res/inputtext/sonnet_unparsed_img_no_alt.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=768,height=1024"/>
<link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
<title>Sonnet I</title>
</head>
<body>
<div id="divTitle">
<h1><span id="f001">I</span></h1>
</div>
<div id="divSonnet">
<p>
<span id="f002">From fairest creatures we desire increase,</span><br/>
</p>
<figure>
<img id="f003"/>
</figure>
</div>
</body>
</html>

23 changes: 23 additions & 0 deletions aeneas/tests/res/inputtext/sonnet_unparsed_img_no_id.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=768,height=1024"/>
<link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
<title>Sonnet I</title>
</head>
<body>
<div id="divTitle">
<h1><span id="f001">I</span></h1>
</div>
<div id="divSonnet">
<p>
<span id="f002">From fairest creatures we desire increase,</span><br/>
</p>
<figure>
<img alt="This is the image description inside alt tag."/>
</figure>
</div>
</body>
</html>

58 changes: 58 additions & 0 deletions aeneas/tests/test_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@


class TestTask(unittest.TestCase):
maxDiff = None

def assertTextFragmentsEqual(self, text_file, expected):
    """
    Assert that the fragments of ``text_file`` match ``expected``.

    :param text_file: object exposing a ``fragments`` iterable whose items
                      have ``identifier`` and ``text`` attributes
    :param expected: sequence of ``(identifier, text)`` pairs, in order
    """
    actual_pairs = []
    for fragment in text_file.fragments:
        actual_pairs.append((fragment.identifier, fragment.text))
    self.assertSequenceEqual(actual_pairs, expected)

def dummy_sync_map(self):
sync_map = SyncMap()
frag = TextFragment("f001", Language.ENG, ["Fragment 1"])
Expand Down Expand Up @@ -95,6 +102,7 @@ def set_text_file(
task.text_file_path_absolute = gf.absolute_path(path, __file__)
self.assertIsNotNone(task.text_file)
self.assertEqual(len(task.text_file), expected)
return task.text_file

def tc_from_string(self, config_string, properties):
taskconf = TaskConfiguration(config_string)
Expand Down Expand Up @@ -216,6 +224,56 @@ def test_set_text_file_unparsed_id_class_empty(self):
id_sort=IDSortingAlgorithm.NUMERIC,
)

def test_set_text_file_unparsed_img_id_img_alt(self):
    """An ``<img>`` with a matching ``id`` and an ``alt`` yields the alt text as its fragment."""
    expected_fragments = [
        ("f001", "I"),
        ("f002", "From fairest creatures we desire increase,"),
        ("f003", "This is the image description inside alt tag."),
    ]
    loaded = self.set_text_file(
        "res/inputtext/sonnet_unparsed_img_id.xhtml",
        TextFileFormat.UNPARSED_IMG,
        len(expected_fragments),
        id_regex="f[0-9]+",
        id_sort=IDSortingAlgorithm.NUMERIC,
    )
    self.assertTextFragmentsEqual(loaded, expected_fragments)

def test_set_text_file_unparsed_img_no_id(self):
    """An ``<img>`` without an ``id`` attribute does not produce a fragment."""
    expected_fragments = [
        ("f001", "I"),
        ("f002", "From fairest creatures we desire increase,"),
    ]
    loaded = self.set_text_file(
        "res/inputtext/sonnet_unparsed_img_no_id.xhtml",
        TextFileFormat.UNPARSED_IMG,
        len(expected_fragments),
        id_regex="f[0-9]+",
        id_sort=IDSortingAlgorithm.NUMERIC,
    )
    self.assertTextFragmentsEqual(loaded, expected_fragments)

def test_set_text_file_unparsed_img_no_alt(self):
    """An ``<img>`` with a matching ``id`` but no ``alt`` yields an empty-text fragment."""
    expected_fragments = [
        ("f001", "I"),
        ("f002", "From fairest creatures we desire increase,"),
        ("f003", ""),
    ]
    loaded = self.set_text_file(
        "res/inputtext/sonnet_unparsed_img_no_alt.xhtml",
        TextFileFormat.UNPARSED_IMG,
        len(expected_fragments),
        id_regex="f[0-9]+",
        id_sort=IDSortingAlgorithm.NUMERIC,
    )
    self.assertTextFragmentsEqual(loaded, expected_fragments)

def test_set_text_file_plain(self):
    """Load the plain-text sonnet fixture and expect a text file of length 15."""
    plain_fixture = "res/inputtext/sonnet_plain.txt"
    self.set_text_file(plain_fixture, TextFileFormat.PLAIN, 15)

Expand Down
39 changes: 20 additions & 19 deletions aeneas/tests/test_textfile.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# aeneas is a Python/C library and a set of tools
# to automagically synchronize audio and text (aka forced alignment)
#
Expand All @@ -24,12 +22,14 @@

from aeneas.idsortingalgorithm import IDSortingAlgorithm
from aeneas.language import Language
from aeneas.textfile import TextFile
from aeneas.textfile import TextFileFormat
from aeneas.textfile import TextFragment
from aeneas.textfile import TextFilter
from aeneas.textfile import TextFilterIgnoreRegex
from aeneas.textfile import TextFilterTransliterate
from aeneas.textfile import (
TextFile,
TextFileFormat,
TextFragment,
TextFilter,
TextFilterIgnoreRegex,
TextFilterTransliterate,
)
import aeneas.globalconstants as gc
import aeneas.globalfunctions as gf

Expand Down Expand Up @@ -60,24 +60,25 @@ class TestTextFile(unittest.TestCase):

def load(
    self,
    input_file_path: str = PLAIN_FILE_PATH,
    fmt: str = TextFileFormat.PLAIN,
    expected_length: int = 15,
    parameters: dict | None = None,
):
    """
    Load a text file fixture and check how many fragments were read.

    Only the annotated parameter list is kept: listing the same parameter
    names twice in one signature is a syntax error.

    :param input_file_path: fixture path, resolved relative to this test module
    :param fmt: the text file format, one of the ``TextFileFormat`` values
    :param expected_length: the expected ``len()`` of the loaded text file
    :param parameters: optional parameters dictionary passed to ``TextFile``
    :return: the loaded ``TextFile`` object
    """
    tfl = TextFile(gf.absolute_path(input_file_path, __file__), fmt, parameters)
    self.assertEqual(len(tfl), expected_length)
    return tfl

def load_and_sort_id(
    self, input_file_path: str, id_regex: str, id_sort: str, expected: list[str]
):
    """
    Load an unparsed fixture and check the order of its fragment identifiers.

    The file is loaded in ``UNPARSED`` format with an expected length of 5;
    then the identifier of the i-th fragment is compared with ``expected[i]``.

    :param input_file_path: fixture path, resolved relative to this test module
    :param id_regex: value for the unparsed ``id`` regex parameter
    :param id_sort: value for the unparsed ``id`` sorting algorithm parameter
    :param expected: the identifiers expected at the start of the fragment list
    """
    parameters = {
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: id_sort,
    }
    tfl = self.load(input_file_path, TextFileFormat.UNPARSED, 5, parameters)
    # A single enumerate() loop replaces the manual ``i = 0`` / ``i += 1`` counter.
    for i, e in enumerate(expected):
        self.assertEqual(tfl.fragments[i].identifier, e)

def load_and_slice(self, expected, start=None, end=None):
tfl = self.load()
Expand Down
89 changes: 86 additions & 3 deletions aeneas/textfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,68 @@ class TextFileFormat:
"""

UNPARSED_IMG = "unparsed_img"
"""
The text file is a well-formed HTML/XHTML file,
where the text fragments have already been marked up.
This is the same as the ``unparsed`` format, but additionally the text of
the ``alt`` attribute of matching ``<img>`` elements is extracted.
The text fragments will be extracted by matching
the ``id`` and/or ``class`` attributes of each element
with the provided regular expressions::
<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
<head>
<meta charset="utf-8"/>
<link rel="stylesheet" href="../Styles/style.css" type="text/css"/>
<title>Sonnet I</title>
</head>
<body>
<div id="divTitle">
<h1><span class="ra" id="f001">I</span></h1>
</div>
<div id="divSonnet">
<p>
<span class="ra" id="f002">From fairest creatures we desire increase,</span><br/>
<span class="ra" id="f003">That thereby beauty’s rose might never die,</span><br/>
<span class="ra" id="f004">But as the riper should by time decease,</span><br/>
<span class="ra" id="f005">His tender heir might bear his memory:</span><br/>
<span class="ra" id="f006">But thou contracted to thine own bright eyes,</span><br/>
<span class="ra" id="f007">Feed’st thy light’s flame with self-substantial fuel,</span><br/>
<span class="ra" id="f008">Making a famine where abundance lies,</span><br/>
<span class="ra" id="f009">Thy self thy foe, to thy sweet self too cruel:</span><br/>
<span class="ra" id="f010">Thou that art now the world’s fresh ornament,</span><br/>
<span class="ra" id="f011">And only herald to the gaudy spring,</span><br/>
<span class="ra" id="f012">Within thine own bud buriest thy content,</span><br/>
<span class="ra" id="f013">And tender churl mak’st waste in niggarding:</span><br/>
<span class="ra" id="f014">Pity the world, or else this glutton be,</span><br/>
<span class="ra" id="f015">To eat the world’s due, by the grave and thee.</span>
</p>
<figure>
<img class="ra" id="f016" alt="This is an image description." src="img.png"/>
</figure>
</div>
</body>
</html>
"""

MULTILEVEL_VALUES = [MPLAIN, MUNPARSED]
""" List of all multilevel formats """

ALLOWED_VALUES = [MPLAIN, MUNPARSED, PARSED, PLAIN, SUBTITLES, UNPARSED]
ALLOWED_VALUES = [
MPLAIN,
MUNPARSED,
PARSED,
PLAIN,
SUBTITLES,
UNPARSED,
UNPARSED_IMG,
]
""" List of all the allowed values """


Expand Down Expand Up @@ -671,6 +729,7 @@ def _read_from_file(self):
TextFileFormat.PLAIN: self._read_plain,
TextFileFormat.SUBTITLES: self._read_subtitles,
TextFileFormat.UNPARSED: self._read_unparsed,
TextFileFormat.UNPARSED_IMG: self._read_unparsed_img,
}
map_read_function[self.file_format](lines)

Expand Down Expand Up @@ -911,11 +970,25 @@ def _read_plain(self, lines: typing.Sequence[str]):
(id_format % idx, [line.strip()]) for idx, line in enumerate(lines, start=1)
)

def _read_unparsed(self, lines: typing.Sequence[str]):
@staticmethod
def _get_node_text(node, *, read_img_alt: bool) -> str:
    """
    Return the text to use for the fragment built from ``node``.

    The node's own text wins when it is non-empty. Otherwise, if
    ``read_img_alt`` is set and the node is an ``img`` element, the value of
    its ``alt`` attribute is used. In every other case the result is
    the empty string.

    :param node: an element exposing ``text``, ``name`` and ``attrs``
    :param bool read_img_alt: if True, fall back to the ``alt`` attribute of ``img`` elements
    :rtype: str
    """
    own_text = node.text
    if own_text:
        return own_text

    # Only <img> elements contribute their alt text, and only on request.
    if not read_img_alt or node.name != "img":
        return ""

    alt_text = node.attrs.get("alt")
    return "" if alt_text is None else alt_text

def _read_unparsed(
self, lines: typing.Sequence[str], *, read_img_alt: bool = False
):
"""
Read text fragments from an unparsed format text file.
:param list lines: the lines of the unparsed text file
:param bool read_img_alt: if True, read text from `<img/>` tag `alt` attribute
"""

def filter_attributes():
Expand Down Expand Up @@ -950,7 +1023,8 @@ def filter_attributes():
for node in nodes:
try:
f_id = node["id"]
f_text = node.text
f_text = self._get_node_text(node, read_img_alt=read_img_alt)

text_from_id[f_id] = f_text
ids.append(f_id)
except KeyError:
Expand All @@ -970,6 +1044,15 @@ def filter_attributes():
self.log("Appending fragments")
self._create_text_fragments((key, [text_from_id[key]]) for key in sorted_ids)

def _read_unparsed_img(self, lines: typing.Sequence[str]):
    """
    Read text fragments from an unparsed format text file,
    also taking the text of ``<img>`` ``alt`` attributes.

    This delegates to ``_read_unparsed`` with ``read_img_alt`` enabled.

    :param list lines: the lines of the unparsed text file
    """
    outcome = self._read_unparsed(lines, read_img_alt=True)
    return outcome

def _get_id_format(self):
"""Return the id regex from the parameters"""
id_format = gf.safe_get(
Expand Down

0 comments on commit 1bbbf23

Please sign in to comment.