Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev.ej/convert to offline html #256

Merged
merged 4 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 62 additions & 12 deletions readalongs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,27 +44,35 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"

import io
import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Sequence, Tuple, Union

import click
from lxml import etree

from readalongs import cli
from readalongs.align import create_ras_from_text
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.make_package import (
DEFAULT_HEADER,
DEFAULT_SUBHEADER,
DEFAULT_TITLE,
create_web_component_html,
)
from readalongs.text.util import parse_xml
from readalongs.util import JoinerCallbackForClick, get_langs_deferred


def align(
textfile: Union[str, Path],
audiofile: Union[str, Path],
output_base: Union[str, Path],
textfile: Union[str, os.PathLike],
audiofile: Union[str, os.PathLike],
output_base: Union[str, os.PathLike],
language: Sequence[str] = (),
output_formats: Sequence[str] = (),
**kwargs
**kwargs,
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs align" command from within a Python script.

Expand Down Expand Up @@ -119,10 +127,10 @@ def align(


def make_xml(
plaintextfile: Union[str, Path],
xmlfile: Union[str, Path],
plaintextfile: Union[str, os.PathLike],
xmlfile: Union[str, os.PathLike],
language: Sequence[str],
**kwargs
**kwargs,
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs make-xml" command from within a Python script.

Expand All @@ -144,7 +152,7 @@ def make_xml(
if isinstance(plaintextfile, click.utils.LazyFile)
else plaintextfile
)
xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
xmlfile = str(xmlfile) if isinstance(xmlfile, os.PathLike) else xmlfile
logging_stream = io.StringIO()
logging_handler = logging.StreamHandler(logging_stream)
try:
Expand Down Expand Up @@ -224,10 +232,8 @@ def convert_to_readalong(
(has no functional effect since g2p is not applied, it's only metadata)

Returns:
str: the readalong XML string, ready to print to a .readalong file
str: the readalong XML file contents, ready to print to .readalong
"""
from lxml import etree

xml_text = create_ras_from_text(
["".join(token.text for token in sentence) for sentence in sentences],
language,
Expand Down Expand Up @@ -259,3 +265,47 @@ def convert_to_readalong(
).decode("utf8")

return xml_text + "\n"


def convert_to_offline_html(
    sentences: Sequence[Sequence[Token]],
    audio_file_name: Union[str, os.PathLike],
    language: Sequence[str] = ("und",),
    title: str = DEFAULT_TITLE,
    header: str = DEFAULT_HEADER,
    subheader: str = DEFAULT_SUBHEADER,
) -> Tuple[str, str]:
    """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio,
    into a readalong Offline HTML

    Args:
        sentences: a list of sentences, each of which is a list of Token objects
            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
            Page breaks are marked by two empty sentences in a row
        audio_file_name: the name of the audio file to be used in the offline HTML
        language: list of languages to declare at the top of the readalong
            (has no functional effect since g2p is not applied, it's only metadata)
        title: optional title, will fill the HTML <title> tag
        header: optional header, will fill the readalong <span slot='read-along-header'>
        subheader: optional subheader, will fill the readalong <span slot='read-along-subheader'>

    Returns:
        (html_contents, readalong_contents):
         - the readalong Offline HTML file contents, ready to print to .html
         - the readalong XML file contents, ready to print to .readalong
    """

    readalong_xml = convert_to_readalong(sentences, language)

    # Create the temporary file *before* entering the try block: if creation
    # itself fails there is nothing to clean up, and the finally clause must
    # not reference an unbound name (which would mask the real error).
    # delete=False because the file is handed over by name after being closed.
    readalong_file = tempfile.NamedTemporaryFile(
        "w", encoding="utf8", delete=False, suffix=".readalong"
    )
    try:
        # The context manager closes the handle even if write() raises, so the
        # file is always closed before its name is passed on and before it is
        # unlinked below.
        with readalong_file:
            readalong_file.write(readalong_xml)
        offline_html = create_web_component_html(
            readalong_file.name, audio_file_name, title, header, subheader
        )
        return offline_html, readalong_xml
    finally:
        # Always remove the temporary .readalong file, on success or failure.
        os.unlink(readalong_file.name)
11 changes: 5 additions & 6 deletions readalongs/text/make_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
from base64 import b64encode
from mimetypes import guess_type
from typing import Any
from typing import Any, Union

from lxml import etree

Expand All @@ -36,7 +36,6 @@
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="application-name" content="read along">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="generator" content="@readalongs/studio (cli) {studio_version}">
<title>{title}</title>
<script>{js}</script>
Expand All @@ -57,11 +56,11 @@
DEFAULT_SUBHEADER = "Your read-along subtitle goes here"


def encode_from_path(path: str) -> str:
def encode_from_path(path: Union[str, os.PathLike]) -> str:
"""Encode file from bytes to b64 string with data and mime signature

Args:
path (str): path to file
path: path to file

Returns:
str: base64 string with data and mime signature
Expand Down Expand Up @@ -118,8 +117,8 @@ def encode_from_path(path: str) -> str:


def create_web_component_html(
ras_path: str,
audio_path: str,
ras_path: Union[str, os.PathLike],
audio_path: Union[str, os.PathLike],
title=DEFAULT_TITLE,
header=DEFAULT_HEADER,
subheader=DEFAULT_SUBHEADER,
Expand Down
3 changes: 1 addition & 2 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from collections import OrderedDict
from datetime import datetime
from io import TextIOWrapper
from pathlib import Path
from typing import IO, Union

from lxml import etree
Expand Down Expand Up @@ -104,7 +103,7 @@ def is_do_not_align(element):
return dna in ("true", "True", "TRUE", "1")


def load_xml(input_path: Union[str, Path, IO]) -> etree.ElementTree:
def load_xml(input_path: Union[str, os.PathLike, IO]) -> etree.ElementTree:
"""Safely load an XML file with etree.parse to respect encoding

Return: the root of the XML etree
Expand Down
67 changes: 43 additions & 24 deletions test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,36 +97,38 @@ def test_deprecated_prepare(self):
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
self.assertIn("deprecated", "\n".join(cm.output))

sentences_to_convert = [
[
api.Token("Bonjöûr,", 0.2, 1.0),
api.Token(" "),
api.Token("hello", 1.0, 0.2),
api.Token("!"),
],
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
[],
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
[],
[],
[
api.Token("("),
api.Token('"'),
api.Token("Page2", 5.2, 0.2),
api.Token("."),
api.Token('"'),
api.Token(")"),
],
]

def test_convert_to_readalong(self):
sentences = [
[
api.Token("Bonjöûr,", 0.2, 1.0),
api.Token(" "),
api.Token("hello", 1.0, 0.2),
api.Token("!"),
],
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
[],
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
[],
[],
[
api.Token("("),
api.Token('"'),
api.Token("Page2", 5.2, 0.2),
api.Token("."),
api.Token('"'),
api.Token(")"),
],
]

readalong = api.convert_to_readalong(sentences)

readalong = api.convert_to_readalong(self.sentences_to_convert)
# print(readalong)

# Make the reference by calling align with the same text and adjusting
# things we expect to be different.
sentences_as_text = "\n".join(
"".join(token.text for token in sentence) for sentence in sentences
"".join(token.text for token in sentence)
for sentence in self.sentences_to_convert
)
with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
f.write(sentences_as_text)
Expand All @@ -152,6 +154,23 @@ def test_convert_to_readalong(self):
readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
self.assertEqual(readalong, align_result)

def test_convert_to_offline_html(self):
    """The offline HTML built from the shared sample sentences has the expected markup."""
    audio_path = str(self.data_dir / "noise.mp3")
    html, _ = api.convert_to_offline_html(
        self.sentences_to_convert,
        audio_path,
        subheader="by Jove!",
    )

    # Fragments that must all appear in the generated page, checked in order.
    expected_fragments = (
        "<html",
        "<body",
        '<meta name="generator" content="@readalongs/studio (cli)',
        '<read-along href="data:application/readalong+xml;base64',
        'audio="data:audio/',
        "<span slot='read-along-header'>",
        "<span slot='read-along-subheader'>by Jove!</span>",
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, html)


if __name__ == "__main__":
main()
Loading