-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a model for parsing YouTube V1 API responses
This is mostly arranged around captions, but has room to grow
- Loading branch information
Jon Betts
committed
Aug 2, 2023
1 parent
aa70824
commit 7830357
Showing
6 changed files
with
350 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import pytest | ||
from pytest import param | ||
|
||
from via.services.youtube_api._nested_data import safe_get | ||
|
||
|
||
class TestSafeGet:
    """Checks that `safe_get` walks nested dicts / lists without raising."""

    @pytest.mark.parametrize(
        "data,path,expected",
        [
            param({"a": {"b": 1}}, ["a", "b"], 1, id="nested_dict_key"),
            param({"a": 1}, ["b"], ..., id="missing_dict_key"),
            param({"a": None}, ["a"], None, id="null_not_default"),
            param({"a": None}, ["a", "b"], ..., id="dict_key_into_none"),
            param({"a": [{"b": 1}]}, ["a", 0, "b"], 1, id="array_key"),
            param({"a": [{"b": 1}]}, ["a", 1, "b"], ..., id="missing_array_key"),
        ],
    )
    def test_it(self, data, path, expected):
        # Ellipsis is used as the default so a stored `None` can be told
        # apart from a lookup which fell back to the default
        result = safe_get(data, path, default=...)

        assert result == expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
from unittest.mock import sentinel | ||
|
||
import pytest | ||
from h_matchers import Any | ||
from pytest import param | ||
|
||
from via.services.youtube_api import Captions, CaptionTrack, Video | ||
|
||
|
||
class TestCaptionTrack:
    """Tests for creating caption tracks and deriving values from them."""

    @pytest.mark.parametrize("kind", (None, True))
    def test_from_v1_json(self, kind):
        # The "kind" key is only present in the payload when it has a value
        optional = {"kind": kind} if kind else {}
        payload = {
            "name": {"simpleText": "English (British) - Name"},
            "languageCode": "en-GB",
            "baseUrl": sentinel.url,
            **optional,
        }

        caption_track = CaptionTrack.from_v1_json(payload)

        expected_attrs = {
            "name": "Name",
            "language_code": "en-gb",
            "label": "English (British) - Name",
            "kind": kind,
            "base_url": sentinel.url,
        }
        assert caption_track == Any.instance_of(CaptionTrack).with_attrs(
            expected_attrs
        )

    @pytest.mark.parametrize(
        "caption_track,id_string",
        [
            (CaptionTrack(language_code="en"), "en"),
            (CaptionTrack(language_code="en", kind="asr"), "en.a"),
            (CaptionTrack(language_code="en", name="Hello"), "en..SGVsbG8="),
            # Every part of the id populated together
            (
                CaptionTrack(language_code="en-gb", kind="asr", name="Name"),
                "en-gb.a.TmFtZQ==",
            ),
        ],
    )
    def test_id(self, caption_track, id_string):
        assert caption_track.id == id_string

    def test_is_auto_generated(self):
        caption_track = CaptionTrack("en", kind="asr")
        assert caption_track.is_auto_generated

        # Clearing the kind should flip the derived property
        caption_track.kind = None
        assert not caption_track.is_auto_generated
|
||
|
||
class TestCaptions:
    """Tests for building `Captions` and picking a preferred track."""

    def test_from_v1_json(self, CaptionTrack):
        captions = Captions.from_v1_json({"captionTracks": [{"track": "fake_dict"}]})

        CaptionTrack.from_v1_json.assert_called_once_with({"track": "fake_dict"})
        assert captions == Any.instance_of(Captions).with_attrs(
            {"tracks": [CaptionTrack.from_v1_json.return_value]}
        )

    def test_from_v1_json_minimal(self, CaptionTrack):
        captions = Captions.from_v1_json({})

        assert not captions.tracks
        # `Captions.from_v1_json` builds tracks through
        # `CaptionTrack.from_v1_json`, so that is the call to check. Asserting
        # on the class mock itself could never fail.
        CaptionTrack.from_v1_json.assert_not_called()

    @pytest.mark.parametrize(
        "preferences,expected_label",
        (
            param(
                [CaptionTrack("en")],
                "plain_en",
                id="direct_match",
            ),
            param(
                [CaptionTrack("de"), CaptionTrack("en-gb")],
                "plain_en_gb",
                id="miss_then_hit",
            ),
            param(
                [
                    CaptionTrack("de"),
                    CaptionTrack(Any.string.matching("^en-.*"), name="Name"),
                ],
                "named_en_gb",
                id="wild_cards",
            ),
            param(
                [CaptionTrack("fr", kind=None), CaptionTrack("en", kind="asr")],
                "en_auto",
                id="fallback_to_auto",
            ),
            param(
                [CaptionTrack(Any(), name="Name")],
                "named_en_gb",
                id="same_level_sorting",
            ),
            param([CaptionTrack("fr")], None, id="miss"),
        ),
    )
    def test_find_matching_track(self, preferences, expected_label):
        captions = Captions(
            tracks=[
                CaptionTrack("en", label="plain_en"),
                CaptionTrack("en-gb", label="plain_en_gb"),
                CaptionTrack("en-us", name="Name", label="named_en_us"),
                CaptionTrack("en-gb", name="Name", label="named_en_gb"),
                CaptionTrack("en", kind="asr", label="en_auto"),
            ]
        )

        caption_track = captions.find_matching_track(preferences)

        if expected_label:
            assert caption_track.label == expected_label
        else:
            assert not caption_track

    # NOTE: Keep this fixture at the bottom of the class. Its name shadows the
    # imported `CaptionTrack` inside the class body, and the parametrize
    # decorator above needs the real class when it is evaluated.
    @pytest.fixture
    def CaptionTrack(self, patch):
        return patch("via.services.youtube_api.models.CaptionTrack")
|
||
|
||
class TestVideo:
    """Tests for building a `Video` from decoded JSON."""

    def test_from_v1_json(self, Captions):
        payload = {
            "captions": {"playerCaptionsTracklistRenderer": sentinel.captions},
        }

        video = Video.from_v1_json(data=payload)

        # The nested captions section is handed on to `Captions` untouched
        Captions.from_v1_json.assert_called_once_with(sentinel.captions)
        expected_attrs = {"caption": Captions.from_v1_json.return_value}
        assert video == Any.instance_of(Video).with_attrs(expected_attrs)

    def test_from_v1_json_minimal(self, Captions):
        Video.from_v1_json({})

        # With no captions section an empty dict is passed instead
        Captions.from_v1_json.assert_called_once_with({})

    @pytest.fixture
    def Captions(self, patch):
        return patch("via.services.youtube_api.models.Captions")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from via.services.youtube_api.models import ( | ||
Captions, | ||
CaptionTrack, | ||
Transcript, | ||
TranscriptText, | ||
Video, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from typing import Iterable | ||
|
||
|
||
def safe_get(data, path: Iterable, default=None):
    """Look up a value nested inside dicts / lists without raising.

    :param data: The outermost container to start from.
    :param path: Keys or indexes to follow, outermost first.
    :param default: Value to return if any step of the path can't be followed.
    :return: The value found at the end of the path (which may itself be
        `None`), or `default` if a step was missing or not subscriptable.
    """
    current = data

    for step in path:
        # Only the lookup itself is guarded, so problems with iterating
        # `path` still surface to the caller
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return default

    return current
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
import base64 | ||
from copy import deepcopy | ||
from dataclasses import dataclass, field | ||
from operator import attrgetter | ||
from typing import List, Optional | ||
|
||
from via.services.youtube_api._nested_data import safe_get | ||
|
||
|
||
@dataclass
class CaptionTrack:
    """A source of transcription data, in a particular language."""

    language_code: str
    """Original language of the track."""

    name: Optional[str] = None
    """Human set name for the track."""

    kind: Optional[str] = None
    """Kind of track; the value "asr" marks an auto generated track."""

    label: Optional[str] = None
    """Human readable name (determined by language + name)."""

    base_url: Optional[str] = None
    """URL to download the original language text (as XML)."""

    @classmethod
    def from_v1_json(cls, data: dict) -> "CaptionTrack":
        """Create an instance from a `captionTrack` section of JSON.

        :param data: Dict with `name.simpleText`, `languageCode` and
            `baseUrl` values, and optionally `kind`.
        :raises KeyError: If one of the required values is missing.
        """

        label = data["name"]["simpleText"]

        # A label containing " - " has the track name after the first one
        _language, separator, name = label.partition(" - ")

        # Build via `cls` so subclasses get instances of their own type
        return cls(
            name=name if separator else None,
            language_code=data["languageCode"].lower(),
            label=label,
            kind=data.get("kind", None),
            base_url=data["baseUrl"],
        )

    @property
    def id(self) -> str:  # pylint: disable=invalid-name
        """Get an id string built from the language code, kind and name.

        The parts are joined with "." and trailing empty parts are dropped,
        giving values like "en", "en.a" or "en..SGVsbG8=".
        """

        encoded_name = None
        if self.name:
            # Ensure our ids don't contain wild characters
            encoded_name = base64.b64encode(self.name.encode("utf-8")).decode("utf-8")

        parts = [
            self.language_code,
            "a" if self.is_auto_generated else None,
            encoded_name,
        ]

        return ".".join(part or "" for part in parts).rstrip(".")

    @property
    def is_auto_generated(self) -> bool:
        """Get whether this caption track is auto generated."""

        return self.kind == "asr"
|
||
|
||
@dataclass
class Captions:
    """All information about captions."""

    tracks: List[CaptionTrack] = field(default_factory=list)
    """Available tracks to pick from."""

    @classmethod
    def from_v1_json(cls, data: dict) -> "Captions":
        """Create an instance from JSON.

        This is populated from the `captions.playerCaptionsTracklistRenderer`
        section. A missing `captionTracks` key gives an empty list of tracks.

        :param data: The decoded JSON section as a dict.
        """

        # Build via `cls` so subclasses get instances of their own type
        return cls(
            tracks=[
                CaptionTrack.from_v1_json(track)
                for track in data.get("captionTracks", [])
            ]
        )

    def find_matching_track(
        self, preferences: List[CaptionTrack]
    ) -> Optional[CaptionTrack]:
        """
        Get the caption track matching the preferences in order.

        This method takes the provided list of caption track objects and
        searches the available tracks for those with matching details:

         * language_code
         * name
         * kind

        Earlier items are higher priority. When several tracks match the same
        preference, the one with the lowest `id` is chosen.

        :param preferences: List of partially filled out caption track objects
            which represent the caption track we would like.
        :return: A copy of the best matching track, or `None` if no track
            matches any preference.
        """

        def get_key(track: CaptionTrack):
            return track.language_code, track.kind, track.name

        # This is searched with `list.index`, which compares using equality
        # rather than hashing. That allows preference values to be objects
        # with custom equality (e.g. wildcard matchers), so don't replace
        # this with a dict lookup.
        search_keys = [get_key(preference) for preference in preferences]
        best_index, best_caption_track = None, None

        # Sort the tracks to keep the algorithm more stable! This only insulates
        # us from sorting changes, not metadata changes.
        for caption_track in sorted(self.tracks, key=attrgetter("id")):
            try:
                index = search_keys.index(get_key(caption_track))
            except ValueError:
                continue

            # Items with lower indexes are first choices for the user. The
            # comparison is strict so ties keep the earliest track by id.
            if best_index is None or index < best_index:
                best_index, best_caption_track = index, caption_track

        # Copy only the winner, so callers can't mutate our tracks
        return deepcopy(best_caption_track)
|
||
|
||
@dataclass
class Video:
    """Data for a video in YouTube."""

    caption: Optional[Captions] = None
    """Caption related information (tracks and languages)."""

    @classmethod
    def from_v1_json(cls, data) -> "Video":
        """Create an instance from JSON.

        Caption details are read from the
        `captions.playerCaptionsTracklistRenderer` section. If that section is
        missing or empty the video gets captions with no tracks.

        :param data: The decoded JSON as a dict.
        """

        captions_data = safe_get(
            data, ["captions", "playerCaptionsTracklistRenderer"], {}
        )

        # `safe_get` returns a value which is present but null as `None`
        # rather than the default, so fall back to an empty dict here too
        return cls(caption=Captions.from_v1_json(captions_data or {}))
|
||
|
||
@dataclass
class TranscriptText:
    """A single row of text within a transcript."""

    text: str
    """The text content of this row."""

    start: float
    """Where this row starts (NOTE(review): presumably seconds - confirm)."""

    duration: float
    """How long this row lasts (NOTE(review): presumably seconds - confirm)."""
|
||
|
||
@dataclass
class Transcript:
    """The complete transcript taken from one caption track."""

    track: CaptionTrack
    """The caption track this transcript is from."""

    text: List[TranscriptText]
    """The rows of text which make up the transcript."""