Skip to content

Commit

Permalink
Add a model for parsing YouTube V1 API responses
Browse files Browse the repository at this point in the history
This is mostly arranged around captions, but has room to grow
  • Loading branch information
Jon Betts committed Aug 2, 2023
1 parent aa70824 commit 7830357
Show file tree
Hide file tree
Showing 6 changed files with 350 additions and 0 deletions.
Empty file.
20 changes: 20 additions & 0 deletions tests/unit/via/services/youtube_api/_nested_data_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pytest
from pytest import param

from via.services.youtube_api._nested_data import safe_get


class TestSafeGet:
    """Checks for `safe_get` lookups through nested dicts and lists."""

    # `...` (Ellipsis) is passed to `safe_get` as the default below, so an
    # expected value of `...` means "the lookup failed and we got the default"
    @pytest.mark.parametrize(
        ["data", "path", "expected"],
        [
            param({"a": {"b": 1}}, ["a", "b"], 1, id="nested_dict_key"),
            param({"a": 1}, ["b"], ..., id="missing_dict_key"),
            param({"a": None}, ["a"], None, id="null_not_default"),
            param({"a": None}, ["a", "b"], ..., id="dict_key_into_none"),
            param({"a": [{"b": 1}]}, ["a", 0, "b"], 1, id="array_key"),
            param({"a": [{"b": 1}]}, ["a", 1, "b"], ..., id="missing_array_key"),
        ],
    )
    def test_it(self, data, path, expected):
        result = safe_get(data, path, default=...)

        assert result == expected
151 changes: 151 additions & 0 deletions tests/unit/via/services/youtube_api/models_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from unittest.mock import sentinel

import pytest
from h_matchers import Any
from pytest import param

from via.services.youtube_api import Captions, CaptionTrack, Video


class TestCaptionTrack:
    """Tests for parsing, identifying and classifying a `CaptionTrack`."""

    @pytest.mark.parametrize("kind", [None, True])
    def test_from_v1_json(self, kind):
        payload = {
            "name": {"simpleText": "English (British) - Name"},
            "languageCode": "en-GB",
            "baseUrl": sentinel.url,
        }
        # Only include the key when there is a value, to exercise the case
        # where "kind" is absent from the JSON altogether
        if kind:
            payload["kind"] = kind

        expected_attrs = {
            "name": "Name",
            "language_code": "en-gb",
            "label": "English (British) - Name",
            "kind": kind,
            "base_url": sentinel.url,
        }

        track = CaptionTrack.from_v1_json(payload)

        assert track == Any.instance_of(CaptionTrack).with_attrs(expected_attrs)

    @pytest.mark.parametrize(
        ["caption_track", "expected_id"],
        [
            (CaptionTrack(language_code="en"), "en"),
            (CaptionTrack(language_code="en", kind="asr"), "en.a"),
            (CaptionTrack(language_code="en", name="Hello"), "en..SGVsbG8="),
            # Let's try everything at once
            (
                CaptionTrack(language_code="en-gb", kind="asr", name="Name"),
                "en-gb.a.TmFtZQ==",
            ),
        ],
    )
    def test_id(self, caption_track, expected_id):
        assert caption_track.id == expected_id

    def test_is_auto_generated(self):
        track = CaptionTrack("en", kind="asr")
        assert track.is_auto_generated

        # Clearing the kind should flip the flag on the same object
        track.kind = None
        assert not track.is_auto_generated


class TestCaptions:
    """Tests for `Captions`: parsing V1 JSON and picking a preferred track."""

    def test_from_v1_json(self, CaptionTrack):
        captions = Captions.from_v1_json({"captionTracks": [{"track": "fake_dict"}]})

        CaptionTrack.from_v1_json.assert_called_once_with({"track": "fake_dict"})
        assert captions == Any.instance_of(Captions).with_attrs(
            {"tracks": [CaptionTrack.from_v1_json.return_value]}
        )

    def test_from_v1_json_minimal(self, CaptionTrack):
        captions = Captions.from_v1_json({})

        assert not captions.tracks
        # Tracks are built through `CaptionTrack.from_v1_json`. A call to a
        # child mock is not counted as a call to its parent, so asserting on
        # the class mock alone would pass even if tracks had been parsed.
        CaptionTrack.from_v1_json.assert_not_called()
        CaptionTrack.assert_not_called()

    @pytest.mark.parametrize(
        "preferences,expected_label",
        (
            param(
                [CaptionTrack("en")],
                "plain_en",
                id="direct_match",
            ),
            param(
                [CaptionTrack("de"), CaptionTrack("en-gb")],
                "plain_en_gb",
                id="miss_then_hit",
            ),
            param(
                [
                    CaptionTrack("de"),
                    CaptionTrack(Any.string.matching("^en-.*"), name="Name"),
                ],
                "named_en_gb",
                id="wild_cards",
            ),
            param(
                [CaptionTrack("fr", kind=None), CaptionTrack("en", kind="asr")],
                "en_auto",
                id="fallback_to_auto",
            ),
            param(
                [CaptionTrack(Any(), name="Name")],
                "named_en_gb",
                id="same_level_sorting",
            ),
            param([CaptionTrack("fr")], None, id="miss"),
        ),
    )
    def test_find_matching_track(self, preferences, expected_label):
        captions = Captions(
            tracks=[
                CaptionTrack("en", label="plain_en"),
                CaptionTrack("en-gb", label="plain_en_gb"),
                CaptionTrack("en-us", name="Name", label="named_en_us"),
                CaptionTrack("en-gb", name="Name", label="named_en_gb"),
                CaptionTrack("en", kind="asr", label="en_auto"),
            ]
        )

        caption_track = captions.find_matching_track(preferences)

        if expected_label is None:
            assert caption_track is None
        else:
            assert caption_track.label == expected_label

    # NOTE: This fixture must stay below the `parametrize` decorator above.
    # That decorator is evaluated in the class body, where a name defined
    # earlier in the class would shadow the imported `CaptionTrack` class.
    @pytest.fixture
    def CaptionTrack(self, patch):
        return patch("via.services.youtube_api.models.CaptionTrack")


class TestVideo:
    """Tests for building a `Video` from V1 API JSON."""

    def test_from_v1_json(self, Captions):
        payload = {
            "captions": {"playerCaptionsTracklistRenderer": sentinel.captions},
        }

        video = Video.from_v1_json(data=payload)

        # The nested renderer section is what gets handed on to `Captions`
        Captions.from_v1_json.assert_called_once_with(sentinel.captions)

        expected_video = Any.instance_of(Video).with_attrs(
            {"caption": Captions.from_v1_json.return_value}
        )
        assert video == expected_video

    def test_from_v1_json_minimal(self, Captions):
        empty_payload = {}

        Video.from_v1_json(empty_payload)

        Captions.from_v1_json.assert_called_once_with({})

    @pytest.fixture
    def Captions(self, patch):
        return patch("via.services.youtube_api.models.Captions")
7 changes: 7 additions & 0 deletions via/services/youtube_api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from via.services.youtube_api.models import (
Captions,
CaptionTrack,
Transcript,
TranscriptText,
Video,
)
13 changes: 13 additions & 0 deletions via/services/youtube_api/_nested_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Iterable


def safe_get(data, path: Iterable, default=None):
    """Walk `path` through nested containers, returning `default` on failure.

    Each item in `path` is used in turn to index into the current value
    (a dict key, a list index, ...). If any step fails the `default` is
    returned instead of raising. A value which is present but `None` is
    returned as-is when it is the final step.

    :param data: The nested structure to look inside
    :param path: The keys / indexes to follow, outermost first
    :param default: Value to return when the path cannot be followed
    """

    current = data

    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            # Missing dict key, list index out of range, or the current value
            # can't be indexed like this (e.g. it's None)
            return default

    return current
159 changes: 159 additions & 0 deletions via/services/youtube_api/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import base64
from copy import deepcopy
from dataclasses import dataclass, field
from operator import attrgetter
from typing import List, Optional

from via.services.youtube_api._nested_data import safe_get


@dataclass
class CaptionTrack:
    """A source of transcription data, in a particular language."""

    language_code: str
    """Original language of the track."""

    name: Optional[str] = None
    """Human set name for the track."""

    kind: Optional[str] = None
    """Type of the track. The value "asr" marks an auto generated track."""

    label: Optional[str] = None
    """Human readable name (determined by language + name)."""

    base_url: Optional[str] = None
    """URL to download the original language text (as XML)."""

    @classmethod
    def from_v1_json(cls, data: dict):
        """Create an instance from a `captionTrack` section of JSON.

        The language code is lower-cased. The track name is whatever follows
        the first " - " in the label, or `None` if the label has no " - ".

        :param data: Dict with `name.simpleText`, `languageCode` and `baseUrl`
            keys, and optionally a `kind` key.
        :raises KeyError: If one of the required keys is missing
        """

        label = data["name"]["simpleText"]

        # Split on the first separator only, the name may contain " - " itself
        _, separator, name = label.partition(" - ")

        # Use `cls` rather than the class name so subclasses get instances
        # of their own type
        return cls(
            name=name if separator else None,
            language_code=data["languageCode"].lower(),
            label=label,
            kind=data.get("kind", None),
            base_url=data["baseUrl"],
        )

    @property
    def id(self) -> str:  # pylint: disable=invalid-name
        """Get a string id built from the language, kind and name.

        The parts are joined with "." as: the language code, "a" if the track
        is auto generated, and the base64 encoded name. Empty trailing parts
        are dropped, giving ids like "en", "en.a" or "en..SGVsbG8=".
        """

        if self.name:
            # Ensure our ids don't contain wild characters
            encoded_name = base64.b64encode(self.name.encode("utf-8")).decode("utf-8")
        else:
            encoded_name = ""

        parts = [
            self.language_code or "",
            "a" if self.is_auto_generated else "",
            encoded_name,
        ]

        # Stripping the dots removes the separators left by empty end parts
        return ".".join(parts).rstrip(".")

    @property
    def is_auto_generated(self) -> bool:
        """Get whether this caption track is auto generated."""

        return self.kind == "asr"


@dataclass
class Captions:
    """All information about captions."""

    tracks: List[CaptionTrack] = field(default_factory=list)
    """Available tracks to pick from."""

    @classmethod
    def from_v1_json(cls, data: dict):
        """Create an instance from JSON.

        This is populated from the `captions.playerCaptionsTracklistRenderer`
        section. A missing `captionTracks` key results in no tracks.

        :param data: Dict which may contain a `captionTracks` list
        """

        # Use `cls` rather than the class name so subclasses get instances
        # of their own type
        return cls(
            tracks=[
                CaptionTrack.from_v1_json(track)
                for track in data.get("captionTracks", [])
            ]
        )

    def find_matching_track(
        self, preferences: List[CaptionTrack]
    ) -> Optional[CaptionTrack]:
        """
        Get the caption track which best matches the preferences, in order.

        This method takes the provided list of caption track objects and
        searches the available tracks for those with matching details:

         * language_code
         * name
         * is_auto_generated / kind

        Earlier items are higher priority.

        :param preferences: List of partially filled out caption track objects
            which represent the caption track we would like.
        :return: A copy of the best matching track, or `None` if no track
            matches any of the preferences.
        """

        def get_key(track: CaptionTrack):
            return track.language_code, track.kind, track.name

        search_keys = [get_key(preference) for preference in preferences]
        best_index, best_caption_track = None, None

        # Sort the tracks to keep the algorithm more stable! This only insulates
        # us from sorting changes, not metadata changes.
        for caption_track in sorted(self.tracks, key=attrgetter("id")):
            try:
                # `list.index` compares with `==` rather than hashing, so the
                # values in a preference only need to compare equal to the
                # values of the track
                index = search_keys.index(get_key(caption_track))
            except ValueError:
                continue

            # Items with lower indexes are first choices for the user. The
            # comparison is strict, so on a tie the earlier sorted track wins
            if best_index is None or index < best_index:
                best_index, best_caption_track = index, caption_track

                if index == 0:
                    # Nothing can beat a match for the first preference
                    break

        # Copy once at the end, so the caller can't modify our tracks through
        # the returned object. Copying `None` gives `None`.
        return deepcopy(best_caption_track)


@dataclass
class Video:
    """Data for a video in YouTube."""

    caption: Optional[Captions] = None
    """Caption related information (tracks and languages)."""

    @classmethod
    def from_v1_json(cls, data):
        """Create an instance from JSON.

        The caption details are read from the
        `captions.playerCaptionsTracklistRenderer` section. If that section
        is missing or null the captions are built from an empty dict.

        :param data: Dict to read the video details from
        """

        captions_data = safe_get(
            data, ["captions", "playerCaptionsTracklistRenderer"], {}
        )

        # `safe_get` only uses the default when the path can't be followed. A
        # section which is present but null comes back as `None`, which
        # `Captions.from_v1_json` can't read from.
        # Use `cls` rather than the class name so subclasses get instances
        # of their own type
        return cls(caption=Captions.from_v1_json(captions_data or {}))


@dataclass
class TranscriptText:
    """An individual row of transcript text.

    The fields are declared in the order `text`, `start`, `duration`, which
    is also the positional order of the generated constructor.
    """

    # The words of this row
    text: str
    # NOTE(review): presumably the offset into the video at which this row
    # begins; the unit (seconds?) is not visible here - confirm
    start: float
    # NOTE(review): presumably how long this row lasts, in the same unit as
    # `start` - confirm
    duration: float


@dataclass
class Transcript:
    """A full transcript from a caption track.

    Both fields are required; there are no defaults.
    """

    # The caption track this transcript was taken from
    track: CaptionTrack
    # The rows which make up the transcript
    # NOTE(review): presumably in playback order - confirm against the code
    # which builds these
    text: List[TranscriptText]

0 comments on commit 7830357

Please sign in to comment.