Add a model for parsing YouTube V1 API responses

This is mostly arranged around captions, but has room to grow
hypothesis · Aug 2, 2023 · 7830357 · 7830357
1 parent aa70824
commit 7830357
Show file tree

Hide file tree

Showing 6 changed files with 350 additions and 0 deletions.
diff --git a/tests/unit/via/services/youtube_api/__init__.py b/tests/unit/via/services/youtube_api/__init__.py
diff --git a/tests/unit/via/services/youtube_api/_nested_data_test.py b/tests/unit/via/services/youtube_api/_nested_data_test.py
@@ -0,0 +1,20 @@
+import pytest
+from pytest import param
+
+from via.services.youtube_api._nested_data import safe_get
+
+
+class TestSafeGet:
+    """Tests for `safe_get`."""
+
+    @pytest.mark.parametrize(
+        "data,path,expected",
+        (
+            # An expected value of `...` (Ellipsis) means the lookup should
+            # fall back to the default, which the test below sets to `...`
+            param({"a": {"b": 1}}, ["a", "b"], 1, id="nested_dict_key"),
+            param({"a": 1}, ["b"], ..., id="missing_dict_key"),
+            # A value which is present but `None` is returned as is
+            param({"a": None}, ["a"], None, id="null_not_default"),
+            param({"a": None}, ["a", "b"], ..., id="dict_key_into_none"),
+            param({"a": [{"b": 1}]}, ["a", 0, "b"], 1, id="array_key"),
+            param({"a": [{"b": 1}]}, ["a", 1, "b"], ..., id="missing_array_key"),
+        ),
+    )
+    def test_it(self, data, path, expected):
+        # Use Ellipsis as the default so it can't be confused with a real
+        # `None` value found in the data
+        assert safe_get(data, path, default=...) == expected
diff --git a/tests/unit/via/services/youtube_api/models_test.py b/tests/unit/via/services/youtube_api/models_test.py
@@ -0,0 +1,151 @@
+from unittest.mock import sentinel
+
+import pytest
+from h_matchers import Any
+from pytest import param
+
+from via.services.youtube_api import Captions, CaptionTrack, Video
+
+
+class TestCaptionTrack:
+    """Tests for the `CaptionTrack` model."""
+
+    # With `None` the "kind" key is left out of the payload altogether
+    @pytest.mark.parametrize("kind", (None, True))
+    def test_from_v1_json(self, kind):
+        data = {
+            "name": {"simpleText": "English (British) - Name"},
+            "languageCode": "en-GB",
+            "baseUrl": sentinel.url,
+        }
+        if kind:
+            data["kind"] = kind
+
+        caption_track = CaptionTrack.from_v1_json(data)
+
+        # The name is the part of the label after " - " and the language code
+        # comes back lower-cased
+        assert caption_track == Any.instance_of(CaptionTrack).with_attrs(
+            {
+                "name": "Name",
+                "language_code": "en-gb",
+                "label": "English (British) - Name",
+                "kind": kind,
+                "base_url": sentinel.url,
+            }
+        )
+
+    @pytest.mark.parametrize(
+        "caption_track,id_string",
+        (
+            (CaptionTrack(language_code="en"), "en"),
+            (CaptionTrack(language_code="en", kind="asr"), "en.a"),
+            # "SGVsbG8=" is "Hello" base64 encoded
+            (CaptionTrack(language_code="en", name="Hello"), "en..SGVsbG8="),
+            # Let's try everything at once
+            # ("TmFtZQ==" is "Name" base64 encoded)
+            (
+                CaptionTrack(language_code="en-gb", kind="asr", name="Name"),
+                "en-gb.a.TmFtZQ==",
+            ),
+        ),
+    )
+    def test_id(self, caption_track, id_string):
+        assert caption_track.id == id_string
+
+    def test_is_auto_generated(self):
+        # A kind of "asr" should report the track as auto-generated...
+        caption_track = CaptionTrack("en", kind="asr")
+        assert caption_track.is_auto_generated
+
+        # ... and having no kind should not
+        caption_track.kind = None
+        assert not caption_track.is_auto_generated
+
+
+class TestCaptions:
+    """Tests for the `Captions` model."""
+
+    def test_from_v1_json(self, CaptionTrack):
+        captions = Captions.from_v1_json({"captionTracks": [{"track": "fake_dict"}]})
+
+        CaptionTrack.from_v1_json.assert_called_once_with({"track": "fake_dict"})
+        assert captions == Any.instance_of(Captions).with_attrs(
+            {"tracks": [CaptionTrack.from_v1_json.return_value]}
+        )
+
+    def test_from_v1_json_minimal(self, CaptionTrack):
+        captions = Captions.from_v1_json({})
+
+        assert not captions.tracks
+        # Tracks are built with `CaptionTrack.from_v1_json`, so that is the
+        # mock to check. Asserting on the class mock itself would pass even if
+        # a track had been parsed, as calling a child mock does not count as
+        # calling its parent.
+        CaptionTrack.from_v1_json.assert_not_called()
+
+    @pytest.mark.parametrize(
+        "preferences,expected_label",
+        (
+            param(
+                [CaptionTrack("en")],
+                "plain_en",
+                id="direct_match",
+            ),
+            param(
+                [CaptionTrack("de"), CaptionTrack("en-gb")],
+                "plain_en_gb",
+                id="miss_then_hit",
+            ),
+            param(
+                [
+                    CaptionTrack("de"),
+                    CaptionTrack(Any.string.matching("^en-.*"), name="Name"),
+                ],
+                "named_en_gb",
+                id="wild_cards",
+            ),
+            param(
+                [CaptionTrack("fr", kind=None), CaptionTrack("en", kind="asr")],
+                "en_auto",
+                id="fallback_to_auto",
+            ),
+            param(
+                [CaptionTrack(Any(), name="Name")],
+                "named_en_gb",
+                id="same_level_sorting",
+            ),
+            param([CaptionTrack("fr")], None, id="miss"),
+        ),
+    )
+    def test_find_matching_track(self, preferences, expected_label):
+        captions = Captions(
+            tracks=[
+                CaptionTrack("en", label="plain_en"),
+                CaptionTrack("en-gb", label="plain_en_gb"),
+                CaptionTrack("en-us", name="Name", label="named_en_us"),
+                CaptionTrack("en-gb", name="Name", label="named_en_gb"),
+                CaptionTrack("en", kind="asr", label="en_auto"),
+            ]
+        )
+
+        caption_track = captions.find_matching_track(preferences)
+
+        # An expected label of `None` means nothing should have matched
+        if expected_label:
+            assert caption_track.label == expected_label
+        else:
+            assert caption_track is None
+
+    # NB: This fixture has to stay below the parametrize block above. While
+    # the class body is being executed `CaptionTrack` there needs to resolve
+    # to the real imported class, not to this fixture function.
+    @pytest.fixture
+    def CaptionTrack(self, patch):
+        return patch("via.services.youtube_api.models.CaptionTrack")
+
+
+class TestVideo:
+    """Tests for the `Video` model."""
+
+    def test_from_v1_json(self, Captions):
+        video = Video.from_v1_json(
+            data={
+                "captions": {"playerCaptionsTracklistRenderer": sentinel.captions},
+            },
+        )
+
+        # Only the nested renderer section is handed on to `Captions`
+        Captions.from_v1_json.assert_called_once_with(sentinel.captions)
+
+        assert video == Any.instance_of(Video).with_attrs(
+            {"caption": Captions.from_v1_json.return_value}
+        )
+
+    def test_from_v1_json_minimal(self, Captions):
+        Video.from_v1_json({})
+
+        # With no captions section an empty dict is passed on instead
+        Captions.from_v1_json.assert_called_once_with({})
+
+    @pytest.fixture
+    def Captions(self, patch):
+        # NOTE(review): `patch` is a fixture defined outside this file. It is
+        # used here as if it returns a mock standing in for `models.Captions`
+        # - confirm against its definition.
+        return patch("via.services.youtube_api.models.Captions")
diff --git a/via/services/youtube_api/__init__.py b/via/services/youtube_api/__init__.py
@@ -0,0 +1,7 @@
+from via.services.youtube_api.models import (
+    Captions,
+    CaptionTrack,
+    Transcript,
+    TranscriptText,
+    Video,
+)
diff --git a/via/services/youtube_api/_nested_data.py b/via/services/youtube_api/_nested_data.py
@@ -0,0 +1,13 @@
+from typing import Iterable
+
+
+def safe_get(data, path: Iterable, default=None):
+    """Walk `path` into nested containers, giving `default` on any miss.
+
+    Each element of `path` is used in turn as a subscript on the value
+    reached so far. A missing dict key, an out of range list index, or a
+    value which can't be subscripted that way (such as `None`) ends the walk
+    and returns `default`. A value which is present but is `None` is
+    returned as is.
+
+    :param data: The outermost container to start from
+    :param path: Keys and/or indexes to follow, outermost first
+    :param default: What to return if any step of the path can't be followed
+    """
+    node = data
+
+    for step in path:
+        try:
+            child = node[step]
+        except (KeyError, IndexError, TypeError):
+            return default
+
+        node = child
+
+    return node
diff --git a/via/services/youtube_api/models.py b/via/services/youtube_api/models.py
@@ -0,0 +1,159 @@
+import base64
+from copy import deepcopy
+from dataclasses import dataclass, field
+from operator import attrgetter
+from typing import List, Optional
+
+from via.services.youtube_api._nested_data import safe_get
+
+
+@dataclass
+class CaptionTrack:
+    """A source of transcription data, in a particular language."""
+
+    language_code: str
+    """Original language of the track."""
+
+    name: Optional[str] = None
+    """Human set name for the track."""
+
+    kind: Optional[str] = None
+    """Kind of track. The value `asr` marks an auto-generated track."""
+
+    label: Optional[str] = None
+    """Human readable name (determined by language + name)."""
+
+    base_url: Optional[str] = None
+    """URL to download the original language text (as XML)."""
+
+    @classmethod
+    def from_v1_json(cls, data: dict):
+        """Create an instance from a `captionTrack` section of JSON.
+
+        :param data: Dict with `name.simpleText`, `languageCode` and `baseUrl`
+            entries, and optionally a `kind` entry
+        :raises KeyError: If one of the required entries is missing
+        """
+
+        label = data["name"]["simpleText"]
+
+        # Everything after the first " - " in the label is taken as the name.
+        # A label with no separator in it means the track has no name.
+        _, separator, name = label.partition(" - ")
+
+        # Build via `cls` rather than a hard-coded class, so that subclasses
+        # get instances of themselves back
+        return cls(
+            name=name if separator else None,
+            language_code=data["languageCode"].lower(),
+            label=label,
+            kind=data.get("kind"),
+            base_url=data["baseUrl"],
+        )
+
+    @property
+    def id(self) -> str:  # pylint: disable=invalid-name
+        """Get a string id built from the language code, kind and name.
+
+        The parts are joined with dots in that order, with an `a` standing in
+        for auto-generated tracks and the name base64 encoded. Empty parts at
+        the end are dropped, giving ids like `en`, `en.a` or `en..SGVsbG8=`.
+        """
+        if self.name:
+            # Ensure our ids don't contain wild characters
+            name = base64.b64encode(self.name.encode("utf-8")).decode("utf-8")
+        else:
+            name = ""
+
+        auto_marker = "a" if self.is_auto_generated else ""
+
+        # Stripping the trailing dots removes any empty parts at the end
+        return ".".join(
+            (self.language_code or "", auto_marker, name)
+        ).rstrip(".")
+
+    @property
+    def is_auto_generated(self) -> bool:
+        """Get whether this caption track is auto-generated."""
+
+        return self.kind == "asr"
+
+
+@dataclass
+class Captions:
+    """All information about captions."""
+
+    tracks: List[CaptionTrack] = field(default_factory=list)
+    """Available tracks to pick from."""
+
+    @classmethod
+    def from_v1_json(cls, data: dict):
+        """Create an instance from JSON.
+
+        This is populated from the `captions.playerCaptionsTracklistRenderer`
+        section.
+
+        :param data: Dict which may have a `captionTracks` list. If it is
+            absent the result has no tracks.
+        """
+
+        # Build via `cls` rather than a hard-coded class, so that subclasses
+        # get instances of themselves back
+        return cls(
+            tracks=[
+                CaptionTrack.from_v1_json(track)
+                for track in data.get("captionTracks", [])
+            ]
+        )
+
+    def find_matching_track(
+        self, preferences: List[CaptionTrack]
+    ) -> Optional[CaptionTrack]:
+        """
+        Get the caption track which best matches the preferences in order.
+
+        This method takes the provided list of caption track objects and
+        searches the available tracks for those with matching details:
+
+        * language_code
+        * name
+        * is_auto_generated / kind
+
+        Earlier items are higher priority. Where more than one track matches
+        the same preference, the track with the lowest `id` is chosen.
+
+        Details are compared using `==` with the preference's value on the
+        left hand side. This means a preference can hold an object with its
+        own idea of equality in place of a plain value to match more loosely.
+
+        :param preferences: List of partially filled out caption track objects
+            which represent the caption track we would like.
+        :return: A copy of the best matching track, or `None` if no track
+            matches any of the preferences.
+        """
+
+        def get_key(track: CaptionTrack):
+            return track.language_code, track.kind, track.name
+
+        search_keys = [get_key(preference) for preference in preferences]
+        best_index, best_caption_track = None, None
+
+        # Sort the tracks to keep the algorithm more stable! This only insulates
+        # us from sorting changes, not metadata changes.
+        for caption_track in sorted(self.tracks, key=attrgetter("id")):
+            try:
+                # This must stay as an equality based search (rather than
+                # becoming a dict or set lookup) so preferences with custom
+                # equality keep on working
+                index = search_keys.index(get_key(caption_track))
+            except ValueError:
+                continue
+
+            # Items with lower indexes are first choices for the user
+            if best_index is None or index < best_index:
+                best_index, best_caption_track = index, caption_track
+
+                if index == 0:
+                    # Nothing can beat a match on the first preference
+                    break
+
+        # Copy once, and only the winner, so callers can't change our tracks
+        # through the result. Copying `None` just gives `None` back.
+        return deepcopy(best_caption_track)
+
+
+@dataclass
+class Video:
+    """Data for a video in YouTube."""
+
+    caption: Optional[Captions] = None
+    """Caption related information (tracks and languages)."""
+
+    @classmethod
+    def from_v1_json(cls, data):
+        """Create an instance from JSON.
+
+        Only the `captions.playerCaptionsTracklistRenderer` section is read.
+        If that section can't be found the captions are built from an empty
+        dict instead.
+
+        :param data: The decoded JSON to read from
+        """
+        captions_data = safe_get(
+            data, ["captions", "playerCaptionsTracklistRenderer"], {}
+        )
+
+        # Build via `cls` rather than a hard-coded class, so that subclasses
+        # get instances of themselves back
+        return cls(caption=Captions.from_v1_json(captions_data))
+
+
+@dataclass
+class TranscriptText:
+    """An individual row of transcript text."""
+
+    # The text of this row
+    text: str
+    # NOTE(review): presumably how far into the video this row starts, in
+    # seconds - confirm against the code which fills this in
+    start: float
+    # NOTE(review): presumably how long this row lasts, in the same units as
+    # `start` - confirm
+    duration: float
+
+
+@dataclass
+class Transcript:
+    """A full transcript from a caption track."""
+
+    # The caption track this transcript is from
+    track: CaptionTrack
+    # The rows which make up the transcript
+    text: List[TranscriptText]