Add an algorithm for matching a YouTube caption track

hypothesis · Aug 3, 2023 · aebdea5 · aebdea5
1 parent caee583
commit aebdea5
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 0 deletions.
diff --git a/tests/unit/via/services/youtube_api/models_test.py b/tests/unit/via/services/youtube_api/models_test.py
@@ -2,6 +2,7 @@
 
 import pytest
 from h_matchers import Any
+from pytest import param
 
 from via.services.youtube_api import Captions, CaptionTrack, Video
 
@@ -68,6 +69,59 @@ def test_from_v1_json_minimal(self, CaptionTrack):
         assert not captions.tracks
         CaptionTrack.assert_not_called()
 
+    @pytest.mark.parametrize(
+        "preferences,expected_label",  # Ordered preferences -> expected track label
+        (
+            param(
+                [CaptionTrack("en")],
+                "plain_en",
+                id="direct_match",
+            ),
+            param(
+                [CaptionTrack("de"), CaptionTrack("en-gb")],  # No "de" track exists
+                "plain_en_gb",
+                id="miss_then_hit",
+            ),
+            param(
+                [
+                    CaptionTrack("de"),
+                    CaptionTrack(Any.string.matching("^en-.*"), name="Name"),
+                ],
+                "named_en_gb",  # Matcher objects let one preference fit many tracks
+                id="wild_cards",
+            ),
+            param(
+                [CaptionTrack("fr", kind=None), CaptionTrack("en", kind="asr")],
+                "en_auto",  # No "fr" track exists, so the "asr" kind track is used
+                id="fallback_to_auto",
+            ),
+            param(
+                [CaptionTrack(Any(), name="Name")],
+                "named_en_gb",  # Both named tracks match; "en-gb" is expected to win
+                id="same_level_sorting",
+            ),
+            param([CaptionTrack("fr")], None, id="miss"),
+        ),
+    )
+    def test_find_matching_track(self, preferences, expected_label):
+        captions = Captions(
+            tracks=[
+                CaptionTrack("en", label="plain_en"),
+                CaptionTrack("en-gb", label="plain_en_gb"),
+                CaptionTrack("en-us", name="Name", label="named_en_us"),
+                CaptionTrack("en-gb", name="Name", label="named_en_gb"),
+                CaptionTrack("en", kind="asr", label="en_auto"),
+            ]
+        )
+
+        caption_track = captions.find_matching_track(preferences)
+
+        assert (  # With no expected label, the result must be falsy instead
+            caption_track.label == expected_label
+            if expected_label
+            else not caption_track
+        )
+
     @pytest.fixture
     def CaptionTrack(self, patch):  # Presumably a mock; `patch` is defined elsewhere
         return patch("via.services.youtube_api.models.CaptionTrack")

diff --git a/via/services/youtube_api/models.py b/via/services/youtube_api/models.py
@@ -1,5 +1,7 @@
 import base64
+from copy import deepcopy
 from dataclasses import dataclass, field
+from operator import attrgetter
 from typing import List, Optional
 
 from via.services.youtube_api._nested_data import safe_get
@@ -84,6 +86,45 @@ def from_v1_json(cls, data: dict):
             ]
         )
 
+    def find_matching_track(
+        self, preferences: List[CaptionTrack]
+    ) -> Optional[CaptionTrack]:
+        """
+        Get the caption track that best matches the preferences, in order.
+
+        Each available track is compared with the preferences on the details
+        below. Earlier preferences win; ties go to the track sorted first by id:
+
+        * language_code
+        * name
+        * is_auto_generated / kind
+
+        :param preferences: List of partially filled out caption track objects
+            which represent the caption track we would like.
+        :return: A deep copy of the best matching track, or `None` if none of
+            the available tracks match any of the preferences.
+        """
+
+        def get_key(track: CaptionTrack):  # The details compared when matching
+            return track.language_code, track.kind, track.name
+
+        search_keys = [get_key(preference) for preference in preferences]
+        best_index, best_caption_track = None, None
+
+        # Sort the tracks by id so the result doesn't depend on the order they
+        # arrive in. This only insulates us from order changes, not metadata changes.
+        for caption_track in sorted(self.tracks, key=attrgetter("id")):
+            try:
+                index = search_keys.index(get_key(caption_track))
+            except ValueError:  # This track matches none of the preferences
+                continue
+
+            # Lower indexes are earlier choices; `>` keeps the first track on a tie
+            if best_index is None or best_index > index:
+                best_index, best_caption_track = index, deepcopy(caption_track)
+
+        return best_caption_track
+
 
 @dataclass
 class Video: