Add a client for getting data from YouTube

hypothesis · Aug 2, 2023 · a507b1e · a507b1e
1 parent 2543c70
commit a507b1e
Show file tree

Hide file tree

Showing 4 changed files with 181 additions and 1 deletion.
diff --git a/tests/unit/via/services/youtube_api/client_test.py b/tests/unit/via/services/youtube_api/client_test.py
@@ -0,0 +1,77 @@
+from unittest.mock import sentinel
+
+import pytest
+
+from via.services.youtube_api import (
+    CaptionTrack,
+    Transcript,
+    TranscriptText,
+    YouTubeAPIClient,
+)
+
+
+class TestYouTubeAPIClient:
+    """Unit tests for `YouTubeAPIClient` with the HTTP layer patched out."""
+
+    def test_get_video_info(self, client, http_session, Video):
+        # The exact request body the client is expected to POST.
+        expected_payload = {
+            "context": {
+                "client": {
+                    "hl": "en",
+                    "clientName": "WEB",
+                    # Suspicious value right here...
+                    "clientVersion": "2.20210721.00.00",
+                }
+            },
+            "videoId": "VIDEO_ID",
+        }
+
+        result = client.get_video_info("VIDEO_ID")
+
+        http_session.post.assert_called_once_with(
+            "https://youtubei.googleapis.com/youtubei/v1/player",
+            json=expected_payload,
+        )
+        http_response = http_session.post.return_value
+        http_response.json.assert_called_once_with()
+        # The decoded JSON is handed to the model, whose result is returned.
+        Video.from_v1_json.assert_called_once_with(
+            data=http_response.json.return_value
+        )
+        assert result == Video.from_v1_json.return_value
+
+    def test_get_transcript(self, client, http_session):
+        track = CaptionTrack("en", base_url=sentinel.url)
+        # Covers: a plain row, a row with no `dur` and trailing whitespace,
+        # and a row whose text is HTML-escaped markup.
+        http_session.get.return_value.text = """
+        <transcript>
+            <text start="0.21" dur="1.387">Hey there guys,</text>
+
+            <text start="1.597">Lichen&#39; subscribe        </text>
+            <text start="4.327" dur="2.063">
+                &lt;font color=&quot;#A0AAB4&quot;&gt;Buy my merch!&lt;/font&gt;
+            </text>
+        </transcript>
+        """
+        expected_rows = [
+            TranscriptText(text="Hey there guys,", start=0.21, duration=1.387),
+            TranscriptText(text="Lichen' subscribe", start=1.597, duration=0.0),
+            TranscriptText(text="Buy my merch!", start=4.327, duration=2.063),
+        ]
+
+        result = client.get_transcript(track)
+
+        http_session.get.assert_called_once_with(url=track.base_url)
+        assert result == Transcript(track=track, text=expected_rows)
+
+    def test_get_transcript_with_no_url(self, client):
+        # A track without a base URL cannot be fetched at all.
+        with pytest.raises(ValueError):
+            client.get_transcript(CaptionTrack("en", base_url=None))
+
+    @pytest.fixture(autouse=True)
+    def http_session(self, patch):
+        """Patch `HTTPService` in the client module and return its instance mock."""
+        http_service_class = patch("via.services.youtube_api.client.HTTPService")
+        return http_service_class.return_value
+
+    @pytest.fixture
+    def Video(self, patch):
+        """Patch the `Video` model as seen from the client module."""
+        return patch("via.services.youtube_api.client.Video")
+
+    @pytest.fixture
+    def client(self):
+        """Return the client under test."""
+        return YouTubeAPIClient()
diff --git a/via/services/youtube_api/__init__.py b/via/services/youtube_api/__init__.py
@@ -1 +1,8 @@
-from via.services.youtube_api.models import Captions, CaptionTrack, Video
+from via.services.youtube_api.client import YouTubeAPIClient
+from via.services.youtube_api.models import (
+    Captions,
+    CaptionTrack,
+    Transcript,
+    TranscriptText,
+    Video,
+)
diff --git a/via/services/youtube_api/client.py b/via/services/youtube_api/client.py
@@ -0,0 +1,79 @@
+from xml.etree import ElementTree
+
+import requests
+
+from via.services import HTTPService
+from via.services.youtube_api.models import (
+    CaptionTrack,
+    Transcript,
+    TranscriptText,
+    Video,
+)
+
+
+class YouTubeAPIError(Exception):
+    """Something has gone wrong interacting with YouTube.
+
+    NOTE(review): nothing in the client code shown in this module raises this
+    exception; it is presumably intended for callers or later changes.
+    """
+
+
+class YouTubeAPIClient:
+    """A client for interacting with YouTube and manipulating related URLs.
+
+    All requests go through an `HTTPService` wrapping a `requests.Session`
+    whose `Accept-Language` header is set to `en-US`.
+    """
+
+    def __init__(self):
+        session = requests.Session()
+        # Ensure any translations that Google provides are in English
+        session.headers["Accept-Language"] = "en-US"
+        self._http = HTTPService(session=session)
+
+    def get_video_info(self, video_id: str) -> Video:
+        """Get information for a given YouTube video.
+
+        :param video_id: The video ID, sent as `videoId` in the request body
+        :return: A `Video` built from the JSON body of the response
+        """
+
+        response = self._http.post(
+            "https://youtubei.googleapis.com/youtubei/v1/player",
+            json={
+                "context": {
+                    "client": {
+                        "hl": "en",
+                        "clientName": "WEB",
+                        # Suspicious value right here...
+                        # NOTE(review): this client version is hard-coded;
+                        # confirm whether YouTube needs it kept up to date.
+                        "clientVersion": "2.20210721.00.00",
+                    }
+                },
+                "videoId": video_id,
+            },
+        )
+
+        return Video.from_v1_json(data=response.json())
+
+    def get_transcript(self, caption_track: CaptionTrack) -> Transcript:
+        """Get the transcript associated with a caption track.
+
+        You can set the track `translated_language_code` to ensure we translate
+        the value before returning it.
+
+        NOTE(review): only `caption_track.base_url` is read here, so the
+        translation described above relies on that URL already reflecting
+        `translated_language_code` - confirm against `CaptionTrack`.
+
+        Elements with no text are skipped, and a missing `dur` attribute is
+        treated as a duration of 0.0.
+
+        :param caption_track: The track to fetch; it must have a `base_url`
+        :return: A `Transcript` with one `TranscriptText` per text element
+        :raises ValueError: If the caption track has no `base_url`
+        """
+
+        if not caption_track.base_url:
+            raise ValueError("Cannot get a transcript without a URL")
+
+        response = self._http.get(url=caption_track.base_url)
+        # NOTE(review): this parses a remote payload with the standard library
+        # XML parser, which the Python docs say is not hardened against
+        # maliciously constructed data. A body that is not well-formed XML
+        # raises `ElementTree.ParseError` from here.
+        xml_elements = ElementTree.fromstring(response.text)
+
+        return Transcript(
+            track=caption_track,
+            text=[
+                TranscriptText(
+                    text=self._strip_html(xml_element.text),
+                    start=float(xml_element.attrib["start"]),
+                    duration=float(xml_element.attrib.get("dur", "0.0")),
+                )
+                for xml_element in xml_elements
+                if xml_element.text is not None
+            ],
+        )
+
+    @staticmethod
+    def _strip_html(xml_string):
+        """Remove all non-text content from an XML fragment or string.
+
+        The fragment is wrapped in a `<span>` so that it parses as a single
+        element, then the text of every node is joined and stripped of
+        surrounding whitespace.
+
+        The caller passes text the XML parser has already unescaped, so it may
+        contain a bare `&` or `<`, or an entity XML doesn't define. Such text
+        can't be parsed as markup; it is returned as-is (stripped) instead of
+        raising and losing the whole transcript.
+        """
+
+        try:
+            fragment = ElementTree.fromstring(f"<span>{xml_string}</span>")
+        except ElementTree.ParseError:
+            # Not well-formed markup: treat it as plain text.
+            return xml_string.strip()
+
+        return "".join(fragment.itertext()).strip()
diff --git a/via/services/youtube_api/models.py b/via/services/youtube_api/models.py
@@ -140,3 +140,20 @@ def from_v1_json(cls, data):
                 safe_get(data, ["captions", "playerCaptionsTracklistRenderer"], {})
             )
         )
+
+
+@dataclass
+class TranscriptText:
+    """An individual row of transcript text."""
+
+    # The caption text for this row
+    text: str
+    # When the row starts (presumably an offset in seconds - TODO confirm)
+    start: float
+    # How long the row lasts (presumably in seconds - TODO confirm)
+    duration: float
+
+
+@dataclass
+class Transcript:
+    """A full transcript from a caption track."""
+
+    # The caption track this transcript belongs to
+    track: CaptionTrack
+    # The rows of the transcript (presumably in document order - TODO confirm)
+    text: List[TranscriptText]