Skip to content

Commit

Permalink
Add a client for getting data from YouTube
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Betts committed Aug 2, 2023
1 parent 7542f92 commit 5ec410d
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 1 deletion.
77 changes: 77 additions & 0 deletions tests/unit/via/services/youtube_api/client_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from unittest.mock import sentinel

import pytest

from via.services.youtube_api import (
CaptionTrack,
Transcript,
TranscriptText,
YouTubeAPIClient,
)


class TestYouTubeAPIClient:
    """Unit tests for `YouTubeAPIClient` with the HTTP service patched out."""

    @pytest.fixture(autouse=True)
    def http_session(self, patch):
        # Autouse so that no test in this class can reach the real network.
        http_service_class = patch("via.services.youtube_api.client.HTTPService")
        return http_service_class.return_value

    @pytest.fixture
    def Video(self, patch):
        return patch("via.services.youtube_api.client.Video")

    @pytest.fixture
    def client(self):
        return YouTubeAPIClient()

    def test_get_video_info(self, client, http_session, Video):
        expected_payload = {
            "context": {
                "client": {
                    "hl": "en",
                    "clientName": "WEB",
                    # Suspicious value right here...
                    "clientVersion": "2.20210721.00.00",
                }
            },
            "videoId": "VIDEO_ID",
        }

        result = client.get_video_info("VIDEO_ID")

        http_session.post.assert_called_once_with(
            "https://youtubei.googleapis.com/youtubei/v1/player",
            json=expected_payload,
        )
        http_response = http_session.post.return_value
        http_response.json.assert_called_once_with()

        # The decoded JSON body is handed to the model untouched, and the
        # model's return value is what the caller receives.
        Video.from_v1_json.assert_called_once_with(
            data=http_response.json.return_value
        )
        assert result == Video.from_v1_json.return_value

    def test_get_transcript(self, client, http_session):
        track = CaptionTrack("en", base_url=sentinel.url)
        # Covers three shapes of row: plain text, a row with no `dur`
        # attribute, and a row whose text is entity-escaped HTML markup.
        http_session.get.return_value.text = """
<transcript>
<text start="0.21" dur="1.387">Hey there guys,</text>
<text start="1.597">Lichen&#39; subscribe </text>
<text start="4.327" dur="2.063">
&lt;font color=&quot;#A0AAB4&quot;&gt;Buy my merch!&lt;/font&gt;
</text>
</transcript>
"""
        expected_rows = [
            TranscriptText(text="Hey there guys,", start=0.21, duration=1.387),
            TranscriptText(text="Lichen' subscribe", start=1.597, duration=0.0),
            TranscriptText(text="Buy my merch!", start=4.327, duration=2.063),
        ]

        result = client.get_transcript(track)

        http_session.get.assert_called_once_with(url=track.base_url)
        assert result == Transcript(track=track, text=expected_rows)

    def test_get_transcript_with_no_url(self, client):
        track_without_url = CaptionTrack("en", base_url=None)

        with pytest.raises(ValueError):
            client.get_transcript(track_without_url)
9 changes: 8 additions & 1 deletion via/services/youtube_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
from via.services.youtube_api.models import Captions, CaptionTrack, Video
from via.services.youtube_api.client import YouTubeAPIClient
from via.services.youtube_api.models import (
Captions,
CaptionTrack,
Transcript,
TranscriptText,
Video,
)
79 changes: 79 additions & 0 deletions via/services/youtube_api/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from xml.etree import ElementTree

import requests

from via.services import HTTPService
from via.services.youtube_api.models import (
CaptionTrack,
Transcript,
TranscriptText,
Video,
)


class YouTubeAPIError(Exception):
    """Raised when an interaction with YouTube fails."""


class YouTubeAPIClient:
    """A client for interacting with YouTube and manipulating related URLs."""

    def __init__(self):
        session = requests.Session()
        # Ensure any translations that Google provides are in English
        session.headers["Accept-Language"] = "en-US"
        self._http = HTTPService(session=session)

    def get_video_info(self, video_id: str) -> Video:
        """Get information for a given YouTube video.

        :param video_id: The YouTube id of the video to look up.
        :return: A `Video` built from the JSON body of the player endpoint
            response.
        """

        response = self._http.post(
            "https://youtubei.googleapis.com/youtubei/v1/player",
            json={
                "context": {
                    "client": {
                        "hl": "en",
                        "clientName": "WEB",
                        # Suspicious value right here...
                        # NOTE(review): hard-coded client version string;
                        # confirm whether it needs to track a real release.
                        "clientVersion": "2.20210721.00.00",
                    }
                },
                "videoId": video_id,
            },
        )

        return Video.from_v1_json(data=response.json())

    def get_transcript(self, caption_track: CaptionTrack) -> Transcript:
        """Get the transcript associated with a caption track.

        You can set the track `translated_language_code` to ensure we translate
        the value before returning it. (This method only reads
        `caption_track.base_url`; presumably the translation setting is
        reflected in that URL by the model - confirm against `CaptionTrack`.)

        :param caption_track: The track to download; must have a `base_url`.
        :return: A `Transcript` with one `TranscriptText` per `<text>` row
            that has text content.
        :raises ValueError: If the caption track has no `base_url`.
        """

        if not caption_track.base_url:
            raise ValueError("Cannot get a transcript without a URL")

        response = self._http.get(url=caption_track.base_url)
        xml_elements = ElementTree.fromstring(response.text)

        return Transcript(
            track=caption_track,
            text=[
                TranscriptText(
                    text=self._strip_html(xml_element.text),
                    start=float(xml_element.attrib["start"]),
                    # Rows without a `dur` attribute get a zero duration
                    duration=float(xml_element.attrib.get("dur", "0.0")),
                )
                for xml_element in xml_elements
                # Rows with no text content are dropped entirely
                if xml_element.text is not None
            ],
        )

    @staticmethod
    def _strip_html(xml_string):
        """Remove all non-text content from an XML fragment or string.

        The text is parsed as XML first, which drops any markup and decodes
        entities. Caption text has already been entity-decoded once by the
        outer transcript parse, so it can legitimately contain a bare `&` or
        `<` (e.g. "Tom & Jerry") which is not well-formed XML. Instead of
        letting one such row abort the whole transcript, we fall back to a
        best-effort tag strip and entity decode.

        :param xml_string: Text which may contain markup.
        :return: The text content with surrounding whitespace removed.
        """
        # Imported here so this block does not depend on the module's import
        # list being changed.
        import html
        import re

        try:
            fragment = ElementTree.fromstring(f"<span>{xml_string}</span>")
        except ElementTree.ParseError:
            # Only strip things that look like real tags (a letter directly
            # after `<` or `</`) so "1 < 2 and 3 > 2" is left intact.
            without_tags = re.sub(r"</?[A-Za-z][^<>]*>", "", xml_string)
            return html.unescape(without_tags).strip()

        return "".join(fragment.itertext()).strip()
17 changes: 17 additions & 0 deletions via/services/youtube_api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,20 @@ def from_v1_json(cls, data):
safe_get(data, ["captions", "playerCaptionsTracklistRenderer"], {})
)
)


@dataclass
class TranscriptText:
    """A single timed line of text within a transcript."""

    # The caption text for this row
    text: str

    # When the row begins
    # NOTE(review): presumably seconds from the start of the video - confirm
    start: float

    # How long the row lasts, in the same units as `start`
    duration: float


@dataclass
class Transcript:
    """The complete set of transcript rows for one caption track."""

    # The caption track this transcript was retrieved for
    track: CaptionTrack

    # The individual rows of the transcript
    text: List[TranscriptText]

0 comments on commit 5ec410d

Please sign in to comment.