Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a client for getting data from YouTube #1100

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions tests/unit/via/services/youtube_api/client_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from unittest.mock import sentinel

import pytest

from via.services.youtube_api import (
CaptionTrack,
Transcript,
TranscriptText,
YouTubeAPIClient,
)


class TestYouTubeAPIClient:
    """Unit tests for `YouTubeAPIClient` with its HTTP layer patched out."""

    def test_get_video_info(self, client, http_session, Video):
        """The player endpoint is POSTed to and its JSON is given to `Video`."""
        expected_payload = {
            "context": {
                "client": {
                    "hl": "en",
                    "clientName": "WEB",
                    # Suspicious value right here...
                    "clientVersion": "2.20210721.00.00",
                }
            },
            "videoId": "VIDEO_ID",
        }

        result = client.get_video_info("VIDEO_ID")

        http_session.post.assert_called_once_with(
            "https://youtubei.googleapis.com/youtubei/v1/player",
            json=expected_payload,
        )
        reply = http_session.post.return_value
        reply.json.assert_called_once_with()

        # The parsed JSON must be passed straight through to the model.
        Video.from_v1_json.assert_called_once_with(data=reply.json.return_value)
        assert result == Video.from_v1_json.return_value

    def test_get_transcript(self, client, http_session):
        """Caption XML becomes `TranscriptText` rows with any markup removed."""
        track = CaptionTrack("en", base_url=sentinel.url)
        # Covers: a plain row, a row with no `dur` attribute, and a row whose
        # text is escaped HTML markup. The blank line has no <text> element.
        http_session.get.return_value.text = """
<transcript>
<text start="0.21" dur="1.387">Hey there guys,</text>

<text start="1.597">Lichen&#39; subscribe </text>
<text start="4.327" dur="2.063">
&lt;font color=&quot;#A0AAB4&quot;&gt;Buy my merch!&lt;/font&gt;
</text>
</transcript>
"""
        expected_rows = [
            TranscriptText(text="Hey there guys,", start=0.21, duration=1.387),
            TranscriptText(text="Lichen' subscribe", start=1.597, duration=0.0),
            TranscriptText(text="Buy my merch!", start=4.327, duration=2.063),
        ]

        result = client.get_transcript(track)

        http_session.get.assert_called_once_with(url=track.base_url)
        assert result == Transcript(track=track, text=expected_rows)

    def test_get_transcript_with_no_url(self, client):
        """A caption track without a base URL is rejected with `ValueError`."""
        with pytest.raises(ValueError):
            client.get_transcript(CaptionTrack("en", base_url=None))

    @pytest.fixture
    def client(self):
        """Return the client under test."""
        return YouTubeAPIClient()

    @pytest.fixture
    def Video(self, patch):
        """Return the patched `Video` name used by the client module."""
        # NOTE(review): `patch` is a fixture defined outside this file;
        # presumably it returns the mock that replaces the named object.
        return patch("via.services.youtube_api.client.Video")

    @pytest.fixture(autouse=True)
    def http_session(self, patch):
        """Return the mock standing in for the client's `HTTPService` instance.

        This is `autouse`, so the patch is in place for every test in the
        class, including those that do not request the fixture.
        """
        return patch("via.services.youtube_api.client.HTTPService").return_value
9 changes: 8 additions & 1 deletion via/services/youtube_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
from via.services.youtube_api.models import Captions, CaptionTrack, Video
from via.services.youtube_api.client import YouTubeAPIClient
from via.services.youtube_api.models import (
Captions,
CaptionTrack,
Transcript,
TranscriptText,
Video,
)
79 changes: 79 additions & 0 deletions via/services/youtube_api/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from xml.etree import ElementTree

import requests

from via.services import HTTPService
from via.services.youtube_api.models import (
CaptionTrack,
Transcript,
TranscriptText,
Video,
)


class YouTubeAPIError(Exception):
    """Signals that an interaction with YouTube has gone wrong."""

class YouTubeAPIClient:
    """A client for interacting with YouTube and manipulating related URLs.

    All requests are made through one `HTTPService` wrapping a
    `requests.Session` whose `Accept-Language` header is set to `en-US`.
    """

    def __init__(self):
        session = requests.Session()
        # Ensure any translations that Google provides are in English
        session.headers["Accept-Language"] = "en-US"
        self._http = HTTPService(session=session)

    def get_video_info(self, video_id: str) -> Video:
        """Get information for a given YouTube video.

        :param video_id: The YouTube id of the video to look up
        :return: The `Video` built from the JSON body of the response
        """

        response = self._http.post(
            "https://youtubei.googleapis.com/youtubei/v1/player",
            json={
                "context": {
                    "client": {
                        "hl": "en",
                        "clientName": "WEB",
                        # Suspicious value right here...
                        # NOTE(review): this version string is hard-coded;
                        # presumably it will need refreshing at some point
                        # - confirm what the endpoint accepts.
                        "clientVersion": "2.20210721.00.00",
                    }
                },
                "videoId": video_id,
            },
        )

        return Video.from_v1_json(data=response.json())

    def get_transcript(self, caption_track: CaptionTrack) -> Transcript:
        """Get the transcript associated with a caption track.

        You can set the track `translated_language_code` to ensure we translate
        the value before returning it.

        :param caption_track: The track to fetch; its `base_url` is requested
        :return: A `Transcript` with one `TranscriptText` per element in the
            response that has text. A missing `dur` attribute becomes `0.0`.
        :raises ValueError: If the caption track has no `base_url`
        """

        if not caption_track.base_url:
            raise ValueError("Cannot get a transcript without a URL")

        response = self._http.get(url=caption_track.base_url)
        xml_elements = ElementTree.fromstring(response.text)

        return Transcript(
            track=caption_track,
            text=[
                TranscriptText(
                    text=self._strip_html(xml_element.text),
                    start=float(xml_element.attrib["start"]),
                    duration=float(xml_element.attrib.get("dur", "0.0")),
                )
                for xml_element in xml_elements
                # Elements with no text at all carry nothing to show
                if xml_element.text is not None
            ],
        )

    @staticmethod
    def _strip_html(xml_string):
        """Remove all non-text content from an XML fragment or string.

        The text is parsed as XML first. Caption text has already been
        unescaped once by the time it gets here, so it can legitimately
        contain a bare `&`, a stray `<` or an HTML-only entity, none of which
        are well-formed XML. In that case we fall back to the lenient HTML
        parser rather than failing the whole transcript.

        :param xml_string: The text to clean, with or without markup
        :return: The text content with markup removed and whitespace stripped
        """

        try:
            fragment = ElementTree.fromstring(f"<span>{xml_string}</span>")
        except ElementTree.ParseError:
            # Imported here as this is the only place it is needed
            from html.parser import HTMLParser

            chunks = []
            parser = HTMLParser()
            # Collect only the text runs; tags are ignored by default
            parser.handle_data = chunks.append
            parser.feed(xml_string)
            # Flush any text the parser is still buffering
            parser.close()

            return "".join(chunks).strip()

        return "".join(fragment.itertext()).strip()
17 changes: 17 additions & 0 deletions via/services/youtube_api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,20 @@ def from_v1_json(cls, data):
safe_get(data, ["captions", "playerCaptionsTracklistRenderer"], {})
)
)


@dataclass
class TranscriptText:
    """One timed row of text within a transcript."""

    # The caption text for this row
    text: str

    # Offset at which the row begins (presumably seconds - TODO confirm)
    start: float

    # How long the row lasts, in the same unit as `start`
    duration: float


@dataclass
class Transcript:
    """The complete transcript belonging to one caption track."""

    # The caption track this transcript was produced for
    track: CaptionTrack

    # The rows of the transcript
    text: List[TranscriptText]