-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a client for getting data from YouTube
- Loading branch information
Jon Betts
committed
Aug 2, 2023
1 parent
2543c70
commit a507b1e
Showing
4 changed files
with
181 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
from unittest.mock import sentinel | ||
|
||
import pytest | ||
|
||
from via.services.youtube_api import ( | ||
CaptionTrack, | ||
Transcript, | ||
TranscriptText, | ||
YouTubeAPIClient, | ||
) | ||
|
||
|
||
class TestYouTubeAPIClient:
    """Unit tests for `YouTubeAPIClient` with the HTTP service patched out."""

    def test_get_video_info(self, client, http_session, Video):
        video = client.get_video_info("VIDEO_ID")

        # The request payload must match what the client sends, verbatim.
        http_session.post.assert_called_once_with(
            "https://youtubei.googleapis.com/youtubei/v1/player",
            json={
                "context": {
                    "client": {
                        "hl": "en",
                        "clientName": "WEB",
                        # Suspicious value right here...
                        "clientVersion": "2.20210721.00.00",
                    }
                },
                "videoId": "VIDEO_ID",
            },
        )
        response = http_session.post.return_value
        response.json.assert_called_once_with()

        Video.from_v1_json.assert_called_once_with(data=response.json.return_value)
        assert video == Video.from_v1_json.return_value

    def test_get_transcript(self, client, http_session):
        caption_track = CaptionTrack("en", base_url=sentinel.url)
        response = http_session.get.return_value
        # The caption markup in the last entry is *escaped*: the client only
        # reads each element's `.text` and then strips markup from that
        # string. A literal <font> child element would leave `.text` as bare
        # whitespace and the caption would come back empty.
        response.text = """
        <transcript>
            <text start="0.21" dur="1.387">Hey there guys,</text>
            <text start="1.597">Lichen' subscribe </text>
            <text start="4.327" dur="2.063">
                &lt;font color="#A0AAB4"&gt;Buy my merch!&lt;/font&gt;
            </text>
        </transcript>
        """

        transcript = client.get_transcript(caption_track)

        http_session.get.assert_called_once_with(url=caption_track.base_url)
        assert transcript == Transcript(
            track=caption_track,
            text=[
                TranscriptText(text="Hey there guys,", start=0.21, duration=1.387),
                # No `dur` attribute in the XML, so the duration defaults to 0.0
                TranscriptText(text="Lichen' subscribe", start=1.597, duration=0.0),
                TranscriptText(text="Buy my merch!", start=4.327, duration=2.063),
            ],
        )

    def test_get_transcript_with_no_url(self, client):
        with pytest.raises(ValueError):
            client.get_transcript(CaptionTrack("en", base_url=None))

    @pytest.fixture
    def client(self):
        return YouTubeAPIClient()

    @pytest.fixture
    def Video(self, patch):
        # `patch` is not defined in this file; presumably a shared conftest
        # fixture that patches the dotted path and returns the mock.
        return patch("via.services.youtube_api.client.Video")

    @pytest.fixture(autouse=True)
    def http_session(self, patch):
        # Autouse so that no test in this class can reach the real network.
        return patch("via.services.youtube_api.client.HTTPService").return_value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,8 @@ | ||
from via.services.youtube_api.models import Captions, CaptionTrack, Video | ||
from via.services.youtube_api.client import YouTubeAPIClient | ||
from via.services.youtube_api.models import ( | ||
Captions, | ||
CaptionTrack, | ||
Transcript, | ||
TranscriptText, | ||
Video, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
from xml.etree import ElementTree | ||
|
||
import requests | ||
|
||
from via.services import HTTPService | ||
from via.services.youtube_api.models import ( | ||
CaptionTrack, | ||
Transcript, | ||
TranscriptText, | ||
Video, | ||
) | ||
|
||
|
||
class YouTubeAPIError(Exception):
    """Raised when an interaction with YouTube goes wrong."""
|
||
|
||
class YouTubeAPIClient:
    """A client for interacting with YouTube and manipulating related URLs."""

    def __init__(self):
        session = requests.Session()
        # Ensure any translations that Google provides are in English
        session.headers["Accept-Language"] = "en-US"
        self._http = HTTPService(session=session)

    def get_video_info(self, video_id: str) -> Video:
        """Get information for a given YouTube video.

        :param video_id: The YouTube id of the video to look up
        :return: A `Video` built from the JSON body of the player endpoint
        """

        response = self._http.post(
            "https://youtubei.googleapis.com/youtubei/v1/player",
            json={
                "context": {
                    "client": {
                        "hl": "en",
                        "clientName": "WEB",
                        # Suspicious value right here...
                        "clientVersion": "2.20210721.00.00",
                    }
                },
                "videoId": video_id,
            },
        )

        return Video.from_v1_json(data=response.json())

    def get_transcript(self, caption_track: CaptionTrack) -> Transcript:
        """Get the transcript associated with a caption track.

        You can set the track `translated_language_code` to ensure we translate
        the value before returning it.

        :param caption_track: The track to fetch; its `base_url` is requested
        :return: A `Transcript` holding one `TranscriptText` per XML child
            element that has text content
        :raises ValueError: If the caption track has no `base_url`
        """

        if not caption_track.base_url:
            raise ValueError("Cannot get a transcript without a URL")

        response = self._http.get(url=caption_track.base_url)
        # NOTE(review): this parses a remote document with the stdlib parser,
        # which the Python docs flag as unsafe for malicious input - confirm
        # we are happy to trust this endpoint.
        xml_elements = ElementTree.fromstring(response.text)

        return Transcript(
            track=caption_track,
            text=[
                TranscriptText(
                    # The element text can itself carry markup, so strip it
                    text=self._strip_html(xml_element.text),
                    start=float(xml_element.attrib["start"]),
                    # A missing `dur` attribute is treated as zero duration
                    duration=float(xml_element.attrib.get("dur", "0.0")),
                )
                for xml_element in xml_elements
                # Elements without any text content are skipped entirely
                if xml_element.text is not None
            ],
        )

    @staticmethod
    def _strip_html(xml_string):
        """Remove all non-text content from an XML fragment or string.

        The string is wrapped in a dummy element and parsed, which drops any
        tags and decodes any entities it contains. If it is not well-formed
        XML (for example it contains a bare `&` or `<`) it is returned as-is,
        so that a single odd caption cannot break the whole transcript.

        :param xml_string: Text which may contain markup
        :return: The text content with surrounding whitespace removed
        """

        try:
            fragment = ElementTree.fromstring(f"<span>{xml_string}</span>")
        except ElementTree.ParseError:
            # Plain text which merely looks like broken markup
            return xml_string.strip()

        return "".join(fragment.itertext()).strip()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters