Skip to content

Commit

Permalink
Return the first transcript we find, instead of searching specifically for "en"
Browse files Browse the repository at this point in the history

This paves the way for storing non-"en" transcripts in the DB, should we find
them. This includes automatically generated transcripts, other English dialects
like "en-gb", and named English transcripts.
  • Loading branch information
Jon Betts committed Aug 3, 2023
1 parent c1d64a5 commit 963a33c
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 44 deletions.
20 changes: 3 additions & 17 deletions tests/factories/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,6 @@ class Meta:

video_id = Sequence(lambda n: f"video_id_{n}")
transcript_id = Sequence(lambda n: f"transcript_id_{n}")
transcript = [
{
"text": "[Music]",
"start": 0.0,
"duration": 7.52,
},
{
"text": "how many of you remember the first time",
"start": 5.6,
"duration": 4.72,
},
{
"text": "you saw a playstation 1 game if you were",
"start": 7.52,
"duration": 4.72,
},
]
transcript = Sequence(
lambda n: [{"text": "[Music]", "start": float(n), "duration": float(n + 1)}]
)
15 changes: 12 additions & 3 deletions tests/unit/via/services/youtube_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime, timedelta
from io import BytesIO
from unittest.mock import sentinel

Expand Down Expand Up @@ -115,11 +116,19 @@ def test_get_transcript(self, db_session, svc, YouTubeTranscriptApi):
)
]

@pytest.mark.usefixtures("db_session")
@pytest.mark.parametrize("transcript__transcript_id", ["en"])
def test_get_transcript_returns_cached_transcripts(
self, transcript, svc, YouTubeTranscriptApi
self, db_session, transcript_factory, svc, YouTubeTranscriptApi
):
# Add our decoy first to check we aren't picking by row number
decoy_transcript = transcript_factory(transcript_id="en-us")
transcript = transcript_factory(
video_id=decoy_transcript.video_id, transcript_id="en"
)
# Flush to generate created dates and move the decoy to have a later
# created date
db_session.flush()
decoy_transcript.created += timedelta(hours=1)

returned_transcript = svc.get_transcript(transcript.video_id)

YouTubeTranscriptApi.get_transcript.assert_not_called()
Expand Down
40 changes: 16 additions & 24 deletions via/services/youtube.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from urllib.parse import parse_qs, quote_plus, urlparse

from sqlalchemy import select
from sqlalchemy.exc import NoResultFound
from youtube_transcript_api import YouTubeTranscriptApi

from via.models import Transcript, Video
Expand Down Expand Up @@ -95,30 +94,23 @@ def get_transcript(self, video_id):
:raise Exception: this method might raise any type of exception that
YouTubeTranscriptApi raises
"""
transcript_id = language_code = "en"

try:
transcript = (
self._db.scalars(
select(Transcript).where(
Transcript.video_id == video_id,
Transcript.transcript_id == transcript_id,
)
)
.one()
.transcript
)
except NoResultFound:
transcript = YouTubeTranscriptApi.get_transcript(
video_id, languages=(language_code,)
)
self._db.add(
Transcript(
video_id=video_id,
transcript_id=transcript_id,
transcript=transcript,
)
)
# Find the first transcript we can for this video. We don't mind which
# one it is
if transcript_model := self._db.scalars(
select(Transcript)
.where(Transcript.video_id == video_id)
# Sort by transcript create date to increase the chance we return
# the same transcript should we ever get more than one.
.order_by(Transcript.created.asc())
).first():
return transcript_model.transcript

# If there is no match, retrieve a transcript from YouTube and store it
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=("en",))
self._db.add(
Transcript(video_id=video_id, transcript_id="en", transcript=transcript)
)

return transcript

Expand Down

0 comments on commit 963a33c

Please sign in to comment.