Return the first transcript we find, instead of searching specifically for "en"

This paves the way for storing non-"en" transcripts in the DB, should we find them. This includes automatically generated transcripts, other English dialects like "en-gb", and named English transcripts.
hypothesis · Aug 3, 2023 · 963a33c · 963a33c
1 parent c1d64a5
commit 963a33c
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 44 deletions.
diff --git a/tests/factories/transcript.py b/tests/factories/transcript.py
@@ -10,20 +10,6 @@ class Meta:
 
     video_id = Sequence(lambda n: f"video_id_{n}")
     transcript_id = Sequence(lambda n: f"transcript_id_{n}")
-    transcript = [
-        {
-            "text": "[Music]",
-            "start": 0.0,
-            "duration": 7.52,
-        },
-        {
-            "text": "how many of you remember the first time",
-            "start": 5.6,
-            "duration": 4.72,
-        },
-        {
-            "text": "you saw a playstation 1 game if you were",
-            "start": 7.52,
-            "duration": 4.72,
-        },
-    ]
+    transcript = Sequence(
+        lambda n: [{"text": "[Music]", "start": float(n), "duration": float(n + 1)}]
+    )
diff --git a/tests/unit/via/services/youtube_test.py b/tests/unit/via/services/youtube_test.py
@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from io import BytesIO
 from unittest.mock import sentinel
 
@@ -115,11 +116,19 @@ def test_get_transcript(self, db_session, svc, YouTubeTranscriptApi):
             )
         ]
 
-    @pytest.mark.usefixtures("db_session")
-    @pytest.mark.parametrize("transcript__transcript_id", ["en"])
     def test_get_transcript_returns_cached_transcripts(
-        self, transcript, svc, YouTubeTranscriptApi
+        self, db_session, transcript_factory, svc, YouTubeTranscriptApi
     ):
+        # Add our decoy first to check we aren't picking by row number
+        decoy_transcript = transcript_factory(transcript_id="en-us")
+        transcript = transcript_factory(
+            video_id=decoy_transcript.video_id, transcript_id="en"
+        )
+        # Flush to generate created dates and move the decoy to have a later
+        # created date
+        db_session.flush()
+        decoy_transcript.created += timedelta(hours=1)
+
         returned_transcript = svc.get_transcript(transcript.video_id)
 
         YouTubeTranscriptApi.get_transcript.assert_not_called()

diff --git a/via/services/youtube.py b/via/services/youtube.py
@@ -1,7 +1,6 @@
 from urllib.parse import parse_qs, quote_plus, urlparse
 
 from sqlalchemy import select
-from sqlalchemy.exc import NoResultFound
 from youtube_transcript_api import YouTubeTranscriptApi
 
 from via.models import Transcript, Video
@@ -95,30 +94,23 @@ def get_transcript(self, video_id):
         :raise Exception: this method might raise any type of exception that
             YouTubeTranscriptApi raises
         """
-        transcript_id = language_code = "en"
 
-        try:
-            transcript = (
-                self._db.scalars(
-                    select(Transcript).where(
-                        Transcript.video_id == video_id,
-                        Transcript.transcript_id == transcript_id,
-                    )
-                )
-                .one()
-                .transcript
-            )
-        except NoResultFound:
-            transcript = YouTubeTranscriptApi.get_transcript(
-                video_id, languages=(language_code,)
-            )
-            self._db.add(
-                Transcript(
-                    video_id=video_id,
-                    transcript_id=transcript_id,
-                    transcript=transcript,
-                )
-            )
+        # Find the first transcript we can for this video. We don't mind which
+        # one it is
+        if transcript_model := self._db.scalars(
+            select(Transcript)
+            .where(Transcript.video_id == video_id)
+            # Sort by transcript create date to increase the chance we return
+            # the same transcript should we ever get more than one.
+            .order_by(Transcript.created.asc())
+        ).first():
+            return transcript_model.transcript
+
+        # If there is no match, retrieve a transcript from YouTube and store it
+        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=("en",))
+        self._db.add(
+            Transcript(video_id=video_id, transcript_id="en", transcript=transcript)
+        )
 
         return transcript