Remove pafy dependency

pafy has not had a release since 2019. Removing it should fix download errors such as KeyError: 'like_count'; see mps-youtube/pafy#287.
soundsensing · Apr 16, 2021 · 64f6b76 · 64f6b76
1 parent 33126fe
commit 64f6b76
Show file tree

Hide file tree

Showing 5 changed files with 169 additions and 16 deletions.
diff --git a/download_audioset.py b/download_audioset.py
@@ -17,7 +17,6 @@
 from functools import partial
 
 import multiprocessing_logging
-import pafy
 
 from errors import SubprocessError, FfmpegValidationError, \
                    FfmpegIncorrectDurationError, FfmpegUnopenableFileError
@@ -317,6 +316,110 @@ def ffmpeg(ffmpeg_path, input_path, output_path, input_args=None,
         LOGGER.error(error_msg.format(num_retries, input_path, str(last_err)))
 
 
+
+def get_video_info(url, ydl_opts=None):
+    """Fetch the youtube-dl info dict for a video without downloading it.
+
+    Args:
+        url: URL of the video (or playlist) page to inspect.
+        ydl_opts: Optional dict of options handed to ``youtube_dl.YoutubeDL``.
+            Defaults to an empty dict. Pass debugging options here (for
+            example ``{'verbose': True}``) instead of keeping a second,
+            shadowed copy of this function around.
+
+    Returns:
+        The dict returned by ``extract_info(url, download=False)``. If that
+        result has an ``'entries'`` key (a playlist or a list of videos),
+        the first entry is returned instead.
+    """
+    # Function-level import, as in the original code
+    import youtube_dl
+
+    with youtube_dl.YoutubeDL(ydl_opts or {}) as ydl:
+        result = ydl.extract_info(url, download=False)
+
+    if 'entries' in result:
+        # Can be a playlist or a list of videos
+        return result['entries'][0]
+
+    # Just a video
+    return result
+
+
+def format_is_audio_only(format):
+    """Return True when the format has an audio codec and no video codec."""
+    if format['acodec'] == 'none':
+        return False
+    # No video present: audio only
+    return format['vcodec'] == 'none'
+
+def format_is_video_only(format):
+    """Return True when the format has a video codec and no audio codec."""
+    if format['acodec'] != 'none':
+        return False
+    # No audio present: video only if a video codec is set
+    return format['vcodec'] != 'none'
+
+def format_is_video_with_audio(format):
+    """Return True when the format has both an audio codec and a video codec."""
+    # Neither codec may be the 'none' placeholder
+    return not (format['acodec'] == 'none' or format['vcodec'] == 'none')
+
+def sort_audio_formats(formats, by='abr'):
+    """Return the audio-only formats sorted from highest to lowest ``by`` value.
+
+    Args:
+        formats: Iterable of format dicts; each must have 'acodec' and
+            'vcodec' keys (read by ``format_is_audio_only``).
+        by: Name of the numeric field to rank on. Defaults to 'abr'.
+
+    Returns:
+        A new list holding only the audio-only formats, best first. Formats
+        whose ``by`` field is missing or None are ranked as 0 (last) rather
+        than raising KeyError/TypeError during the sort.
+    """
+    audio_only = (fmt for fmt in formats if format_is_audio_only(fmt))
+    return sorted(audio_only, key=lambda fmt: fmt.get(by) or 0, reverse=True)
+
+def get_best_audio_format(formats):
+    """Return the first entry of ``sort_audio_formats(formats)``.
+
+    Raises:
+        IndexError: If the sorted list of audio-only formats is empty.
+    """
+    return sort_audio_formats(formats)[0]
+
+
+# note: Not all formats carry every bitrate field (e.g. vbr), so a sort field
+# that is missing or None is ranked as 0 instead of crashing the sort.
+def sort_video_formats(formats, with_audio=True, by=('width', 'tbr')):
+    """Return the matching video formats sorted best-first.
+
+    Args:
+        formats: Iterable of format dicts; each must have 'acodec' and
+            'vcodec' keys (read by the filter predicates).
+        with_audio: When True keep formats that have both video and audio;
+            when False keep video-only formats.
+        by: Field names used as the sort key, compared in the given order.
+
+    Returns:
+        A new list of the selected formats in descending order of the
+        ``by`` fields.
+    """
+    pred = format_is_video_with_audio if with_audio else format_is_video_only
+
+    def get_key(fmt):
+        # Missing/None values become 0 so tuples always compare cleanly
+        return tuple(fmt.get(k) or 0 for k in by)
+
+    return sorted(filter(pred, formats), key=get_key, reverse=True)
+
+
+def get_best_video_format(formats, video_mode, sort_by=('width', 'tbr')):
+    """Pick the best video format for the requested video mode.
+
+    Args:
+        formats: Iterable of format dicts (passed to ``sort_video_formats``).
+        video_mode: 'bestvideo', 'bestvideowithaudio' or '' prefer the best
+            video-only format and fall back to the best format that also
+            has audio; 'bestvideoaudio' and 'bestvideoaudionoaudio' only
+            consider formats that have both video and audio.
+        sort_by: Field names used to rank the formats, best first.
+
+    Returns:
+        The best matching format dict.
+
+    Raises:
+        ValueError: If ``video_mode`` is not one of the modes above.
+        IndexError: If no format matches the requested mode.
+    """
+    # '' is handled like 'bestvideo', as the pafy-based code did; returning
+    # None for it made the caller fail when reading the format's 'url'.
+    prefer_video_only = ('bestvideo', 'bestvideowithaudio', '')
+    require_audio = ('bestvideoaudio', 'bestvideoaudionoaudio')
+
+    # Validate first so a bad mode is reported before any sorting is done
+    if video_mode not in prefer_video_only + require_audio:
+        raise ValueError('Invalid video mode: {}'.format(video_mode))
+
+    candidates = []
+    if video_mode in prefer_video_only:
+        candidates = sort_video_formats(formats, with_audio=False, by=sort_by)
+    if not candidates:
+        # Either audio is required, or there isn't a video only option
+        candidates = sort_video_formats(formats, with_audio=True, by=sort_by)
+    if not candidates:
+        raise IndexError(
+            'No video format available for video mode: {!r}'.format(video_mode))
+
+    return candidates[0]
+
+
 def download_yt_video(ytid, ts_start, ts_end, output_dir, ffmpeg_path, ffprobe_path,
                       audio_codec='flac', audio_format='flac',
                       audio_sample_rate=48000, audio_bit_depth=16,
@@ -408,8 +511,13 @@ def download_yt_video(ytid, ts_start, ts_end, output_dir, ffmpeg_path, ffprobe_p
 
     # Get the direct URLs to the videos with best audio and with best video (with audio)
 
-    video = pafy.new(video_page_url)
-    video_duration = video.length
+    url = f'https://www.youtube.com/watch?v={ytid}'
+    video = get_video_info(url)
+
+    video_url = video['url']
+    video_duration = video['duration']
+    print(video['id'], video_duration, video_url)
+
     end_past_video_end = False
     if ts_end > video_duration:
         warn_msg = "End time for segment ({} - {}) of video {} extends past end of video (length {} sec)"
@@ -418,17 +526,9 @@ def download_yt_video(ytid, ts_start, ts_end, output_dir, ffmpeg_path, ffprobe_p
         ts_end = ts_start + duration
         end_past_video_end = True
 
-    if video_mode in ('bestvideo', 'bestvideowithaudio', ''):
-        best_video = video.getbestvideo()
-        # If there isn't a video only option, go with best video with audio
-        if best_video is None:
-            best_video = video.getbest()
-    elif video_mode in ('bestvideoaudio', 'bestvideoaudionoaudio'):
-        best_video = video.getbest()
-    else:
-        raise ValueError('Invalid video mode: {}'.format(video_mode))
-    best_audio = video.getbestaudio()
-    best_video_url = best_video.url
+    best_video = get_best_video_format(video['formats'], video_mode=video_mode)
+    best_audio = get_best_audio_format(video['formats'])
+    best_video_url = best_video['url']
     best_audio_url = best_audio.url
 
     audio_info = {

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,4 @@
 youtube-dl==2017.9.15
-pafy==0.5.3.1
 multiprocessing-logging==0.2.4
 sox==1.3.3
 sk-video==1.1.8

diff --git a/tests/data/69kudlOXwMs.info.json b/tests/data/69kudlOXwMs.info.json
diff --git a/tests/test_download.py b/tests/test_download.py
@@ -0,0 +1,53 @@
+
+import sys
+import os.path
+import json
+
+# Directory containing this test module; also used to locate files under data/
+here = os.path.dirname(__file__)
+
+# Put the parent directory first on sys.path so that download_audioset
+# can be imported from there
+sys.path.insert(0, os.path.join(here, '..'))
+import download_audioset as audiosetdl
+
+
+
+def read_json_file(path):
+    """Open the file at *path* for reading and return its decoded JSON content."""
+    with open(path, 'r') as fh:
+        return json.load(fh)
+
+def test_get_best_audio():
+    """Check that sorting and best-pick agree on the highest audio bitrate."""
+    p = os.path.join(here, 'data/69kudlOXwMs.info.json')
+    info = read_json_file(p)
+    expected_best_bitrate = 130.955
+
+    s = audiosetdl.sort_audio_formats(info["formats"])
+    assert s[0]['abr'] == expected_best_bitrate
+
+    # The best-pick helper was previously only printed, never verified
+    best = audiosetdl.get_best_audio_format(info["formats"])
+    assert best['abr'] == expected_best_bitrate
+
+
+def test_get_best_video():
+    """Check the best video format chosen with and without audio."""
+    info = read_json_file(os.path.join(here, 'data/69kudlOXwMs.info.json'))
+
+    for fmt in info['formats']:
+        print(fmt.get('width'), fmt.get('vbr'), fmt.get('tbr'))
+
+    expected_best_width = 1280
+    expected_tbr_by_mode = (
+        ('bestvideoaudio', 1662.976),  # with audio
+        ('bestvideo', 1530.31),  # without audio
+    )
+
+    for mode, expected_tbr in expected_tbr_by_mode:
+        best = audiosetdl.get_best_video_format(info["formats"], video_mode=mode)
+        assert best['width'] == expected_best_width
+        assert best['tbr'] == expected_tbr
+
+
diff --git a/utils.py b/utils.py
@@ -112,4 +112,4 @@ def get_subset_name(subset_path):
     if ext[1:].isdigit():
         subset_name, file_num = os.path.splitext(subset_name)
 
-    return subset_name
+    return subset_name