Skip to content

Commit

Permalink
Remove pafy dependency
Browse files Browse the repository at this point in the history
pafy has not had a release since 2019.

Should fix download errors like KeyError 'like_count',
ref mps-youtube/pafy#287
  • Loading branch information
jonnor committed Apr 16, 2021
1 parent 33126fe commit 64f6b76
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 16 deletions.
128 changes: 114 additions & 14 deletions download_audioset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from functools import partial

import multiprocessing_logging
import pafy

from errors import SubprocessError, FfmpegValidationError, \
FfmpegIncorrectDurationError, FfmpegUnopenableFileError
Expand Down Expand Up @@ -317,6 +316,110 @@ def ffmpeg(ffmpeg_path, input_path, output_path, input_args=None,
LOGGER.error(error_msg.format(num_retries, input_path, str(last_err)))



def get_video_info(url, ydl_opts=None):
    """Return the youtube-dl info dict for the video at *url*.

    Only metadata is extracted; no media is downloaded.

    Args:
        url: Page URL of the video (or of a playlist-like page).
        ydl_opts: Optional dict of ``youtube_dl.YoutubeDL`` options. Defaults
            to no options. Useful for debugging, e.g. ``{'verbose': True}``.
            Note that the Python API does not use the command-line flag
            names: ``--cookies`` is ``'cookiefile'`` and ``--print-traffic``
            is ``'debug_printtraffic'``.

    Returns:
        The info dict of the video. When the result contains ``'entries'``
        (a playlist or a list of videos), the first entry is returned.

    Raises:
        IndexError: If the result has an empty ``'entries'`` list.
    """
    import youtube_dl

    # Copy so the caller's dict is never shared with the YoutubeDL instance.
    options = dict(ydl_opts) if ydl_opts else {}
    with youtube_dl.YoutubeDL(options) as ydl:
        result = ydl.extract_info(url, download=False)

    if 'entries' in result:
        # Can be a playlist or a list of videos: use the first one.
        return result['entries'][0]

    # Just a video.
    return result

def format_is_audio_only(format):
    """Return True if *format* has an audio stream and no video stream.

    A stream is considered absent when its codec field is the string
    ``'none'``. A missing codec key is treated as "unknown" (i.e. not
    ``'none'``) instead of raising ``KeyError``, so a format without a
    ``'vcodec'`` key is never classified as audio-only.

    Args:
        format: A format dict with optional ``'acodec'`` / ``'vcodec'`` keys.
    """
    has_audio = format.get('acodec') != 'none'
    no_video = format.get('vcodec') == 'none'  # no video present, audio only
    return has_audio and no_video

def format_is_video_only(format):
    """Return True if *format* has a video stream and no audio stream.

    A stream is considered absent when its codec field is the string
    ``'none'``. A missing codec key is treated as "unknown" (i.e. not
    ``'none'``) instead of raising ``KeyError``, so a format without an
    ``'acodec'`` key is never classified as video-only.

    Args:
        format: A format dict with optional ``'acodec'`` / ``'vcodec'`` keys.
    """
    no_audio = format.get('acodec') == 'none'
    has_video = format.get('vcodec') != 'none'
    return no_audio and has_video

def format_is_video_with_audio(format):
    """Return True if *format* has both a video and an audio stream.

    A stream is considered absent only when its codec field is the string
    ``'none'``. A missing codec key is treated as "unknown" (i.e. not
    ``'none'``) instead of raising ``KeyError``.

    Args:
        format: A format dict with optional ``'acodec'`` / ``'vcodec'`` keys.
    """
    has_audio = format.get('acodec') != 'none'
    has_video = format.get('vcodec') != 'none'
    return has_audio and has_video

def sort_audio_formats(formats, by='abr'):
    """Return the audio-only formats sorted best-first.

    Args:
        formats: Iterable of format dicts.
        by: Name of the numeric field to rank by (default ``'abr'``).

    Returns:
        A new list holding only the formats accepted by
        ``format_is_audio_only``, in descending order of ``by``. Formats
        whose ``by`` field is missing or ``None`` are ranked as 0 instead of
        raising ``KeyError`` / ``TypeError``, so they sort after every format
        with a positive value.
    """
    def sort_key(fmt):
        value = fmt.get(by)
        return 0 if value is None else value

    audio_only = [fmt for fmt in formats if format_is_audio_only(fmt)]
    return sorted(audio_only, key=sort_key, reverse=True)

def get_best_audio_format(formats):
    """Return the top-ranked audio-only format from *formats*.

    The ranking is delegated to ``sort_audio_formats`` with its default
    sort field; the first element of that ranking is returned.

    Raises:
        IndexError: If the ranking is empty.
    """
    return sort_audio_formats(formats)[0]


# note: Not all formats have vbr?
def sort_video_formats(formats, with_audio=True, by=('width', 'tbr')):
    """Return the video formats sorted best-first.

    Args:
        formats: Iterable of format dicts.
        with_audio: If True, keep formats accepted by
            ``format_is_video_with_audio``; otherwise keep formats accepted
            by ``format_is_video_only``.
        by: Names of the numeric fields forming the sort key, most
            significant first.

    Returns:
        A new list of the selected formats in descending order of the
        ``by`` fields. A field that is missing or ``None`` counts as 0
        instead of raising ``KeyError`` / ``TypeError``, since not every
        format carries every field.
    """
    pred = format_is_video_with_audio if with_audio else format_is_video_only

    def sort_key(fmt):
        # `or 0` maps both a missing key and an explicit None to 0.
        return tuple(fmt.get(field) or 0 for field in by)

    return sorted(filter(pred, formats), key=sort_key, reverse=True)


def get_best_video_format(formats, video_mode, sort_by=('width', 'tbr')):
    """Pick the best video format for the requested *video_mode*.

    Args:
        formats: Iterable of format dicts.
        video_mode: One of:
            ``''`` - no video wanted;
            ``'bestvideo'`` / ``'bestvideowithaudio'`` - best video-only
            format, falling back to the best video-with-audio format when no
            video-only format exists;
            ``'bestvideoaudio'`` / ``'bestvideoaudionoaudio'`` - best
            video-with-audio format.
        sort_by: Fields used to rank formats, passed to
            ``sort_video_formats``.

    Returns:
        The selected format dict, or ``None`` when ``video_mode`` is ``''``
        (callers must handle that case before indexing the result).

    Raises:
        ValueError: If ``video_mode`` is not one of the values above.
        IndexError: If no format of the required kind is available.
    """
    # Decide on the mode first so that nothing is sorted when no video is
    # wanted or when the mode is invalid.
    if video_mode == '':
        return None

    if video_mode in ('bestvideo', 'bestvideowithaudio'):
        video_only = sort_video_formats(formats, with_audio=False, by=sort_by)
        if video_only:
            return video_only[0]
        # If there isn't a video only option, go with best video with audio
        return sort_video_formats(formats, with_audio=True, by=sort_by)[0]

    if video_mode in ('bestvideoaudio', 'bestvideoaudionoaudio'):
        return sort_video_formats(formats, with_audio=True, by=sort_by)[0]

    raise ValueError('Invalid video mode: {}'.format(video_mode))

def download_yt_video(ytid, ts_start, ts_end, output_dir, ffmpeg_path, ffprobe_path,
audio_codec='flac', audio_format='flac',
audio_sample_rate=48000, audio_bit_depth=16,
Expand Down Expand Up @@ -408,8 +511,13 @@ def download_yt_video(ytid, ts_start, ts_end, output_dir, ffmpeg_path, ffprobe_p

# Get the direct URLs to the videos with best audio and with best video (with audio)

video = pafy.new(video_page_url)
video_duration = video.length
url = f'https://www.youtube.com/watch?v={ytid}'
video = get_video_info(url)

video_url = video['url']
video_duration = video['duration']
print(video['id'], video_duration, video_url)

end_past_video_end = False
if ts_end > video_duration:
warn_msg = "End time for segment ({} - {}) of video {} extends past end of video (length {} sec)"
Expand All @@ -418,17 +526,9 @@ def download_yt_video(ytid, ts_start, ts_end, output_dir, ffmpeg_path, ffprobe_p
ts_end = ts_start + duration
end_past_video_end = True

if video_mode in ('bestvideo', 'bestvideowithaudio', ''):
best_video = video.getbestvideo()
# If there isn't a video only option, go with best video with audio
if best_video is None:
best_video = video.getbest()
elif video_mode in ('bestvideoaudio', 'bestvideoaudionoaudio'):
best_video = video.getbest()
else:
raise ValueError('Invalid video mode: {}'.format(video_mode))
best_audio = video.getbestaudio()
best_video_url = best_video.url
best_video = get_best_video_format(video['formats'], video_mode=video_mode)
best_audio = get_best_audio_format(video['formats'])
best_video_url = best_video['url']
best_audio_url = best_audio.url

audio_info = {
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
youtube-dl==2017.9.15
pafy==0.5.3.1
multiprocessing-logging==0.2.4
sox==1.3.3
sk-video==1.1.8
Expand Down
1 change: 1 addition & 0 deletions tests/data/69kudlOXwMs.info.json

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions tests/test_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

import sys
import os.path
import json

here = os.path.dirname(__file__)

sys.path.insert(0, os.path.join(here, '..'))
import download_audioset as audiosetdl



def read_json_file(path):
    """Parse the JSON file at *path* and return the decoded data.

    The file is read as UTF-8 rather than with the platform's default
    encoding, so the result does not depend on the machine running the tests.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def test_get_best_audio():
    """The audio-only format with the highest 'abr' is ranked and picked first."""
    p = os.path.join(here, 'data', '69kudlOXwMs.info.json')
    info = read_json_file(p)
    expected_best_bitrate = 130.955

    s = audiosetdl.sort_audio_formats(info["formats"])
    bitrates = [f['abr'] for f in s]

    # The ranking must be in descending bitrate order, best first.
    assert bitrates == sorted(bitrates, reverse=True)
    assert s[0]['abr'] == expected_best_bitrate

    # The convenience accessor must agree with the head of the ranking.
    best = audiosetdl.get_best_audio_format(info["formats"])
    assert best['abr'] == expected_best_bitrate

def test_get_best_video():
    """The best video format is picked both with and without audio."""
    info = read_json_file(os.path.join(here, 'data/69kudlOXwMs.info.json'))

    # Dump the ranking fields of every format to ease debugging on failure.
    for fmt in info['formats']:
        print(fmt.get('width'), fmt.get('vbr'), fmt.get('tbr'))

    expected_best_width = 1280
    expected_tbr_by_mode = (
        ('bestvideoaudio', 1662.976),  # with audio
        ('bestvideo', 1530.31),        # without audio
    )

    for mode, expected_tbr in expected_tbr_by_mode:
        best = audiosetdl.get_best_video_format(info["formats"], video_mode=mode)
        assert best['width'] == expected_best_width
        assert best['tbr'] == expected_tbr

2 changes: 1 addition & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,4 @@ def get_subset_name(subset_path):
if ext[1:].isdigit():
subset_name, file_num = os.path.splitext(subset_name)

return subset_name
return subset_name

0 comments on commit 64f6b76

Please sign in to comment.