desktop/media/tomkv

#!/usr/bin/env python3

import os, sys, json, re, math, time, pathlib as pl, subprocess as sp


class adict(dict):
	'''dict subclass where keys are also accessible as attributes (d.key is d['key']).
		This works by using the dict itself as instance __dict__.'''

	@classmethod
	def rec_make(cls, *args, _rec=id, **kws):
		'''Build adict from dict()-style arguments, converting nested data recursively.
			dicts found inside dicts/lists at any depth are converted to this class,
				lists are rebuilt as new lists, any other values are returned as-is.
			_rec is internal recursion state - (value, ancestor-ids, key) - not for callers.
			Raises ValueError if some dict/list contains itself, directly or indirectly.'''
		# "id" builtin doubles as a sentinel for "top-level call" / "no parent key"
		v, ids, rec_key = (
			_rec if _rec is not id else (dict(*args, **kws), dict(), id) )
		if isinstance(v, (dict, list)):
			# ids maps id(container) -> key it was reached by, for every ancestor of v,
			#  so membership test is by object id, and values spell out the path for error
			if (vid := id(v)) in ids: raise ValueError(
				f'Recursive data at [ {".".join(map(str, ids.values()))} ]' )
			# New mapping per branch, so same object under sibling keys is not an error
			if rec_key is not id: ids = {**ids, vid: rec_key}
			if isinstance(v, dict): v = cls(
				(k, cls.rec_make(_rec=(sub, ids, k))) for k, sub in v.items() )
			elif isinstance(v, list): v = list(
				cls.rec_make(_rec=(sub, ids, f'[{n}]')) for n, sub in enumerate(v) )
		return v

	def __init__(self, *args, **kws):
		'''Accepts same arguments as dict().'''
		super().__init__(*args, **kws)
		self.__dict__ = self

def sz_repr(size, _units=list(
		reversed(list((u, 2 ** (n * 10)) for n, u in enumerate('BKMGT'))) )):
	'''Return short human-readable string for a byte count, e.g. 2048 -> "2.0K".
		Largest 1024-based unit that size strictly exceeds is picked,
			so exact powers of two are shown in the smaller unit.
		Values in B are printed as-is, all others with one decimal digit.
		_units is a precomputed (suffix, multiplier) list, largest unit first.'''
	# Fall back to the last (smallest) unit if size does not exceed any multiplier
	suffix, factor = next(
		((name, mult) for name, mult in _units if size > mult), _units[-1] )
	if factor <= 1: return f'{size}{suffix}'
	return f'{size/factor:.1f}{suffix}'

def td_repr(delta, units_max=2, units_res=None, _units=dict(
		h=3600, m=60, s=1, y=365.2422*86400, mo=30.5*86400, w=7*86400, d=1*86400 )):
	'''Return short human-readable string for a time interval in seconds, e.g. "1h 5m".
		delta - number of seconds, its sign is ignored.
		units_max - max number of units to print, with last one rounded instead of floored.
		units_res - name of the smallest unit to use (key in _units), output stops there.
		Returns "<1s" if no unit ends up with a non-zero value.'''
	parts, left = [], abs(delta)
	# Go from largest unit to smallest, skipping ones that have zero value
	for name, span in sorted(_units.items(), key=lambda kv: -kv[1]):
		count, at_res = math.floor(left / span), units_res == name
		if not count:
			if at_res: break
			continue
		# Last printed unit absorbs the remainder by rounding
		final = at_res or len(parts) == units_max - 1
		if final: count = round(left / span)
		parts.append(f'{count:.0f}{name}')
		if final: break
		left -= count * span
	return ' '.join(parts) or '<1s'


def parse_rgb10_pixfmts():
	'''Return set of pixel format names from "ffmpeg -pix_fmts" table,
			which have "I" as first flag, 3 components,
			<=15 in the fourth column and "10-10-10" in the last one.
		Runs ffmpeg binary from PATH to get that table.
		Raises subprocess.CalledProcessError if ffmpeg exits with error,
			or RuntimeError if its output does not look like the expected 5-column table.'''
	probe = sp.run(['ffmpeg', '-v', 'fatal', '-pix_fmts'], check=True, stdout=sp.PIPE)
	parse, pxfmt_set = False, set()
	for line in probe.stdout.decode().splitlines():
		if not (line := line.strip()): continue
		if not parse:
			# Everything up to and including "-----" separator line is a header
			if line == '-----': parse = True
			continue
		# Only ValueError can come from split-unpacking or int() here,
		#  so anything else (e.g. KeyboardInterrupt) is not masked as a decoding error
		try:
			flags, fmt, nc, nb, cbits = line.split()
			nc, nb = int(nc), int(nb)
		except ValueError as err: raise RuntimeError(
			f'Failed to decode "ffmpeg -pix_fmts" line: {line!r}' ) from err
		if flags[0] != 'I': continue # presumably "supported as input" flag - see ffmpeg legend
		if nc == 3 and nb <= 15 and cbits == '10-10-10': pxfmt_set.add(fmt)
	if not parse: raise RuntimeError('Failed to decode "ffmpeg -pix_fmts" output')
	return pxfmt_set

def src_probe(p):
	'''Run ffprobe on path p and return adict with info needed for conversion.
		Result for a usable file has following keys:
			v - first video stream: c (codec), fps, w, h, pxf (pix_fmt), br (bit_rate or 0)
			a - first audio stream: c (codec), chans
			s - indexes of non-bitmap subtitle streams among all subtitle
				streams, followed by total subtitle count, or empty list if there are none
			ms - True if extra video (except non-first mjpeg) or audio streams are found
			errs - list of adict(t=..., msg=...) issues, td - duration, sz - file size
		For files that cannot be handled, adict(t='format', msg=...) is returned instead.
		Raises subprocess.CalledProcessError if ffprobe fails,
			and other errors if expected keys are missing from its output.'''
	cmd = [ 'ffprobe', '-v', 'fatal', '-show_entries',
		'stream:format', '-print_format', 'json', str(p) ]
	info = adict.rec_make(json.loads(sp.run(cmd, check=True, stdout=sp.PIPE).stdout))
	container = info.format.format_name
	if container in ('ass', 'srt', 'microdvd'):
		return adict(t='format', msg=f'Subtitle file [{container}]')

	video = audio = multistream = None
	sub_count, sub_idx, errs = 0, [], []
	bitmap_subs = 'dvb_subtitle', 'dvd_subtitle', 'hdmv_pgs_subtitle'
	for stream in info.streams:
		kind = stream.get('codec_type')

		if kind == 'video':
			if video:
				# mjpeg is not counted, as long as it's not the first stream
				if stream.codec_name != 'mjpeg': multistream = True
				continue
			rate = stream.get('avg_frame_rate') or 0
			if rate:
				if '/' in rate:
					num, den = map(float, rate.split('/'))
					rate = num and den and num/den # 0 if either part is 0
				else: rate = float(rate)
			video = adict( c=stream.codec_name, fps=rate, w=stream.width,
				h=stream.height, pxf=stream.pix_fmt, br=int(stream.get('bit_rate') or 0) )

		elif kind == 'audio':
			if audio: multistream = True
			else: audio = adict(c=stream.codec_name, chans=stream.channels)

		elif kind == 'subtitle':
			if stream.codec_name in bitmap_subs:
				errs.append(adict(t='subs', msg='Has bitmap subtitles'))
			else: sub_idx.append(sub_count)
			sub_count += 1

	if not (audio and video): return adict(t='format', msg='Missing A/V streams')
	# Total count goes last, to tell whether listed indexes cover all subtitle streams
	if sub_count or sub_idx: sub_idx.append(sub_count)
	return adict( v=video, a=audio, s=sub_idx, ms=multistream,
		errs=errs, td=float(info.format.duration), sz=int(info.format.size) )


def main(args=None):
	'''Script entry point - parse options, probe sources, print or run ffmpeg commands.
		args - list of command-line arguments, sys.argv[1:] is used if it is None.
		Without -x/--convert option only prints ffmpeg commands that would be run.
		Returns None. Exits via argparse error on bad options or missing source paths.
		Raises subprocess.CalledProcessError if ffmpeg run exits with an error.'''
	import argparse, textwrap
	dd = lambda text: re.sub( r' \t+', ' ',
		textwrap.dedent(text).strip('\n') + '\n' ).replace('\t', '  ')
	parser = argparse.ArgumentParser(
		formatter_class=argparse.RawTextHelpFormatter,
		usage='%(prog)s [options] src [src ...]', description=dd('''
			ffprobe-check and convert source file(s)
				to a more compact video format, as needed, into current dir.
			Encodes to av1/opus/mkv, downscaling to ~720p30b10/96k-stereo.
			Initial ffprobe is intended to detect files that might already be
				converted or won't benefit from it as much, and skip those by default,
				as well as any files that can't be handled by this script correctly
				(e.g. ones that have multiple A/V streams or errors of any kind).
			Does not run conversion by default, only prints actions to be done.'''))
	parser.add_argument('src', nargs='+', help='File(s) to convert.')
	parser.add_argument('-x', '--convert', action='store_true', help=dd('''
		Run ffmpeg commands to convert all files not
			listed as PROBLEM or SKIP (unless -f/--force is used).'''))
	parser.add_argument('-f', '--force', action='store_true', help=dd('''
		Also process files marked as SKIP, i.e. ones that don't seem to need it.'''))
	parser.add_argument('-1', '--force-stream1', action='store_true', help=dd('''
		Process files with multiple A/V streams, encoding first stream from those.'''))
	parser.add_argument('-q', '--quiet', action='store_true', help=dd('''
		Don't print any WARN/SKIP info about files that seem to be encoded properly.'''))
	parser.add_argument('-T', '--dst-dir', metavar='path', help=dd('''
		Existing path to store resulting files in. Defaults to current dir.'''))
	parser.add_argument('--name',
		metavar='tpl', default='{name}.mkv', help=dd('''
			Template to rename resulting file(s), instead of default: %(default)s
			Can be used to set non-mkv container format, e.g. mp4.
				ffmpeg auto-detects it from extension, so it must be something conventional.
			Names are deduplicated with number-suffix when multiple sources are used.
			Substituted keys: "name" - source filename without extension.'''))
	parser.add_argument('-r', '--rm-list', metavar='file[:ratio]', help=dd('''
		Generate a list of files to cleanup after conversion, one per line.
		It will have realpath of all source files by default, unless ratio number
			(float in 0-1.0 range) is also specified, colon-separated after filename.
		With ratio number, filename on the list is picked
			from either source or destination after each operation,
			based on resulting filesize difference - source if
			resulting size is smaller than source*ratio, otherwise destination.
		Intended use is to make an easy-to-use list of files to
			rm when replacing old ones with converted versions,
			without unnecessary replacement if there's not enough benefit.
		Specified list file is always overwritten.'''))
	parser.add_argument('-R', '--rm-list-regen', action='store_true', help=dd('''
		When using -n/--skip-n or similar options,
			still check file sizes when they exist, and put them on the list.
		Can be used to make -r/--rm-list with new compression ratio target,
			by re-running script at any time with -n/--skip-n covering processed files.'''))
	parser.add_argument('-n', '--skip-n', metavar='n', type=int, help=dd('''
		Skip first N files that'd have been processed otherwise.
		Can be used to resume a long operation, using number from
			"wc -l" on -r/--rm-list or printed n/m count between/after ffmpeg runs.'''))
	parser.add_argument('-F', '--fn-opts', action='store_true', help=dd('''
		Apply and strip ffmpeg params stored in filenames.
		If filename has ".tomkv<opts>." before file extension, it'll be parsed and removed.
		Where "<opts>" part can have any number of following options, concatenated:
			+ss=<time> - translated to "-ss <time>" for ffmpeg command.
			+to=<time> - "-to <time>" for ffmpeg - stop time to cut video short at.
		Filename example: video.tomkv+ss=1:23+to=4:56.mp4'''))
	opts = parser.parse_args(sys.argv[1:] if args is None else args)

	# Source paths are resolved before chdir below, so that relative ones keep working
	src_list = list()
	for src in opts.src:
		try: src_list.append((src := pl.Path(src), srcx := src.resolve(strict=True)))
		except FileNotFoundError: parser.error(f'Source path missing/inaccessible: {src}')
		if '\n' in f'{src} {srcx}': parser.error(f'Source path with newline in it: {src!r}')
	if opts.dst_dir: os.chdir(opts.dst_dir)
	if rm_list := opts.rm_list:
		# Without ratio, math.inf makes every result count as "improved", listing sources
		rm_list, rm_list_ratio = ( (rm_list, math.inf)
			if ':' not in rm_list else rm_list.rsplit(':', 1) )
		rm_list, rm_list_ratio = open(rm_list, 'w'), float(rm_list_ratio)
	nx = max(0, opts.skip_n or 0)
	pxfmt_set = parse_rgb10_pixfmts()

	## ffprobe checks
	for n, (src, srcx) in enumerate(src_list):
		src_list[n] = None # replaced by probe info below, unless file is not to be processed
		try: p = src_probe(srcx)
		except Exception as err: p = adict( t='probe',
			msg=f'Failed to process media info: [{err.__class__.__name__}] {err}' )
		if p.get('ms') and not opts.force_stream1:
			p = adict(t='format', msg='Multiple A/V streams detected')
		# Parse fnopts, format dst filename
		fn = (opts.name or '{name}.mkv').format(name=src.name.rsplit('.', 1)[0])
		p.fn, p.ext = fn.rsplit('.', 1) if '.' in fn else (fn, '')
		if opts.fn_opts and (m := re.search(r'\.tomkv(\+\S+)?$', p.fn)):
			p.fn, fnopts = fn[:m.start()], m[1] or ''
			# ValueError is raised by dict() for options without "=", or by check below
			try:
				p.opts = dict(opt.split('=', 1) for opt in fnopts.split('+') if opt)
				if set(p.opts) - {'ss', 'to'}: raise ValueError
			except ValueError:
				p = adict(t='file', msg=f'Failed to parse -F/--fn-opts: {fnopts!r}')
		# Last check for any unfixable issues - those have no "errs" list
		if (errs := p.get('errs')) is None:
			print(f'\n{src.name} :: PROBLEM {p.get("t") or "-"} :: {p.msg}'); continue
		# Check for warnings and skippable issues
		p.v.scale, p.v.resample = p.v.w > 1400 or p.v.h > 1500, p.v.fps > 35
		p.v.pxconv = p.v.pxf not in pxfmt_set
		if p.v.c in ['hevc', 'av1'] and not p.v.scale and not p.v.resample and p.v.br < 2.5e6:
			errs.append(adict(t='video', msg=f'Already encoded to <1280x <30fps {p.v.c}'))
		if p.a.c == 'opus' and p.a.chans <= 2:
			errs.append(adict( t='audio', warn=1,
				msg='Already encoded in <200k 2ch opus, will copy it as-is' ))
			p.a.clone = True
		# Last check for non-fatal errors
		if errs:
			# First non-warning issue marks file as SKIP, and is the one to report
			skip = next((e for e in errs if not e.get('warn')), None)
			err = skip or errs[0]
			if not opts.quiet:
				err_verdict = lambda e: 'WARN' if e.get('warn') or opts.force else 'SKIP'
				# Multiple issues are listed under a header line, single one fits on one line
				if len(errs) > 1:
					print(f'\n{src.name} :: {err_verdict(err)}'.rstrip())
					for e in errs: print(f'  {err_verdict(e)} {e.get("t") or "-"} :: {e.msg}')
				else: print(f'\n{src.name} :: {err_verdict(err)} {err.get("t") or "-"} :: {err.msg}')
			if skip and not opts.force: continue
		src_list[n], p.src = p, srcx

	## Deduplication of dst filenames
	# Extension is taken from each file's own info, not from a leftover loop variable,
	#  which can be a PROBLEM entry without "ext", or have different ext with some --name
	dst_name_aliases, dst_name_map = dict(), dict()
	dst_ext = lambda p: p.ext and f'.{p.ext}'
	dst_name_fmt = lambda p: dst_name_aliases.setdefault(p.src, p.fn + dst_ext(p))
	src_list = list(filter(None, src_list))
	for p in sorted(src_list, key=lambda p: (p.fn, str(p.src))):
		dst_name_map.setdefault(dst_name_fmt(p), list()).append(p)
	for dst_name, ps in dst_name_map.items():
		if len(ps) == 1: continue
		nf = str(len(str(len(ps)))) # zero-padding width for number suffix
		for n, p in enumerate(ps, 1):
			dst_name_aliases[p.src] = ('{}.{:0'+nf+'d}{}').format(p.fn, n, dst_ext(p))
	for p in src_list: p.dst, p.tmp = (dst := dst_name_fmt(p)), f'_tmp.{dst}'

	## Main ffmpeg conversion loop
	dry_run, m = not opts.convert, len(src_list)
	nx, ts0 = min(nx, m), time.monotonic()

	sz_src_done = sz_src_proc = sz_dst_done = 0
	def _skipped_stats_catchup(n):
		'Add sizes of already-converted files before number n to totals and rm-list.'
		nonlocal sz_src_done, sz_dst_done
		for nc, pc in enumerate(src_list, 1):
			if nc == n: break
			try: sz_dst = os.stat(pc.dst).st_size
			except FileNotFoundError: continue
			sz_src_done += (sz_src := pc.sz); sz_dst_done += sz_dst
			if rm_list and opts.rm_list_regen:
				improved = sz_dst/sz_src < rm_list_ratio
				print(pc.src if improved else pc.dst, file=rm_list)

	if dry_run: print()
	for n, p in enumerate(src_list, 1):
		filters = list()
		if p.v.resample: filters.append('fps=30')
		if p.v.scale: filters.append(
			"scale='if(gte(iw,ih),min(1280,iw),-2)"
			":if(lt(iw,ih),min(1280,ih),-2)',setsar=1:1" )
		if filters: filters = ['-filter:v', ','.join(filters)]
		if fnopts := p.get('opts'):
			filters = sum(([f'-{k}', v] for k, v in fnopts.items()), []) + filters
		if p.v.pxconv: filters.extend(['-pix_fmt', 'yuv420p10le'])
		if p.a.get('clone'): ac = ['-c:a', 'copy']
		else:
			ac = ['-c:a', 'libopus', '-b:a', '96k']
			if p.a.chans == 6: ac = [ '-filter:a', # -ac2 discards sw channel
				'pan=stereo|c0=0.5*c2+0.707*c0+0.707*c4+0.5*c3'
				'|c1=0.5*c2+0.707*c1+0.707*c5+0.5*c3,volume=2.0' ] + ac
			elif p.a.chans != 2: ac = ['-ac', '2'] + ac
		fmt = ['-movflags', '+faststart'] if p.ext.lower() in ['mp4', 'mov', 'm4v'] else []
		if p.s or p.ms:
			fmt.extend('-map 0:v:0 -map 0:a:0'.split())
			# Last p.s element is a total subtitle count, rest are indexes of usable ones
			if p.s and (sn := (subs := p.s.copy()).pop()):
				if subs == list(range(sn)): fmt.extend(['-map', '0:s']); subs.clear()
				# Separate name is used here to not clobber file counter "n" of the outer loop
				for sub_n in subs: fmt.extend(['-map', f'0:s:{sub_n}'])
		cmd = [ 'ffmpeg', '-hide_banner', '-i', str(p.src), *filters,
			*'-c:v libsvtav1 -preset 5 -crf 38'.split(), *ac, *fmt, '-y', p.tmp ]
		dt, ts1 = time.strftime('%Y-%m-%d %H:%M:%S'), time.monotonic()
		msg = f'\n\n- {dt} --- [ {n} / {m} ] :: {td_repr(p.td)} :: {p.src} -> {p.dst}\n'
		if n == nx and not dry_run: _skipped_stats_catchup(n+1)
		if n <= nx: continue
		if dry_run: msg = msg.strip()
		print(msg); print(' '.join((repr(a) if any( c in a for c in
			r' \'"|*?!&$`{}[];' ) else a) for a in cmd), end='\n\n', flush=True)
		if dry_run: continue

		# Output goes to a temporary name first, so that dst only appears when complete
		sp.run( cmd, check=True,
			env=dict(os.environ, SVT_LOG='2'), stdin=sp.DEVNULL )
		os.rename(p.tmp, p.dst)

		# Stats/rm-list for last processed file
		target = ''
		sz_src_done += (sz_src := p.sz); sz_src_proc += sz_src
		sz_dst_done += (sz_dst := os.stat(p.dst).st_size)
		if rm_list:
			improved = sz_dst/sz_src < rm_list_ratio
			print(p.src if improved else p.dst, file=rm_list, flush=True)
			# Compared by value, as float parsed from ":inf" is not the math.inf object
			if rm_list_ratio != math.inf:
				target = 'better' if improved else 'WORSE'
				target = f' [ {target} than {round(rm_list_ratio*100)}% target ]'
		dt, td = time.strftime('%Y-%m-%d %H:%M:%S'), time.monotonic() - ts1
		print( f'- {dt} --- [ {n} / {m} ] :: {p.dst}'
			f' :: 100% -> {round(100*sz_dst/sz_src)}%{target}'
			f' :: {sz_repr(sz_src)} -> {sz_repr(sz_dst)}'
			f' :: encoded {td_repr(p.td)} in {td_repr(td)}, at {p.td/td:.2f}x speed' )

		# Total stats and estimates, extrapolated from size ratio and speed so far
		sz_src_left = sum(pc.sz for nc, pc in enumerate(src_list, 1) if nc > n)
		sz_dst_left = sz_src_left * (sz_ratio := sz_dst_done/sz_src_done)
		td_left = sz_src_left / (sz_src_proc / (td := time.monotonic() - ts0))
		st = ( f'- --- Processed so far :: {sz_repr(sz_src_done)} ->'
			f' {sz_repr(sz_dst_done)} [ {round(100*sz_ratio)}% ] in {td_repr(td)}' )
		if nx: st += f', with first {nx} file(s) skipped on this run'
		print(st)
		if n == m: print('- --- all done', flush=True)
		else: print(
			f'- --- Left to process :: {m-n} file(s) / {sz_repr(sz_src_left)}'
			f' -> additional ~{sz_repr(sz_dst_left)} in {td_repr(td_left)}'
			f' (est. ~{sz_repr(sz_dst_done+sz_dst_left)} final total)', flush=True )

	if rm_list: rm_list.close()

if __name__ == '__main__':
	# Run as a script - main() return value is passed on as process exit code
	sys.exit(main())