From 2e94c7b46e9f088c8bf2db922da2e140ffa396e0 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Wed, 6 Dec 2023 13:25:00 -0800 Subject: [PATCH 01/21] Add validation directory (moved from rms-webtools repo) --- validation/pdsarchives.py | 508 ++++++++ validation/pdschecksums.py | 860 +++++++++++++ validation/pdsdata-sync.sh | 76 ++ validation/pdsdependency.py | 948 ++++++++++++++ validation/pdsindexshelf.py | 494 +++++++ validation/pdsinfoshelf.py | 883 +++++++++++++ validation/pdslinkshelf.py | 1718 +++++++++++++++++++++++++ validation/re-validate.py | 818 ++++++++++++ validation/shelf-consistency-check.py | 87 ++ 9 files changed, 6392 insertions(+) create mode 100755 validation/pdsarchives.py create mode 100755 validation/pdschecksums.py create mode 100755 validation/pdsdata-sync.sh create mode 100755 validation/pdsdependency.py create mode 100755 validation/pdsindexshelf.py create mode 100755 validation/pdsinfoshelf.py create mode 100755 validation/pdslinkshelf.py create mode 100755 validation/re-validate.py create mode 100755 validation/shelf-consistency-check.py diff --git a/validation/pdsarchives.py b/validation/pdsarchives.py new file mode 100755 index 0000000..185aa7e --- /dev/null +++ b/validation/pdsarchives.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsarchives.py library and main program +# +# Syntax: +# pdsarchives.py --task path [path ...] +# +# Enter the --help option to see more information. +################################################################################ + +import sys +import os +import tarfile +import zlib +import argparse + +import pdslogger +import pdsfile + +LOGNAME = 'pds.validation.archives' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ +# General tarfile functions +################################################################################ + +def load_directory_info(pdsdir, limits={'normal':100}, logger=None): + """Generate a list of tuples (abspath, dirpath, nbytes, mod time) recursively + for the given directory tree. + """ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Generating file info', dirpath, limits) + + try: + (tarpath, lskip) = pdsdir.archive_path_and_lskip() + + tuples = [(dirpath, dirpath[lskip:], 0, 0)] + for (path, dirs, files) in os.walk(dirpath): + + # Load files + for file in files: + abspath = os.path.join(path, file) + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + nbytes = os.path.getsize(abspath) + modtime = os.path.getmtime(abspath) + logger.normal('File info generated', abspath) + + tuples.append((abspath, abspath[lskip:], nbytes, modtime)) + + # Load directories + for dir in dirs: + abspath = os.path.join(path, dir) + + if dir.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* directory skipped', abspath) + continue + + if '/.' 
in abspath: # flag invisible files + logger.invisible('Invisible directory', abspath) + + logger.normal('Directory info generated', abspath) + + tuples.append((abspath, abspath[lskip:], 0, 0)) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return tuples + +################################################################################ + +def read_archive_info(tarpath, limits={'normal':100}, logger=None): + """Return a list of tuples (abspath, dirpath, nbytes, modtime) from a .tar.gz + file.""" + + tarpath = os.path.abspath(tarpath) + pdstar = pdsfile.PdsFile.from_abspath(tarpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdstar.root_) + logger.open('Reading archive file', tarpath, limits=limits) + + try: + (dirpath, prefix) = pdstar.dirpath_and_prefix_for_archive() + + tuples = [] + with tarfile.open(tarpath, 'r:gz') as f: + + members = f.getmembers() + for member in members: + abspath = os.path.join(prefix, member.name) + + if abspath.endswith('/.DS_Store'): # skip .DS_Store files + logger.error('.DS_Store in tarfile', abspath) + + if '/._' in abspath: # skip dot-underscore files + logger.error('._* file in tarfile', abspath) + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file found', abspath) + + if member.isdir(): + tuples.append((abspath, member.name, 0, 0)) + else: + tuples.append((abspath, member.name, member.size, + member.mtime)) + + logger.normal('Info read', abspath) + + except (zlib.error, Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return tuples + +################################################################################ + +def write_archive(pdsdir, clobber=True, archive_invisibles=True, + limits={'normal':-1, 'dot_':100}, logger=None): + """Write an archive file containing all the files in the directory.""" + + def archive_filter(member): + """Internal function to filter filenames""" + + # Erase user info + member.uid = member.gid = 0 + member.uname = member.gname = "root" + + # Check for valid file names + basename = os.path.basename(member.name) + if basename == '.DS_Store': + logger.ds_store('.DS_Store file skipped', member.name) + return None + + if basename.startswith('._') or '/._' in member.name: + logger.dot_underscore('._* file skipped', member.name) + return None + + if basename.startswith('.') or '/.' 
in member.name: + if archive_invisibles: + logger.invisible('Invisible file archived', member.name) + return member + else: + logger.invisible('Invisible file skipped', member.name) + return None + + logger.normal('File archived', member.name) + return member + + #### Begin active code + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing .tar.gz file for', dirpath, limits=limits) + + try: + (tarpath, lskip) = pdsdir.archive_path_and_lskip() + + # Create parent directory if necessary + parent = os.path.split(tarpath)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + if not clobber and os.path.exists(tarpath): + logger.error('Archive file already exists', tarpath) + return + + f = tarfile.open(tarpath, mode='w:gz') + f.add(dirpath, arcname=dirpath[lskip:], recursive=True, + filter=archive_filter) + logger.normal('Written', tarpath) + f.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_tuples(dir_tuples, tar_tuples, limits={'normal':100}, logger=None): + """Validate the directory list of tuples against the list from the tarfile. + """ + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.open('Validating file information', limits=limits) + + valid = True + try: + tardict = {} + for (abspath, dirpath, nbytes, modtime) in tar_tuples: + tardict[abspath] = (dirpath, nbytes, modtime) + + for (abspath, dirpath, nbytes, modtime) in dir_tuples: + if abspath not in tardict: + logger.error('Missing from tar file', abspath) + valid = False + + elif (dirpath, nbytes, modtime) != tardict[abspath]: + + if nbytes != tardict[abspath][1]: + logger.error('Byte count mismatch: ' + + '%d (filesystem) vs. %d (tarfile)' % + (nbytes, tardict[abspath][1]), abspath) + valid = False + + if abs(modtime - tardict[abspath][2]) > 1: + logger.error('Modification time mismatch: ' + + '%s (filesystem) vs. 
%s (tarfile)' % + (modtime, tardict[abspath][2]), abspath) + valid = False + + del tardict[abspath] + + else: + logger.normal('Validated', dirpath) + del tardict[abspath] + + keys = list(tardict.keys()) + keys.sort() + for abspath in keys: + logger.error('Missing from directory', abspath) + valid = False + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + logger.close() + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, logger=None): + write_archive(pdsdir, clobber=False, logger=logger) + return True + +def reinitialize(pdsdir, logger=None): + write_archive(pdsdir, clobber=True, logger=logger) + return True + +def validate(pdsdir, logger=None): + + dir_tuples = load_directory_info(pdsdir, logger=logger) + + tarpath = pdsdir.archive_path_and_lskip()[0] + tar_tuples = read_archive_info(tarpath, logger=logger) + + return validate_tuples(dir_tuples, tar_tuples, logger=logger) + +def repair(pdsdir, logger=None): + + tarpath = pdsdir.archive_path_and_lskip()[0] + if not os.path.exists(tarpath): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Archive file does not exist; initializing', tarpath) + initialize(pdsdir, logger=logger) + return True + + tar_tuples = read_archive_info(tarpath, logger=logger) + dir_tuples = load_directory_info(pdsdir, logger=logger) + + # Compare + dir_tuples.sort() + tar_tuples.sort() + canceled = (dir_tuples == tar_tuples) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Files match; repair canceled', tarpath) + return False + + # Overwrite tar file if necessary + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('Discrepancies found; writing new file', tarpath) + write_archive(pdsdir, clobber=True, logger=logger) + return True + +def update(pdsdir, logger=None): + + tarpath = pdsdir.archive_path_and_lskip()[0] + + if os.path.exists(tarpath): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('Archive file exists; skipping', tarpath) + return False + + # Write tar file if necessary + write_archive(pdsdir, clobber=True, logger=logger) + return True + +################################################################################ +# Executable program +################################################################################ + +if __name__ == '__main__': + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsarchives: Create, maintain and validate .tar.gz ' + + 'archives of PDS volume directory trees.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create a .tar.gz archive for a volume. Abort ' + + 'if the archive already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create a .tar.gz archive for a volume. Replace ' + + 'the archive if it already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every file in a volume against the ' + + 'contents of its .tar.gz archive. 
Files match ' + + 'if they have identical byte counts and ' + + 'modification dates; file contents are not ' + + 'compared.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every file in a volume against the ' + + 'contents of its .tar.gz archive. If any file ' + + 'has changed, write a new archive.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a volume set directory for any new ' + + 'volumes and create a new archive file for each ' + + 'of them; do not update any pre-existing archive '+ + 'files.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root of the volume or volume ' + + 'set. For a volume set, all the volume ' + + 'directories inside it are handled in sequence.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdsarchives" subdirectory of each log root ' + + 'directory.' + ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsarchives error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.PdsFile.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsarchives') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of pdsfiles for volume directories + pdsdirs = [] + for path in args.volume: + + path = os.path.abspath(path) + if not os.path.exists(path): + print('No such file or directory: ' + path) + sys.exit(1) + + pdsf = pdsfile.PdsFile.from_abspath(path) + if pdsf.checksums_: + print('No archives for checksum files: ' + path) + sys.exit(1) + + if pdsf.archives_: + print('No archives for archive files: ' + path) + sys.exit(1) + + pdsdir = pdsf.volume_pdsfile() + if pdsdir and pdsdir.isdir: + pdsdirs.append(pdsdir) + else: + pdsdir = pdsf.volset_pdsfile() + children = [pdsdir.child(c) for c in pdsdir.childnames] + pdsdirs += [c for c in children if c.isdir] + # "if c.isdir" is False for volset level readme files + + # Begin logging and loop through pdsdirs... 
+ logger.open(' '.join(sys.argv)) + try: + for pdsdir in pdsdirs: + + # Save logs in up to two places + logfiles = set([pdsdir.log_path_for_volume('_links', + task=args.task, + dir='pdsarchives'), + pdsdir.log_path_for_volume('_links', + task=args.task, + dir='pdsarchives', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(pdsdirs) > 1: + logger.blankline() + + logger.open('Task %s for' % args.task, pdsdir.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + proceed = initialize(pdsdir) + + elif args.task == 'reinitialize': + proceed = reinitialize(pdsdir) + + elif args.task == 'validate': + proceed = validate(pdsdir) + + elif args.task == 'repair': + proceed = repair(pdsdir) + + else: # update + proceed = update(pdsdir) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + status = 1 + proceed = False + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + status = 1 + + sys.exit(status) diff --git a/validation/pdschecksums.py b/validation/pdschecksums.py new file mode 100755 index 0000000..382e3b2 --- /dev/null +++ b/validation/pdschecksums.py @@ -0,0 +1,860 @@ +#!/usr/bin/env python3 +################################################################################ +# pdschecksums.py library and main program +# +# Syntax: +# pdschecksums.py --task path [path ...] +# +# Enter the --help option to see more information. +################################################################################ + +import argparse +import datetime +import glob +import hashlib +import os +import shutil +import sys + +import pdslogger +import pdsfile + +# Holds log file directories temporarily, used by move_old_checksums() +LOGDIRS = [] + +LOGNAME = 'pds.validation.checksums' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ + +# From http://stackoverflow.com/questions/3431825/- +# generating-an-md5-checksum-of-a-file + +def hashfile(fname, blocksize=65536): + f = open(fname, 'rb') + hasher = hashlib.md5() + buf = f.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = f.read(blocksize) + return hasher.hexdigest() + +################################################################################ + +def generate_checksums(pdsdir, selection=None, oldpairs=[], regardless=True, + limits={'normal':-1}, logger=None): + """Generate a list of tuples (abspath, checksum) recursively from the given + directory tree. + + If a selection is specified, it is interpreted as the basename of a file, + and only that file is processed. + + The optional oldpairs is a list of (abspath, checksum) pairs. For any file + that already has a checksum in the shortcut list, the checksum is copied + from this list rather than re-calculated. This list is merged with the + selection if a selection is identified. 
+ + If regardless is True, then the checksum of a selection is calculated + regardless of whether it is already in abspairs. + + Also return the latest modification date among all the files checked. + """ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Generating MD5 checksums', dirpath, limits=limits) + + latest_mtime = 0. + try: + md5_dict = {} + for (abspath, hex) in oldpairs: + md5_dict[abspath] = hex + + newtuples = [] + for (path, dirs, files) in os.walk(dirpath): + for file in files: + abspath = os.path.join(path, file) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + + if selection and file != selection: + continue + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + if regardless and selection: + md5 = hashfile(abspath) + newtuples.append((abspath, md5, file)) + logger.normal('Selected MD5=%s' % md5, abspath) + + elif abspath in md5_dict: + newtuples.append((abspath, md5_dict[abspath], file)) + logger.debug('MD5 copied', abspath) + + else: + md5 = hashfile(abspath) + newtuples.append((abspath, md5, file)) + logger.normal('MD5=%s' % md5, abspath) + + if selection: + if len(newtuples) == 0: + logger.error('File selection not found', selection) + return ({}, latest_mtime) + + if len(newtuples) > 1: + logger.error('Multiple copies of file selection found', + selection) + return ({}, latest_mtime) + + # Add new values to dictionary + for (abspath, md5, _) in newtuples: + md5_dict[abspath] = md5 + + # Restore original order, old keys then new + old_keys = [p[0] for p in oldpairs] + + newpairs = [] + for key in old_keys: + newpairs.append((key, md5_dict[key])) + del md5_dict[key] + + for (key, new_md5, new_file) in newtuples: + if key in md5_dict: # if not already copied to list of pairs + newpairs.append((key, md5_dict[key])) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return (newpairs, latest_mtime) + +################################################################################ + +def read_checksums(check_path, selection=None, limits={}, logger=None): + + """Return a list of tuples (abspath, checksum) from a checksum file. 
+ + If a selection is specified, then only the checksum with this file name + is returned.""" + + check_path = os.path.abspath(check_path) + pdscheck = pdsfile.PdsFile.from_abspath(check_path) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdscheck.root_) + logger.open('Reading MD5 checksums', check_path, limits=limits) + + try: + logger.info('MD5 checksum file', check_path) + + if not os.path.exists(check_path): + logger.error('MD5 checksum file not found', check_path) + return [] + + prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1] + + # Read the pairs + abspairs = [] + with open(check_path, 'r') as f: + for rec in f: + hexval = rec[:32] + filepath = rec[34:].rstrip() + + if selection and os.path.basename(filepath) != selection: + continue + + basename = os.path.basename(filepath) + if basename == '.DS_Store': + logger.error('.DS_Store found in checksum file', filepath) + continue + + if basename.startswith('._'): + logger.error('._* file found in checksum file', filepath) + continue + + if basename[0] == '.': + logger.invisible('Checksum for invisible file', filepath) + + abspairs.append((prefix_ + filepath, hexval)) + logger.debug('Read', filepath) + + if selection and len(abspairs) == 0: + logger.error('File selection not found', selection) + return [] + + except Exception as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return abspairs + +################################################################################ + +def checksum_dict(dirpath, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.info('Loading checksums for', dirpath, force=True) + + check_path = pdsdir.checksum_path_and_lskip()[0] + abspairs = read_checksums(check_path, logger=logger) + + pair_dict = {} + for (abspath, checksum) in abspairs: + pair_dict[abspath] = checksum + + logger.info('Checksum load completed', dirpath, force=True) + return pair_dict + +################################################################################ + +def write_checksums(check_path, abspairs, + limits={'dot_':-1, 'ds_store':-1, 'invisible':100}, + logger=None): + """Write a checksum table containing the given pairs (abspath, checksum).""" + + check_path = os.path.abspath(check_path) + pdscheck = pdsfile.PdsFile.from_abspath(check_path) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdscheck.root_) + logger.open('Writing MD5 checksums', check_path, limits=limits) + + try: + # Create parent directory if necessary + parent = os.path.split(check_path)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1] + lskip = len(prefix_) + + # Write file + f = open(check_path, 'w') + for pair in abspairs: + (abspath, hex) = pair + + if abspath.endswith('/.DS_Store'): # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if '/._' in abspath: # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' 
in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + f.write('%s %s\n' % (hex, abspath[lskip:])) + logger.debug('Written', abspath) + + f.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_pairs(pairs1, pairs2, selection=None, limits={}, logger=None): + """Validate the first checksum list against the second. + + If a selection is specified, only a file with that basename is checked.""" + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.open('Validating checksums', limits=limits) + + success = True + try: + md5_dict = {} + for (abspath, hex) in pairs2: + md5_dict[abspath] = hex + + for (abspath, hex) in pairs1: + if selection and selection != os.path.basename(abspath): + continue + + if abspath not in md5_dict: + logger.error('Missing checksum', abspath) + success = False + + elif hex != md5_dict[abspath]: + del md5_dict[abspath] + logger.error('Checksum mismatch', abspath) + success = False + + else: + del md5_dict[abspath] + logger.normal('Validated', abspath) + + if not selection: + abspaths = list(md5_dict.keys()) + abspaths.sort() + for abspath in abspaths: + logger.error('Extra file', abspath) + success = False + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + logger.close() + return success + +################################################################################ + +def move_old_checksums(check_path, logger=None): + """Appends a version number to an existing checksum file and moves it to + the associated log directory.""" + + if not os.path.exists(check_path): return + + check_basename = os.path.basename(check_path) + (check_prefix, check_ext) = os.path.splitext(check_basename) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + from_logged = False + for log_dir in LOGDIRS: + dest_template = log_dir + '/' + check_prefix + '_v???' 
+ check_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(check_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(check_path, dest) + + if not from_logged: + logger.info('Checksum file moved from: ' + check_path) + from_logged = True + + logger.info('Checksum file moved to', dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file does not exist + if os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Checksum file already exists', check_path) + return False + + # Check selection + if selection: + raise ValueError('File selection is disallowed for task ' + + '"initialize": ' + selection) + + # Generate checksums + (pairs, _) = generate_checksums(pdsdir, logger=logger) + if not pairs: + return False + + # Write new checksum file + write_checksums(check_path, pairs, logger=logger) + return True + +def reinitialize(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Warn if checksum file does not exist + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Re-initialize just the selection; preserve others + if selection: + oldpairs = read_checksums(check_path, logger=logger) + if not oldpairs: + return False + else: + oldpairs = [] + + # Generate new checksums + (pairs, _) = generate_checksums(pdsdir, selection, oldpairs, + regardless=True, logger=logger) + if not pairs: + return False + + # Write new checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, pairs, logger=logger) + return True + +def validate(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Checksum file does not exist', check_path) + return False + + # Read checksum file + md5pairs = read_checksums(check_path, selection, logger=logger) + if not md5pairs: + return False + + # Generate checksums + (dirpairs, _) = generate_checksums(pdsdir, selection, logger=logger) + if not dirpairs: + return False + + # Validate + return validate_pairs(dirpairs, md5pairs, selection, logger=logger) + +def repair(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Read checksums file + md5pairs = read_checksums(check_path, logger=logger) 
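+    # read_checksums() returns an empty list when the checksum file holds no
+    # usable entries, in which case there is nothing to compare against.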
+ if not md5pairs: + return False + + # Generate new checksums + if selection: + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, selection, md5pairs, + regardless=True, logger=logger) + else: + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, logger=logger) + + if not dirpairs: + return False + + # Compare checksums + md5pairs.sort() + dirpairs.sort() + canceled = (dirpairs == md5pairs) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + check_mtime = os.path.getmtime(check_path) + if latest_mtime > check_mtime: + logger.info('!!! Checksum file content is up to date', + check_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + check_mtime = os.path.getmtime(check_path) + dt = datetime.datetime.fromtimestamp(check_mtime) + logger.info('!!! Checksum file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - check_mtime + if delta >= 86400/10: + logger.info('!!! Checksum file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Checksum file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(check_path) + logger.info('!!! Time tag on checksum file set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + else: + logger.info('!!! Checksum file is up to date; repair canceled', + check_path, force=True) + return True + + # Write checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, dirpairs, logger=logger) + return True + +def update(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Read checksums file + md5pairs = read_checksums(check_path, logger=logger) + if not md5pairs: + return False + + # Generate new checksums if necessary + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, selection, md5pairs, + regardless=False, logger=logger) + if not dirpairs: + return False + + # Compare checksums + md5pairs.sort() + dirpairs.sort() + canceled = (dirpairs == md5pairs) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Checksum file content is complete; update canceled', + check_path) + return True + + # Write checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, dirpairs, logger=logger) + return True + +################################################################################ +# Executable program +################################################################################ + +if __name__ == '__main__': + + # Set up parser + parser = argparse.ArgumentParser( + description='pdschecksums: Create, maintain and validate MD5 ' + + 'checksum files for PDS volumes and volume sets.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an MD5 checksum file for a volume or ' + + 'volume set. 
Abort if the checksum file ' + + 'already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an MD5 checksum file for a volume or ' + + 'volume set. Replace the checksum file if it ' + + 'already exists. If a single file is specified, ' + + 'such as one archive file in a volume set, only ' + + 'single checksum is re-initialized.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every file in a volume directory tree ' + + 'against its MD5 checksum. If a single file ' + + 'is specified, such as one archive file in a ' + + 'volume set, only that single checksum is ' + + 'validated.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every file in a volume directory tree ' + + 'against its MD5 checksum. If any disagreement ' + + 'is found, the checksum file is replaced; ' + + 'otherwise it is unchanged. If a single file is ' + + 'specified, such as one archive file of a ' + + 'volume set, then only that single checksum is ' + + 'repaired. If any of the files checked are newer' + + 'than the checksum file, update shelf file\'s ' + + 'modification date') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their MD5 checksums to the checksum file. ' + + 'Checksums of pre-existing files are not checked.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root directory of a volume or ' + + 'volume set. For a volume set, all the volume ' + + 'directories inside it are handled in sequence. ' + + 'Note that, for archive directories, checksums ' + + 'are grouped into one file for the entire ' + + 'volume set.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". 
Logs are created inside the ' + + '"pdschecksums" subdirectory of each log root ' + + 'directory.') + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + parser.add_argument('--archives', '-a', default=False, action='store_true', + help='Instead of referring to a volume, refer to the ' + + 'the archive file for that volume.') + + parser.add_argument('--infoshelf', '-i', dest='infoshelf', + default=False, action='store_true', + help='After a successful run, also execute the ' + + 'equivalent pdsinfoshelf command.') + + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdschecksums error: Missing task') + sys.exit(1) + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.PdsFile.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdschecksums') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Prepare the list of paths + abspaths = [] + for path in args.volume: + + # Make sure path makes sense + path = os.path.abspath(path) + parts = path.partition('/holdings/') + if not parts[1]: + print('Not a holdings subdirectory: ' + path) + sys.exit(1) + + if parts[2].startswith('checksums-'): + print('No checksums for checksum files: ' + path) + sys.exit(1) + + # Convert to an archives path if necessary + if args.archives and not parts[2].startswith('archives-'): + path = parts[0] + '/holdings/archives-' + parts[2] + + # Convert to a list of absolute paths that exist (volsets or volumes) + try: + pdsf = pdsfile.PdsFile.from_abspath(path, must_exist=True) + abspaths.append(pdsf.abspath) + + except (ValueError, IOError): + # Allow a volume name to stand in for a .tar.gz archive + (dir, basename) = os.path.split(path) + pdsdir = pdsfile.PdsFile.from_abspath(dir) + if pdsdir.archives_ and '.' not in basename: + if pdsdir.voltype_ == 'volumes/': + basename += '.tar.gz' + else: + basename += '_%s.tar.gz' % pdsdir.voltype_[:-1] + + newpaths = glob.glob(os.path.join(dir, basename)) + if len(newpaths) == 0: + raise + + abspaths += newpaths + continue + else: + raise + + # Generate a list of tuples (pdsfile, selection) + info = [] + for path in abspaths: + pdsf = pdsfile.PdsFile.from_abspath(path) + + if pdsf.is_volset_dir: + # Archive directories are checksumed by volset + if pdsf.archives_: + info.append((pdsf, None)) + + # Others are checksumed by volume + else: + children = [pdsf.child(c) for c in pdsf.childnames] + info += [(c, None) for c in children if c.isdir] + # "if c.isdir" is False for volset level readme files + + elif pdsf.is_volume_dir: + # Checksum one volume + info.append((pdsf, None)) + + elif pdsf.isdir: + print('Invalid directory for checksumming: ' + pdsf.logical_path) + sys.exit(1) + + else: + pdsdir = pdsf.parent() + if pdsf.is_volume_file: + # Checksum one archive file + info.append((pdsdir, pdsf.basename)) + elif pdsdir.is_volume_dir: + # Checksum one top-level file in volume + info.append((pdsdir, pdsf.basename)) + else: + print('Invalid file for checksumming: ' + pdsf.logical_path) + sys.exit(1) + + # Begin logging and loop through tuples... 
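+    # The loop below handles each (volume, selection) pair in turn. LOGDIRS
+    # is rebuilt for every pair so that move_old_checksums() files any
+    # superseded checksum tables alongside that pair's logs.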
+ logger.open(' '.join(sys.argv)) + try: + for (pdsdir, selection) in info: + path = pdsdir.abspath + + if selection: + pdsf = pdsdir.child(os.path.basename(selection)) + else: + pdsf = pdsdir + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Save logs in up to two places + if pdsf.volname: + logfiles = set([pdsf.log_path_for_volume('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_volume('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) + else: + logfiles = set([pdsf.log_path_for_volset('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_volset('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_checksums() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(info) > 1: + logger.blankline() + + if selection: + logger.open('Task "' + args.task + '" for selection ' + + selection, path, handler=local_handlers) + else: + logger.open('Task "' + args.task + '" for', path, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + proceed = initialize(pdsdir, selection) + + elif args.task == 'reinitialize': + if selection: # don't erase everything else! + proceed = update(pdsdir, selection) + else: + proceed = reinitialize(pdsdir, selection) + + elif args.task == 'validate': + proceed = validate(pdsdir, selection) + + elif args.task == 'repair': + proceed = repair(pdsdir, selection) + + else: # update + proceed = update(pdsdir, selection) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + proceed = False + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + proceed = False + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + proceed = False + + # If everything went well, execute pdsinfoshelf too + if proceed and args.infoshelf: + new_list = [a.replace('pdschecksums', 'pdsinfoshelf') for a in sys.argv] + new_list = [a for a in new_list if a not in ('--infoshelf', '-i')] + status = os.system(' '.join(new_list)) + sys.exit(status) + + else: + sys.exit(1) + diff --git a/validation/pdsdata-sync.sh b/validation/pdsdata-sync.sh new file mode 100755 index 0000000..cf1cf44 --- /dev/null +++ b/validation/pdsdata-sync.sh @@ -0,0 +1,76 @@ +#! /bin/zsh +################################################################################ +# Synchronize one volume set from one pdsdata drive to another. +# +# Usage: +# pdsdata-sync [--dry-run] +# +# Syncs the specified volume set from the drive /Volumes/pdsdata- +# to the drive /Volumes/pdsdata-. Append "--dry-run" for a test dry run. +# +# Example: +# pdsdata-sync admin raid45 VGx_9xxx +# copies all files relevant to the volume set "VGx_9xxx" from the drive +# pdsdata-admin to the drive pdsdata-raid45. 
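+#
+# Arguments: $1 and $2 are the source and destination drive suffixes
+# (pdsdata-$1 -> pdsdata-$2); $3 is the volume set name; an optional $4
+# (e.g. "--dry-run") is passed through to every rsync call.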
+################################################################################ + +echo "\n\n**** holdings/_volinfo/$3.txt ****" +rsync -av --include="$3.txt" --exclude="*" \ + /Volumes/pdsdata-$1/holdings/_volinfo/ \ + /Volumes/pdsdata-$2/holdings/_volinfo/ $4 + +echo "\n\n**** holdings/documents/$3 ****" +rsync -av \ + /Volumes/pdsdata-$1/holdings/documents/$3/ \ + /Volumes/pdsdata-$2/holdings/documents/$3/ $4 + +for voltype in metadata previews calibrated diagrams volumes +do + if [ -d /Volumes/pdsdata-$1/holdings/$voltype/$3 ]; then + echo "\n\n**** holdings/archives-$voltype/$3 ****" + rsync -av /Volumes/pdsdata-$1/holdings/archives-$voltype/$3/ \ + /Volumes/pdsdata-$2/holdings/archives-$voltype/$3/ $4 + + echo "\n\n**** holdings/checksums-$voltype/$3 ****" + rsync -av /Volumes/pdsdata-$1/holdings/checksums-$voltype/$3/ \ + /Volumes/pdsdata-$2/holdings/checksums-$voltype/$3/ $4 + + echo "\n\n**** holdings/checksums-archives-$voltype/$3_*md5.txt ****" + rsync -av --include="$3_md5.txt" --include="$3_${voltype}_md5.txt" \ + --exclude="*" \ + /Volumes/pdsdata-$1/holdings/checksums-archives-$voltype/ \ + /Volumes/pdsdata-$2/holdings/checksums-archives-$voltype/ $4 + + echo "\n\n**** holdings/_infoshelf-$voltype/$3 ****" + rsync -av /Volumes/pdsdata-$1/holdings/_infoshelf-$voltype/$3/ \ + /Volumes/pdsdata-$2/holdings/_infoshelf-$voltype/$3/ $4 + + echo "\n\n**** holdings/_infoshelf-archives-$voltype/$3_info.py ****" + rsync -av --include="$3_info.py" --include="$3_info.pickle" \ + --exclude="*" \ + /Volumes/pdsdata-$1/holdings/_infoshelf-archives-$voltype/ \ + /Volumes/pdsdata-$2/holdings/_infoshelf-archives-$voltype/ $4 + + if [ -d /Volumes/pdsdata-$1/holdings/_linkshelf-$voltype ]; then + echo "\n\n**** holdings/_linkshelf-$voltype/$3 ****" + rsync -av /Volumes/pdsdata-$1/holdings/_linkshelf-$voltype/$3/ \ + /Volumes/pdsdata-$2/holdings/_linkshelf-$voltype/$3/ $4 + fi + + if [ -d /Volumes/pdsdata-$1/holdings/_indexshelf-$voltype ]; then + echo "\n\n**** holdings/_indexshelf-$voltype/$3 ****" + rsync -av /Volumes/pdsdata-$1/holdings/_indexshelf-$voltype/$3/ \ + /Volumes/pdsdata-$2/holdings/_indexshelf-$voltype/$3/ $4 + fi + + echo "\n\n**** holdings/$voltype/$3 ****" + rsync -av /Volumes/pdsdata-$1/holdings/$voltype/$3/ \ + /Volumes/pdsdata-$2/holdings/$voltype/$3/ $4 + + fi +done + +################################################################################ + +################################################################################ + diff --git a/validation/pdsdependency.py b/validation/pdsdependency.py new file mode 100755 index 0000000..2ec9065 --- /dev/null +++ b/validation/pdsdependency.py @@ -0,0 +1,948 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsdependency.py library and main program +# +# Syntax: +# pdsdependency.py volume_path [volume_path ...] +# +# Enter the --help option to see more information. +################################################################################ + +import sys +import os +import glob +import re +import argparse + +import pdslogger +import pdsfile +import translator + +LOGNAME = 'pds.validation.dependencies' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ +# Translator for tests to apply +# +# Each path to a volume is compared against each regular expression. For those +# regular expressions that match, the associated suite of tests is performed. 
+# Note that 'general' tests are performed for every volume. +################################################################################ + +TESTS = translator.TranslatorByRegex([ + ('.*', 0, ['general']), + ('.*/COCIRS_0xxx(|_v[3-9])/COCIRS_0[4-9].*', + 0, ['cocirs01']), + ('.*/COCIRS_1xxx(|_v[3-9]).*', 0, ['cocirs01']), + ('.*/COCIRS_[56]xxx/.*', 0, ['cocirs56']), + ('.*/COISS_[12]xxx/.*', 0, ['coiss12', 'metadata', 'inventory', + 'rings', 'moons' ,'cumindex999']), + ('.*/COISS_100[1-7]/.*', 0, ['jupiter']), + ('.*/COISS_100[89]/.*', 0, ['saturn']), + ('.*/COISS_2xxx/.*', 0, ['saturn']), + ('.*/COISS_3xxx.*', 0, ['coiss3']), + ('.*/COUVIS_0xxx/.*', 0, ['couvis', 'metadata', 'supplemental', + 'cumindex999']), + ('.*/COUVIS_0xxx/COUVIS_0006.*', 0, ['saturn', 'rings']), + ('.*/COUVIS_0xxx/COUVIS_000[7-9].*', 0, ['saturn', 'rings', 'moons']), + ('.*/COUVIS_0xxx/COUVIS_00[1-9].*', 0, ['saturn', 'rings', 'moons']), + ('.*/COVIMS_0.*', 0, ['covims', 'metadata', 'cumindex999']), + ('.*/COVIMS_000[4-9].*', 0, ['saturn', 'rings', 'moons']), + ('.*/COVIMS_00[1-9].*', 0, ['saturn', 'rings', 'moons']), + ('.*/CO.*_8xxx/.*', 0, ['metadata', 'supplemental', 'profile']), + ('.*/CORSS_8xxx/.*', 0, ['corss_8xxx']), + ('.*/COUVIS_8xxx/.*', 0, ['couvis_8xxx']), + ('.*/COVIMS_8xxx/.*', 0, ['covims_8xxx']), + ('.*/EBROCC_xxx/.*', 0, ['ebrocc_xxxx', 'metadata', + 'supplemental', 'profile']), + ('.*/GO_0xxx/GO_000[2-9].*', 0, ['metadata', 'cumindex999', + 'go_previews2', 'go_previews3', + 'go_previews4', 'go_previews5']), + ('.*/GO_0xxx/GO_00[12].*', 0, ['metadata', 'cumindex999', + 'go_previews2', 'go_previews3', + 'go_previews4', 'go_previews5']), + ('.*/GO_0xxx_v1/GO_000[2-9].*', 0, ['go_previews2', 'go_previews3', + 'go_previews4', 'go_previews5']), + ('.*/GO_0xxx_v1/GO_00[12]..*', 0, ['go_previews2', 'go_previews3', + 'go_previews4', 'go_previews5']), + ('.*/JNCJIR_[12]xxx/.*', 0, ['metadata', 'cumindex999']), + ('.*/JNCJNC_0xxx/.*', 0, ['metadata', 'cumindex999']), + ('.*/HST.x_xxxx/.*', 0, ['hst', 'metadata', 'cumindex9_9999']), + ('.*/NH..(LO|MV)_xxxx/.*', 0, ['nh', 'metadata', 'supplemental']), + ('.*/NH..LO_xxxx/NH[^K].*', 0, ['inventory', 'rings', 'moons']), + ('.*/NH(JU|LA)MV_xxxx/.*', 0, ['nhbrowse_vx', 'jupiter']), + ('.*/NH(PC|PE)MV_xxxx/.*', 0, ['nhbrowse', 'pluto']), + ('.*/RPX_xxxx/.*', 0, ['metadata']), + ('.*/RPX_xxxx/RPX_000.*', 0, ['obsindex', 'cumindex99']), + ('.*/VGISS_[5678]xxx/.*', 0, ['vgiss', 'metadata', 'raw_image', + 'supplemental', 'cumindex999']), + ('.*/VGISS_5(10[4-9]|20[5-9]|11|21)/.*', + 0, ['jupiter', 'inventory', 'rings', + 'moons']), + ('.*/VGISS_6(10|11[0-5]|2)/.*', 0, ['saturn', 'inventory', 'rings', + 'moons']), + ('.*/VGISS_7xxx/.*', 0, ['uranus', 'inventory', 'rings', + 'moons']), + ('.*/VGISS_8xxx/.*', 0, ['neptune', 'inventory', 'rings', + 'moons']), + ('.*/VG_28xx/.*', 0, ['metadata']), +]) + +################################################################################ +# Class definition +################################################################################ + +class PdsDependency(object): + + DEPENDENCY_SUITES = {} + MODTIME_DICT = {} + + def __init__(self, title, glob_pattern, regex, sublist, suite=None, + newer=True, exceptions=[]): + """Constructor for a PdsDependency. + + Inputs: + title a short description of the dependency. + glob_pattern a glob pattern for finding files. + regex regular expression to match path returned by glob. + sublist a list of substitution strings returning paths to + files that must exist. 
+ suite optional name of a test suite to which this + dependency belongs. + newer True if the file file must be newer; False to + suppress a check of the modification date. + exceptions a list of zero or more regular expressions. If a + file path matches one of these patterns, then it + will not trigger a test. + """ + + self.glob_pattern = glob_pattern + + if type(regex) == str: + self.regex = re.compile('^' + regex + '$', re.I) + else: + self.regex = regex + + self.regex_pattern = self.regex.pattern + + if type(sublist) == str: + self.sublist = [sublist] + else: + self.sublist = list(sublist) + + self.title = title + self.suite = suite + self.newer = newer + + if suite is not None: + if suite not in PdsDependency.DEPENDENCY_SUITES: + PdsDependency.DEPENDENCY_SUITES[suite] = [] + + PdsDependency.DEPENDENCY_SUITES[suite].append(self) + + self.exceptions = [re.compile(pattern, re.I) for pattern in exceptions] + + @staticmethod + def purge_cache(): + PdsDependency.MODTIME_DICT = {} + + @staticmethod + def get_modtime(abspath, logger): + """Return the Unix-style modification time for a file, recursively for + a directory. Cache results for directories.""" + + if os.path.isfile(abspath): + return os.path.getmtime(abspath) + + if abspath in PdsDependency.MODTIME_DICT: + return PdsDependency.MODTIME_DICT[abspath] + + modtime = -1.e99 + files = os.listdir(abspath) + for file in files: + absfile = os.path.join(abspath, file) + + if file == '.DS_Store': # log .DS_Store files; ignore dates + logger.ds_store('.DS_Store ignored', absfile) + continue + + if '/._' in absfile: # log dot-underscore files; ignore dates + logger.dot_underscore('._* file ignored', absfile) + continue + + modtime = max(modtime, PdsDependency.get_modtime(absfile, logger)) + + PdsDependency.MODTIME_DICT[abspath] = modtime + return modtime + + def test1(self, dirpath, check_newer=True, limit=200, logger=None): + """Perform one test and log the results.""" + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + lskip_ = len(pdsdir.root_) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root([pdsdir.root_, pdsdir.disk_]) + + # Remove "Newer" at beginning of title if check_newer is False + if not check_newer and self.title.startswith('Newer '): + logger.open(self.title[6:].capitalize(), dirpath) + else: + logger.open(self.title, dirpath) + + try: + pattern = pdsdir.root_ + self.glob_pattern + + pattern = pattern.replace('$', pdsdir.volset_[:-1], 1) + if '$' in pattern: + pattern = pattern.replace('$', pdsdir.volname, 1) + + abspaths = glob.glob(pattern) + + if len(abspaths) == 0: + logger.info('No files found') + + else: + for sub in self.sublist: + logger.open('%s >> %s' % (self.regex_pattern[1:-1], sub), + limits={'normal': limit}) + try: + for abspath in abspaths: + + # Check exception list + exception_identified = False + for regex in self.exceptions: + if regex.fullmatch(abspath): + logger.info('Test skipped', abspath) + exception_identified = True + break + + if exception_identified: + continue + + path = abspath[lskip_:] + + (requirement, count) = self.regex.subn(sub, path) + absreq = (pdsdir.root_ + requirement) + + if count == 0: + logger.error('Invalid file path', absreq) + continue + + if not os.path.exists(absreq): + logger.error('Missing file', absreq) + continue + + if self.newer and check_newer: + source_modtime = PdsDependency.get_modtime(abspath, + logger) + requirement_modtime = \ + PdsDependency.get_modtime(absreq, + logger) + + if 
requirement_modtime < source_modtime: + logger.error('File out of date', absreq) + continue + + logger.normal('Confirmed', absreq) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + + return (fatal, errors, warnings, tests) + + @staticmethod + def test_suite(key, dirpath, check_newer=True, limit=200, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Dependency test suite "%s"' % key, dirpath) + + try: + for dep in PdsDependency.DEPENDENCY_SUITES[key]: + dep.test1(dirpath, check_newer, limit=limit, logger=logger) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + + return (fatal, errors, warnings, tests) + +################################################################################ +# General test suite +################################################################################ + +for thing in pdsfile.VOLTYPES: + + if thing == 'volumes': + thing_ = '' + else: + thing_ = '_' + thing + + Thing = thing.capitalize() + + _ = PdsDependency( + 'Newer archives and checksums for %s' % thing, + '%s/$/$' % thing, + r'%s/(.*?)/(.*)' % thing, + [r'archives-%s/\1/\2%s.tar.gz' % (thing, thing_), + r'checksums-%s/\1/\2%s_md5.txt' % (thing, thing_)], + suite='general', newer=True, + ) + + _ = PdsDependency( + 'Newer checksum files for archives-%s' % thing, + 'archives-%s/$/$*' % thing, + r'archives-%s/(.*?)/(.*)%s.tar.gz' % (thing, thing_), + r'checksums-archives-%s/\1%s_md5.txt' % (thing, thing_), + suite='general', newer=True, + ) + + _ = PdsDependency( + 'Newer info shelf files for %s' % thing, + 'checksums-%s/$/$%s_md5.txt' % (thing, thing_), + r'checksums-%s/(.*?)/(.*)%s_md5.txt' % (thing, thing_), + [r'_infoshelf-%s/\1/\2_info.pickle' % thing, + r'_infoshelf-%s/\1/\2_info.py' % thing], + suite='general', newer=True, + ) + + _ = PdsDependency( + 'Newer info shelf files for archives-%s' % thing, + 'checksums-archives-%s/$%s_md5.txt' % (thing, thing_), + r'checksums-archives-%s/(.*)%s_md5.txt' % (thing, thing_), + [r'_infoshelf-archives-%s/\1_info.pickle' % thing, + r'_infoshelf-archives-%s/\1_info.py' % thing], + suite='general', newer=True, + ) + +for thing in ['volumes', 'metadata', 'calibrated']: + + _ = PdsDependency( + 'Newer link shelf files for %s' % thing, + '%s/$/$' % thing, + r'%s/(.*?)/(.*)' % thing, + [r'_linkshelf-%s/\1/\2_links.pickle' % thing, + r'_linkshelf-%s/\1/\2_links.py' % thing], + suite='general', newer=True, + ) + +################################################################################ +# Metadata tests +################################################################################ + +# General metadata including *_index.tab +_ = PdsDependency( + 'Metadata index table for each volume', + 'volumes/$/$', + r'volumes/([^/]+?)(|_v[0-9.]+)/(.*?)', + r'metadata/\1/\3/\3_index.tab', + suite='metadata', newer=False, +) + +_ = PdsDependency( + 'Label for every metadata table or CSV', + 'metadata/$*/$/*.(tab|csv)', + r'metadata/(.*)\....', + r'metadata/\1.lbl', + suite='metadata', newer=False, +) + +_ = PdsDependency( + 'Newer index shelf for every metadata table', + 'metadata/$*/$/*.tab', + r'metadata/(.*)\.tab', 
+ [r'_indexshelf-metadata/\1.pickle', + r'_indexshelf-metadata/\1.py'], + suite='metadata', newer=True, + exceptions=[r'.*_inventory.tab', + r'.*GO_0xxx_v1.*'] +) + +# More metadata suites +for (name, suffix) in [('supplemental' , 'supplemental_index.tab'), + ('inventory' , 'inventory.csv'), + ('jupiter' , 'jupiter_summary.tab'), + ('saturn' , 'saturn_summary.tab'), + ('uranus' , 'uranus_summary.tab'), + ('neptune' , 'neptune_summary.tab'), + ('pluto' , 'pluto_summary.tab'), + ('pluto' , 'charon_summary.tab'), + ('rings' , 'ring_summary.tab'), + ('moons' , 'moon_summary.tab'), + ('raw_image' , 'raw_image_index.tab'), + ('profile' , 'profile_index.tab'), + ('obsindex' , 'obsindex.tab')]: + + _ = PdsDependency( + name.capitalize() + ' metadata required', + 'volumes/$/$', + r'volumes/([^/]+?)(|_v[0-9.]+)/(.*?)', + r'metadata/\1/\3/\3_' + suffix, + suite=name, newer=False, + ) + +################################################################################ +# Cumulative index tests where the suffix is "99", "999", or "9_9999" +################################################################################ + +for nines in ('99', '999', '9_9999'): + + dots = nines.replace('9', '.') + name = 'cumindex' + nines + + _ = PdsDependency( + 'Cumulative version of every metadata table', + 'metadata/$*/$/*.(tab|csv)', + r'metadata/(.*?)/(.*)' + dots + r'/(.*)' + dots + r'(.*)\.(tab|csv)', + [r'metadata/\1/\g<2>' + nines + r'/\g<3>' + nines + r'\4.\5', + r'metadata/\1/\g<2>' + nines + r'/\g<3>' + nines + r'\4.lbl'], + suite=name, newer=False, + ) + + _ = PdsDependency( + 'Newer archives and checksums for cumulative metadata', + 'metadata/$*/*' + nines, + r'metadata/(.*?)/(.*)', + [r'archives-metadata/\1/\2_metadata.tar.gz', + r'checksums-metadata/\1/\2_metadata_md5.txt'], + suite=name, newer=True, + ) + + _ = PdsDependency( + 'Newer checksums for cumulative archives-metadata', + 'archives-metadata/$*/*' + nines + '_metadata.tar.gz', + r'archives-metadata/(.*?)/.*_metadata.tar.gz', + r'checksums-archives-metadata/\1_metadata_md5.txt', + suite=name, newer=True, + ) + + _ = PdsDependency( + 'Newer info shelf files for cumulative metadata', + 'checksums-metadata/$*/*' + nines + '_metadata_md5.txt', + r'checksums-metadata/(.*?)/(.*)_metadata_md5.txt', + [r'_infoshelf-metadata/\1/\2_info.pickle', + r'_infoshelf-metadata/\1/\2_info.py'], + suite=name, newer=True, + ) + + _ = PdsDependency( + 'Newer info shelf files for cumulative archives-metadata', + 'checksums-archives-metadata/$*_metadata_md5.txt', + r'checksums-archives-metadata/(.*)_metadata_md5.txt', + [r'_infoshelf-archives-metadata/\1_info.pickle', + r'_infoshelf-archives-metadata/\1_info.py'], + suite=name, newer=True, + ) + + _ = PdsDependency( + 'Newer link shelf files for cumulative metadata', + 'metadata/$/*' + nines, + r'metadata/(.*?)/(.*)', + [r'_linkshelf-metadata/\1/\2_links.pickle', + r'_linkshelf-metadata/\1/\2_links.py'], + suite=name, newer=True, + ) + + _ = PdsDependency( + 'Newer index shelf files for cumulative metadata', + 'metadata/$/*' + nines + '/*.tab', + r'metadata/(.*)\.tab', + [r'_indexshelf-metadata/\1.pickle', + r'_indexshelf-metadata/\1.py'], + suite=name, newer=True, + exceptions=[r'.*GO_0xxx_v1.*'] + ) + +################################################################################ +# Preview tests +################################################################################ + +# For COCIRS_0xxx and COCIRS_1xxx +_ = PdsDependency( + 'Preview versions of every cube file', + 
'volumes/$/$/EXTRAS/CUBE_OVERVIEW/*/*.JPG', + r'volumes/(.*)/EXTRAS/CUBE_OVERVIEW/(.*)\.JPG', + [r'previews/\1/DATA/CUBE/\2_thumb.jpg', + r'previews/\1/DATA/CUBE/\2_small.jpg', + r'previews/\1/DATA/CUBE/\2_med.jpg', + r'previews/\1/DATA/CUBE/\2_full.jpg'], + suite='cocirs01', newer=True, +) + +# For COCIRS_5xxx and COCIRS_6xxx +_ = PdsDependency( + 'Diagrams for every interferogram file', + 'volumes/$/$/BROWSE/*/*.PNG', + r'volumes/(.*)\.PNG', + [r'diagrams/\1_thumb.jpg', + r'diagrams/\1_small.jpg', + r'diagrams/\1_med.jpg', + r'diagrams/\1_full.jpg'], + suite='cocirs56', newer=False, +) + +# For COISS_1xxx and COISS_2xxx +_ = PdsDependency( + 'Previews and calibrated versions of every COISS image file', + 'volumes/$/$/data/*/*.IMG', + r'volumes/(.*)\.IMG', + [r'previews/\1_thumb.jpg', + r'previews/\1_small.jpg', + r'previews/\1_med.jpg', + r'previews/\1_full.png', + r'calibrated/\1_CALIB.IMG'], + suite='coiss12', newer=False, +) + +# For COISS_3xxx +_ = PdsDependency( + 'Previews of every COISS derived map image', + 'volumes/$/$/data/images/*.IMG', + r'volumes/(.*?)/data/images/(.*)\.IMG', + [r'previews/\1/data/images/\2_thumb.jpg', + r'previews/\1/data/images/\2_small.jpg', + r'previews/\1/data/images/\2_med.jpg', + r'previews/\1/data/images/\2_full.jpg'], + suite='coiss3', newer=True, +) + +_ = PdsDependency( + 'Previews of every COISS derived map PDF', + 'volumes/$/$/data/maps/*.PDF', + r'volumes/(.*?)/data/maps/(.*)\.PDF', + [r'previews/\1/data/maps/\2_thumb.png', + r'previews/\1/data/maps/\2_small.png', + r'previews/\1/data/maps/\2_med.png', + r'previews/\1/data/maps/\2_full.png'], + suite='coiss3', newer=True, +) + +# For COUVIS_0xxx +_ = PdsDependency( + 'Previews of every COUVIS data file', + 'volumes/$/$/DATA/*/*.DAT', + r'volumes/COUVIS_0xxx(|_v[\.0-9]+)/(.*)\.DAT', + [r'previews/COUVIS_0xxx/\2_thumb.png', + r'previews/COUVIS_0xxx/\2_small.png', + r'previews/COUVIS_0xxx/\2_med.png', + r'previews/COUVIS_0xxx/\2_full.png'], + suite='couvis', newer=False, +) + +# For COVIMS_0xxx +_ = PdsDependency( + 'Previews and calibrated versions of every COVIMS cube', + 'volumes/$/$/data/*/*.qub', + r'volumes/(.*)\.qub', + [r'previews/\1_thumb.png', + r'previews/\1_small.png', + r'previews/\1_med.png', + r'previews/\1_full.png'], + suite='covims', newer=False, +) + +# For CORSS_8xxx +_ = PdsDependency( + 'Previews and diagrams for every CORSS_8xxx data directory', + 'volumes/$/$/data/Rev*/Rev*/*', + r'volumes/CORSS_8xxx[^/]*/(CORSS_8001/data/Rev.../Rev.....?)/(Rev.....?)_(RSS_...._..._..._.)', + [r'previews/CORSS_8xxx/\1/\2_\3/\3_GEO_thumb.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_GEO_small.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_GEO_med.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_GEO_full.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_TAU_thumb.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_TAU_small.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_TAU_med.jpg', + r'previews/CORSS_8xxx/\1/\2_\3/\3_TAU_full.jpg', + r'previews/CORSS_8xxx/\1_thumb.jpg', + r'previews/CORSS_8xxx/\1_small.jpg', + r'previews/CORSS_8xxx/\1_med.jpg', + r'previews/CORSS_8xxx/\1_full.jpg', + r'diagrams/CORSS_8xxx/\1_\3_thumb.jpg', + r'diagrams/CORSS_8xxx/\1_\3_small.jpg', + r'diagrams/CORSS_8xxx/\1_\3_med.jpg', + r'diagrams/CORSS_8xxx/\1_\3_full.jpg'], + suite='corss_8xxx', newer=False, +) + +_ = PdsDependency( + 'Previews of every CORSS_8xxx browse PDF', + 'volumes/$/$/browse/*.pdf', + r'volumes/CORSS_8xxx[^/]*/(.*)\.pdf', + [r'previews/CORSS_8xxx/\1_thumb.jpg', + r'previews/CORSS_8xxx/\1_small.jpg', + 
r'previews/CORSS_8xxx/\1_med.jpg', + r'previews/CORSS_8xxx/\1_full.jpg'], + suite='corss_8xxx', newer=False, +) +_ = PdsDependency( + 'Previews of every CORSS_8xxx Rev PDF', + 'volumes/$/$/data/Rev*/*.pdf', + r'volumes/CORSS_8xxx[^/]*/(.*)\.pdf', + [r'previews/CORSS_8xxx/\1_thumb.jpg', + r'previews/CORSS_8xxx/\1_small.jpg', + r'previews/CORSS_8xxx/\1_med.jpg', + r'previews/CORSS_8xxx/\1_full.jpg'], + suite='corss_8xxx', newer=False, +) + +_ = PdsDependency( + 'Previews of every CORSS_8xxx data PDF', + 'volumes/$/$/data/Rev*/Rev*/Rev*/*.pdf', + r'volumes/CORSS_8xxx[^/]*/(.*)\.pdf', + [r'previews/CORSS_8xxx/\1_thumb.jpg', + r'previews/CORSS_8xxx/\1_small.jpg', + r'previews/CORSS_8xxx/\1_med.jpg', + r'previews/CORSS_8xxx/\1_full.jpg'], + suite='corss_8xxx', newer=False, +) + +# For COUVIS_8xxx +_ = PdsDependency( + 'Previews and diagrams of every COUVIS_8xxx profile', + 'volumes/$/$/data/*_TAU01KM.TAB', + r'volumes/COUVIS_8xxx[^/]*/(.*)_TAU01KM\.TAB', + [r'previews/COUVIS_8xxx/\1_thumb.jpg', + r'previews/COUVIS_8xxx/\1_small.jpg', + r'previews/COUVIS_8xxx/\1_med.jpg', + r'previews/COUVIS_8xxx/\1_full.jpg', + r'diagrams/COUVIS_8xxx/\1_thumb.jpg', + r'diagrams/COUVIS_8xxx/\1_small.jpg', + r'diagrams/COUVIS_8xxx/\1_med.jpg', + r'diagrams/COUVIS_8xxx/\1_full.jpg'], + suite='couvis_8xxx', newer=False, + exceptions=['.*2005_139_PSICEN_E.*', + '.*2005_139_THEHYA_E.*', + '.*2007_038_SAO205839_I.*', + '.*2010_148_LAMAQL_E.*'] +) + +# For COVIMS_8xxx +_ = PdsDependency( + 'Previews and diagrams of every COVIMS_8xxx profile', + 'volumes/$/$/data/*_TAU01KM.TAB', + r'volumes/COVIMS_8xxx[^/]*/(.*)_TAU01KM\.TAB', + [r'previews/COVIMS_8xxx/\1_thumb.jpg', + r'previews/COVIMS_8xxx/\1_small.jpg', + r'previews/COVIMS_8xxx/\1_med.jpg', + r'previews/COVIMS_8xxx/\1_full.jpg', + r'diagrams/COVIMS_8xxx/\1_thumb.jpg', + r'diagrams/COVIMS_8xxx/\1_small.jpg', + r'diagrams/COVIMS_8xxx/\1_med.jpg', + r'diagrams/COVIMS_8xxx/\1_full.jpg'], + suite='covims_8xxx', newer=False, +) + +_ = PdsDependency( + 'Previews of every COVIMS_8xxx PDF', + 'volumes/$/$/browse/*.PDF', + r'volumes/COVIMS_8xxx[^/]*/(.*)\.PDF', + [r'previews/COVIMS_8xxx/\1_thumb.jpg', + r'previews/COVIMS_8xxx/\1_small.jpg', + r'previews/COVIMS_8xxx/\1_med.jpg', + r'previews/COVIMS_8xxx/\1_full.jpg'], + suite='covims_8xxx', newer=False, +) + +# For EBROCC_xxxx +_ = PdsDependency( + 'Previews of every EBROCC browse PDF', + 'volumes/$/$/BROWSE/*.PDF', + r'volumes/EBROCC_xxxx[^/]*/(.*)\.PDF', + [r'previews/EBROCC_xxxx/\1_thumb.jpg', + r'previews/EBROCC_xxxx/\1_small.jpg', + r'previews/EBROCC_xxxx/\1_med.jpg', + r'previews/EBROCC_xxxx/\1_full.jpg'], + suite='ebrocc_xxxx', newer=False, +) +_ = PdsDependency( + 'Previews of every EBROCC profile', + 'volumes/$/$/data/*/*.TAB', + r'volumes/EBROCC_xxxx[^/]*/(.*)\.TAB', + [r'previews/EBROCC_xxxx/\1_thumb.jpg', + r'previews/EBROCC_xxxx/\1_small.jpg', + r'previews/EBROCC_xxxx/\1_med.jpg', + r'previews/EBROCC_xxxx/\1_full.jpg'], + suite='ebrocc_xxxx', newer=False, +) + +# For GO_xxxx +_ = PdsDependency( + 'Previews of every GO image file, depth 2', + 'volumes/$/$/*/*.IMG', + r'volumes/(.*)\.IMG', + [r'previews/\1_thumb.jpg', + r'previews/\1_small.jpg', + r'previews/\1_med.jpg', + r'previews/\1_full.jpg'], + suite='go_previews2', newer=True, +) + +_ = PdsDependency( + 'Previews of every GO image file, depth 3', + 'volumes/$/$/*/*.IMG', + r'volumes/(.*)\.IMG', + [r'previews/\1_thumb.jpg', + r'previews/\1_small.jpg', + r'previews/\1_med.jpg', + r'previews/\1_full.jpg'], + suite='go_previews3', newer=True, +) + +_ = 
PdsDependency( + 'Previews of every GO image file, depth 4', + 'volumes/$/$/*/*/*.IMG', + r'volumes/(.*)\.IMG', + [r'previews/\1_thumb.jpg', + r'previews/\1_small.jpg', + r'previews/\1_med.jpg', + r'previews/\1_full.jpg'], + suite='go_previews4', newer=True, +) + +_ = PdsDependency( + 'Previews of every GO image file, depth 5', + 'volumes/$/$/*/*/*/*.IMG', + r'volumes/(.*)\.IMG', + [r'previews/\1_thumb.jpg', + r'previews/\1_small.jpg', + r'previews/\1_med.jpg', + r'previews/\1_full.jpg'], + suite='go_previews5', newer=True, +) + +# For HST*x_xxxx +_ = PdsDependency( + 'Previews of every HST image label', + 'volumes/$/$/data/*/*.LBL', + r'volumes/(HST.._....)(|_v[0-9.]+)/(HST.*)\.LBL', + [r'previews/\1/\3_thumb.jpg', + r'previews/\1/\3_small.jpg', + r'previews/\1/\3_med.jpg', + r'previews/\1/\3_full.jpg'], + suite='hst', newer=False, +) + +# For NHxxLO_xxxx and NHxxMV_xxxx browse, stripping version number +_ = PdsDependency( + 'Previews of every NH image file', + 'volumes/$/$/data/*/*.fit', + r'volumes/(NHxx.._....)(|_v[0-9.]+)/(NH.*?)(|_[0-9]+).fit', + [r'previews/\1/\3_thumb.jpg', + r'previews/\1/\3_small.jpg', + r'previews/\1/\3_med.jpg', + r'previews/\1/\3_full.jpg'], + suite='nhbrowse', newer=False, +) + +# For NHxxLO_xxxx and NHxxMV_xxxx browse, without stripping version number +_ = PdsDependency( + 'Previews of every NH image file', + 'volumes/$/$/data/*/*.fit', + r'volumes/(NHxx.._....)(|_v[0-9.]+)/(NH.*?).fit', + [r'previews/\1/\3_thumb.jpg', + r'previews/\1/\3_small.jpg', + r'previews/\1/\3_med.jpg', + r'previews/\1/\3_full.jpg'], + suite='nhbrowse_vx', newer=False, +) + +_ = PdsDependency( + 'Newer supplemental index for every NH volume', + 'volumes/$/$/data/*/*.lbl', + r'volumes/(NHxx.._....)(|_v[0-9.]+)/(NH...._.00)(.)/.*\.lbl', + r'metadata/\1/\g<3>1/\g<3>1_supplemental_index.tab', + suite='nh', newer=True, +) + +# For VGISS_[5678]xxx +_ = PdsDependency( + 'Previews of every VGISS image file', + 'volumes/$/$/data/*/*RAW.IMG', + r'volumes/(.*)_RAW\.IMG', + [r'previews/\1_thumb.jpg', + r'previews/\1_small.jpg', + r'previews/\1_med.jpg', + r'previews/\1_full.jpg'], + suite='vgiss', newer=True, +) + +################################################################################ +################################################################################ + +def test(pdsdir, logger=None, check_newer=True): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + path = pdsdir.abspath + for suite in TESTS.all(path): + _ = PdsDependency.test_suite(suite, path, check_newer=check_newer, + logger=logger) + +################################################################################ +################################################################################ + +if __name__ == '__main__': + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsdependency: Check all required files associated with ' + + 'with a volume, confirming that they exist and that ' + + 'their creation dates are consistent.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root directory of a volume or ' + + 'a volume set.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". 
Logs are created inside the ' + + '"pdsdependency" subdirectory of each log root ' + + 'directory.' + ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Validate the paths + for path in args.volume: + path = os.path.abspath(path) + pdsdir = pdsfile.PdsFile.from_abspath(path) + if not pdsdir.is_volume_dir and not pdsdir.is_volset_dir: + print('pdsdependency error: ' + \ + 'not a volume or volume set directory: ' + pdsdir.logical_path) + sys.exit(1) + + if pdsdir.category_ != 'volumes/': + print('pdsdependency error: ' + \ + 'not a volume or volume set directory: ' + pdsdir.logical_path) + sys.exit(1) + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.PdsFile.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsdependency') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of file paths before logging + paths = [] + for path in args.volume: + + if not os.path.exists(path): + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.PdsFile.from_abspath(path) + + if pdsf.checksums_: + print('No pdsdependency for checksum files: ' + path) + sys.exit(1) + + if pdsf.archives_: + print('No pdsdependency for archive files: ' + path) + sys.exit(1) + + if pdsf.is_volset_dir: + paths += [os.path.join(path, c) for c in pdsf.childnames] + + else: + paths.append(os.path.abspath(path)) + + # Loop through paths... 
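+    # A typical invocation (hypothetical paths) might look like:
+    #   pdsdependency.py --log /path/to/logs holdings/volumes/COISS_2xxx/COISS_2001
+    # where a volume set path is expanded above into one test pass per volume.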
+ logger.open(' '.join(sys.argv)) + try: + for path in paths: + + pdsdir = pdsfile.PdsFile.from_abspath(path) + + # Save logs in up to two places + logfiles = set([pdsdir.log_path_for_volume('_dependency', + dir='pdsdependency'), + pdsdir.log_path_for_volume('_dependency', + dir='pdsdependency', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + for logfile in logfiles: + logfile = logfile.replace('/volumes/', '/') + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(paths) > 1: + logger.blankline() + + logger.open('Dependency tests', path, handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + test(pdsdir, logger=logger) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) diff --git a/validation/pdsindexshelf.py b/validation/pdsindexshelf.py new file mode 100755 index 0000000..3d5165b --- /dev/null +++ b/validation/pdsindexshelf.py @@ -0,0 +1,494 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsindexshelf.py library and main program +# +# Syntax: +# pdsindexshelf.py --task index_path.tab [index_path.tab ...] +# +# Enter the --help option to see more information. +################################################################################ + +import argparse +import datetime +import glob +import os +import pickle +import sys + +import pdslogger +import pdsfile +import pdstable + +LOGNAME = 'pds.validation.indexshelf' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ + +def generate_indexdict(pdsf, logger=None): + """Generate a dictionary keyed by row key for each row in the given table. + The value returned is a list containing all the associated row indices. 
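+
+    For example (hypothetical key), a filename key that occurs in rows 14 and
+    15 of the index table would map to [14, 15] in the returned dictionary.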
+ """ + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Tabulating index rows for', pdsf.abspath) + + try: + table = pdstable.PdsTable(pdsf.label_abspath, + filename_keylen=pdsf.filename_keylen) + + table.index_rows_by_filename_key() # fills in table.filename_keys + childnames = table.filename_keys + index_dict = {c:table.row_indices_by_filename_key(c) + for c in childnames} + + logger.info('Rows tabulated', str(len(index_dict)), force=True) + + latest_mtime = max(os.path.getmtime(pdsf.abspath), + os.path.getmtime(pdsf.label_abspath)) + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Latest index file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (OSError, ValueError) as e: + logger.error(str(e)) + raise e + + finally: + _ = logger.close() + + return (index_dict, latest_mtime) + +################################################################################ + +def write_indexdict(pdsf, index_dict, logger=None): + """Write a new shelf file for the rows of this index.""" + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Writing index shelf file info for', pdsf.abspath) + + try: + pdsfile.PdsFile.close_all_shelves() # prevents using a cached shelf file + + shelf_path = pdsf.indexshelf_abspath + logger.info('Index shelf file', shelf_path) + + # Create parent directory if necessary + parent = os.path.split(shelf_path)[0] + if not os.path.exists(parent): + logger.info('Creating parent directory', parent) + os.makedirs(parent) + + # Write the pickle file + with open(shelf_path, 'wb') as f: + pickle.dump(index_dict, f) + + # Write the Python file + python_path = shelf_path.rpartition('.')[0] + '.py' + logger.info('Writing Python file', python_path) + + # Determine the maximum length of the keys + len_path = 0 + for key in index_dict: + len_path = max(len_path, len(key)) + + name = os.path.basename(shelf_path).rpartition('.')[0] + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for key in index_dict: + f.write(' "%s: ' % (key + '"' + (len_path-len(key)) * ' ')) + + rows = index_dict[key] + if len(rows) == 1: + f.write('%d,\n' % rows[0]) + else: + f.write('(') + for row in rows[:-1]: + f.write('%d, ' % row) + f.write('%d),\n' % rows[-1]) + + f.write('}\n\n') + + logger.info('Two files written') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def load_indexdict(pdsf, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Reading index shelf file for', pdsf.abspath) + + try: + shelf_path = pdsf.indexshelf_abspath + logger.info('Index shelf file', shelf_path) + + if not os.path.exists(shelf_path): + logger.error('Index shelf file not found', shelf_path) + return {} + + with open(shelf_path, 'rb') as f: + index_dict = pickle.load(f) + + logger.info('Shelf records loaded', str(len(index_dict))) + + except pickle.PickleError as e: + logger.exception(e) + raise + + finally: + logger.close() + + return index_dict + +################################################################################ + +def validate_infodict(pdsf, tabdict, shelfdict, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.info('Validating 
index file for', pdsf.abspath)
+
+    if tabdict == shelfdict:
+        logger.info('Validation complete')
+    else:
+        logger.error('Validation failed for', pdsf.abspath)
+
+################################################################################
+# Simplified functions to perform tasks
+################################################################################
+
+def initialize(pdsf, logger=None):
+
+    shelf_path = pdsf.indexshelf_abspath
+
+    # Make sure file does not exist
+    if os.path.exists(pdsf.indexshelf_abspath):
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        logger.error('Index shelf file already exists', shelf_path)
+        return
+
+    # Generate info
+    (index_dict, _) = generate_indexdict(pdsf, logger=logger)
+    if index_dict is None:
+        return
+
+    # Save info file
+    write_indexdict(pdsf, index_dict, logger=logger)
+
+def reinitialize(pdsf, logger=None):
+
+    shelf_path = pdsf.indexshelf_abspath
+
+    # Warn if shelf file does not exist
+    if not os.path.exists(shelf_path):
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        logger.warn('Index shelf file does not exist; initializing', shelf_path)
+        initialize(pdsf, logger=logger)
+        return
+
+    # Generate info
+    (index_dict, _) = generate_indexdict(pdsf, logger=logger)
+    if not index_dict:
+        return
+
+    # Save info file
+    write_indexdict(pdsf, index_dict, logger=logger)
+
+def validate(pdsf, logger=None):
+
+    shelf_path = pdsf.indexshelf_abspath
+
+    # Make sure file exists
+    if not os.path.exists(shelf_path):
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        logger.error('Index shelf file does not exist', shelf_path)
+        return
+
+    (table_indexdict, _) = generate_indexdict(pdsf, logger=logger)
+    if table_indexdict is None:
+        return
+
+    shelf_indexdict = load_indexdict(pdsf, logger=logger)
+    if not shelf_indexdict:
+        return
+
+    # Validate
+    validate_infodict(pdsf, table_indexdict, shelf_indexdict,
+                      logger=logger)
+
+def repair(pdsf, logger=None, op='repair'):
+
+    shelf_path = pdsf.indexshelf_abspath
+
+    # Make sure file exists
+    if not os.path.exists(shelf_path):
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        logger.warn('Index shelf file does not exist; initializing',
+                    shelf_path)
+        initialize(pdsf, logger=logger)
+        return
+
+    (table_indexdict, latest_mtime) = generate_indexdict(pdsf, logger=logger)
+    if not table_indexdict:
+        return
+
+    shelf_indexdict = load_indexdict(pdsf, logger=logger)
+    if not shelf_indexdict:
+        return
+
+    # Compare
+    canceled = (table_indexdict == shelf_indexdict)
+    if canceled:
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+        shelf_pypath = shelf_path.replace('.pickle', '.py')
+        shelf_mtime = min(os.path.getmtime(shelf_path),
+                          os.path.getmtime(shelf_pypath))
+        if latest_mtime > shelf_mtime:
+            logger.info('!!! Index shelf file content is up to date',
+                        shelf_path, force=True)
+
+            dt = datetime.datetime.fromtimestamp(latest_mtime)
+            logger.info('!!! Index file modification date',
+                        dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+            dt = datetime.datetime.fromtimestamp(shelf_mtime)
+            logger.info('!!! Index shelf file modification date',
+                        dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+            delta = latest_mtime - shelf_mtime
+            if delta >= 86400/10:
+                logger.info('!!! Index shelf file is out of date %.1f days' %
+                            (delta / 86400.), force=True)
+            else:
+                logger.info('!!! Index shelf file is out of date %.1f minutes' %
+                            (delta / 60.), force=True)
+
+            dt = datetime.datetime.now()
+            os.utime(shelf_path)
+            os.utime(shelf_pypath)
+            logger.info('!!! 
Time tag on index shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + else: + logger.info('!!! Index shelf file is up to date; repair canceled', + shelf_path, force=True) + + return + + # Write new info + write_indexdict(pdsf, table_indexdict, logger=logger) + +def update(pdsf, selection=None, logger=None): + + shelf_path = pdsf.indexshelf_abspath + if os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Index shelf file exists; not updated', pdsf.abspath) + + else: + initialize(pdsf, logger) + +################################################################################ +################################################################################ + +if __name__ == '__main__': + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsindexshelf: Create, maintain and validate shelf files ' + + 'containing row lookup information for index files.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an indexshelf file for an index or for ' + + 'an entire metadata directory. Abort if the file '+ + 'already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an indexshelf file for an index or for ' + + 'an entire metadata directory. Replace any files '+ + 'that already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate an indexshelf file or metadata ' + + 'directory.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate an index shelf file; replace only if ' + + 'necessary. If the shelf file content is correct '+ + 'but it is older than either the file or the ' + + 'label, update the shelf file\'s modification ' + + 'date.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a metadata directory for any new index ' + + 'files and add create an index shelf file for ' + + 'each one. Existing index shelf files are not ' + + 'checked.') + + parser.add_argument('table', nargs='+', type=str, + help='Path to an index file or metadata directory.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". 
Logs are created inside the "index" '+ + 'subdirectory of each log root directory.') + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsindexshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.PdsFile.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsindexshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of PdsFile objects before logging + pdsfiles = [] + for path in args.table: + + if not os.path.exists(path): + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.PdsFile.from_abspath(path) + + if pdsf.isdir: + if not '/metadata/' in path: + print('Not a metadata directory: ' + path) + sys.exit(1) + + tables = glob.glob(os.path.join(path, '*.tab')) + if not tables: + tables = glob.glob(os.path.join(path, '*/*.tab')) + + if not tables: + print('No .tab files in directory: ' + path) + sys.exit(1) + + pdsfiles += pdsfile.PdsFile.pdsfiles_for_abspaths(tables) + + else: + if not '/metadata/' in path: + print('Not a metadata file: ' + path) + sys.exit(1) + if not path.endswith('.tab'): + print('Not a table file: ' + path) + sys.exit(1) + + pdsfiles.append(pdsf) + + # Open logger and loop through tables... 
+ logger.open(' '.join(sys.argv)) + try: + for pdsf in pdsfiles: + + # Save logs in up to two places + logfiles = [pdsf.log_path_for_index(task=args.task, + dir='pdsindexshelf'), + pdsf.log_path_for_index(task=args.task, + dir='pdsindexshelf', + place='parallel')] + if logfiles[0] == logfiles[1]: + logfiles = logfiles[:-1] + + # Create all the handlers for this level in the logger + local_handlers = [] + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = (logfile.rpartition('/pdsindexshelf/')[0] + + '/pdsindexshelf') + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(pdsfiles) > 1: + logger.blankline() + + logger.open('Task "' + args.task + '" for', pdsf.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsf) + + elif args.task == 'reinitialize': + reinitialize(pdsf) + + elif args.task == 'validate': + validate(pdsf) + + elif args.task == 'repair': + repair(pdsf) + + else: # update + update(pdsf) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + print(sys.exc_info()[2]) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) diff --git a/validation/pdsinfoshelf.py b/validation/pdsinfoshelf.py new file mode 100755 index 0000000..81f32d0 --- /dev/null +++ b/validation/pdsinfoshelf.py @@ -0,0 +1,883 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsinfoshelf.py library and main program +# +# Syntax: +# pdsinfoshelf.py --task path [path ...] +# +# Enter the --help option to see more information. +################################################################################ + +import argparse +import datetime +import glob +import os +import pickle +import shutil +import sys +from PIL import Image + +import pdslogger +import pdsfile +import pdschecksums + +# Holds log file directories temporarily, used by move_old_info() +LOGDIRS = [] + +LOGNAME = 'pds.validation.fileinfo' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +PREVIEW_EXTS = set(['.jpg', '.png', '.gif', '.tif', '.tiff', + '.jpeg', '.jpeg_small']) + +################################################################################ + +def generate_infodict(pdsdir, selection, old_infodict={}, + limits={'normal':-1}, logger=None): + """Generate a dictionary keyed by absolute file path for each file in the + directory tree. Value returned is a tuple (nbytes, child_count, modtime, + checksum, preview size). + + If a selection is specified, it is interpreted as the basename of a file, + and only that file is processed. + + The optional old_infodict overrides information found in the directory. + This dictionary is merged with the new information assembled. However, if + a selection is specified, information about the selection is always updated. + + Also return the latest modification date among all the files checked. 
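+
+    For example (hypothetical values), a 2048-byte data file might map to
+    (2048, 0, '2017-06-09 08:30:00.000000', '<its MD5 checksum>', (0, 0)),
+    while a preview image also carries its (width, height) in the last field.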
+ """ + + ### Internal function + + def get_info_for_file(abspath, latest_mtime): + + nbytes = os.path.getsize(abspath) + children = 0 + mtime = os.path.getmtime(abspath) + latest_mtime = max(latest_mtime, mtime) + + dt = datetime.datetime.fromtimestamp(mtime) + modtime = dt.strftime('%Y-%m-%d %H:%M:%S.%f') + try: + checksum = checkdict[abspath] + except KeyError: + logger.error('Missing entry in checksum file', abspath) + checksum = '' + + size = (0,0) + ext = os.path.splitext(abspath)[1] + if ext.lower() in PREVIEW_EXTS: + try: + im = Image.open(abspath) + size = im.size + im.close() + except Exception: + logger.error('Preview size not found', abspath) + + return (nbytes, children, modtime, checksum, size) + + def get_info(abspath, infodict, old_infodict, checkdict, latest_mtime): + """Info about the given abspath.""" + + if os.path.isdir(abspath): + nbytes = 0 + children = 0 + modtime = '' + + files = os.listdir(abspath) + for file in files: + absfile = os.path.join(abspath, file) + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', absfile) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', absfile) + continue + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file', absfile) + + info = get_info(absfile, infodict, old_infodict, checkdict, + latest_mtime) + nbytes += info[0] + children += 1 + modtime = max(modtime, info[2]) + + info = (nbytes, children, modtime, '', (0,0)) + + elif abspath in old_infodict: + info = old_infodict[abspath] + iso = 'T'.join(info[2].split()) + dt = datetime.datetime.fromisoformat(iso) + mtime = datetime.datetime.timestamp(dt) + latest_mtime = max(latest_mtime, mtime) + + else: + info = get_info_for_file(abspath, latest_mtime) + logger.normal('File info generated', abspath) + + infodict[abspath] = info + + return info + + ################################ + # Begin executable code + ################################ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + + if selection: + logger.open('Generating file info for selection "%s"' % selection, + dirpath, limits) + else: + logger.open('Generating file info', dirpath, limits) + + latest_mtime = 0. + found = False + try: + # Load checksum dictionary + checkdict = pdschecksums.checksum_dict(dirpath, logger=logger) +# Removed... because we can't ignore empty directories +# if not checkdict: +# return ({}, 0.) 
+ + # Generate info recursively + infodict = {} + if selection: + root = os.path.join(dirpath, selection) + else: + root = pdsdir.abspath + + _ = get_info(root, infodict, old_infodict, checkdict, latest_mtime) + + # Merge dictionaries + merged = old_infodict.copy() + + if selection: + merged[root] = infodict[root] + + else: + for (key, value) in infodict.items(): + if key not in merged: + merged[key] = infodict[key] + + if latest_mtime == 0.: + logger.info('No files found') + else: + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return (merged, latest_mtime) + +################################################################################ + +def load_infodict(pdsdir, logger=None): + + dirpath = pdsdir.abspath + dirpath_ = dirpath.rstrip('/') + '/' + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Reading info shelf file for', dirpath_[:-1]) + + try: + (info_path, lskip) = pdsdir.shelf_path_and_lskip('info') + logger.info('Info shelf file', info_path) + + if not os.path.exists(info_path): + logger.error('Info shelf file not found', info_path) + return {} + + # Read the shelf file and convert to a dictionary + with open(info_path, 'rb') as f: + shelf = pickle.load(f) + + infodict = {} + for (key,info) in shelf.items(): + # Remove a 'null' checksum indicated by a string of dashes + # (Directories do not have checksums.) + if info[3] and info[3][0] == '-': + info = info[:3] + ('',) + info[4:] + + if key == '': + infodict[dirpath_[:-1]] = info + else: + infodict[dirpath_[:lskip] + key] = info + + return infodict + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def write_infodict(pdsdir, infodict, limits={}, logger=None): + """Write a new info shelf file for a directory tree.""" + + # Initialize + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing info file info for', dirpath, limits=limits) + + try: + (info_path, lskip) = pdsdir.shelf_path_and_lskip('info') + logger.info('Info shelf file', info_path) + + # Create parent directory if necessary + parent = os.path.split(info_path)[0] + if not os.path.exists(parent): + logger.info('Creating parent directory', parent) + os.makedirs(parent) + + # Write the pickle file + pickle_dict = {} + for (key, values) in infodict.items(): + short_key = key[lskip:] + pickle_dict[short_key] = values + + with open(info_path, 'wb') as f: + pickle.dump(pickle_dict, f) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + logger.open('Writing Python dictionary', dirpath, limits=limits) + try: + # Determine the maximum length of the file path + len_path = 0 + for (abspath, values) in infodict.items(): + len_path = max(len_path, len(abspath)) + + len_path -= lskip + + # Write the python dictionary version + python_path = info_path.rpartition('.')[0] + '.py' + name = os.path.basename(python_path) + parts = name.split('_') + name = '_'.join(parts[:2]) + '_info' + abspaths = list(infodict.keys()) + abspaths.sort() + + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name 
+ ' = {\n')
+            for abspath in abspaths:
+                path = abspath[lskip:]
+                (nbytes, children, modtime, checksum, size) = infodict[abspath]
+                f.write(' "%s: ' % (path + '"' + (len_path-len(path)) * ' '))
+                f.write('(%11d, %3d, ' % (nbytes, children))
+                f.write('"%s", ' % modtime)
+                f.write('"%-33s, ' % (checksum + '"'))
+                f.write('(%4d,%4d)),\n' % size)
+
+            f.write('}\n\n')
+
+    except (Exception, KeyboardInterrupt) as e:
+        logger.exception(e)
+        raise
+
+    finally:
+        _ = logger.close()
+
+################################################################################
+
+def validate_infodict(pdsdir, dirdict, shelfdict, selection,
+                      limits={'normal': 0}, logger=None):
+
+    logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+    logger.replace_root(pdsdir.root_)
+
+    if selection:
+        logger.open('Validating file info for selection %s' % selection,
+                    pdsdir.abspath, limits=limits)
+    else:
+        logger.open('Validating file info for', pdsdir.abspath, limits=limits)
+
+    # Prune the shelf dictionary if necessary
+    if selection:
+        keys = list(shelfdict.keys())
+        full_path = os.path.join(pdsdir.abspath, selection)
+        for key in keys:
+            if key != full_path:
+                del shelfdict[key]
+
+    try:
+        keys = list(dirdict.keys())
+        for key in keys:
+            if key in shelfdict:
+                dirinfo = dirdict[key]
+                shelfinfo = shelfdict[key]
+
+                (bytes1, count1, modtime1, checksum1, size1) = dirinfo
+                (bytes2, count2, modtime2, checksum2, size2) = shelfinfo
+
+                # Truncate modtimes to seconds
+                modtime1 = modtime1.rpartition('.')[0]
+                modtime2 = modtime2.rpartition('.')[0]
+
+                agreement = True
+                if bytes1 != bytes2:
+                    logger.error('File size mismatch %d %d' %
+                                 (bytes1, bytes2), key)
+                    agreement = False
+
+                if count1 != count2:
+                    logger.error('Child count mismatch %d %d' %
+                                 (count1, count2), key)
+                    agreement = False
+
+                if modtime1 != modtime2:
+                    logger.error('Modification time mismatch "%s" "%s"' %
+                                 (modtime1, modtime2), key)
+                    agreement = False
+
+                if checksum1 != checksum2:
+                    logger.error('Checksum mismatch', key)
+                    agreement = False
+
+                if size1 != size2:
+                    logger.error('Display size mismatch', key)
+                    agreement = False
+
+                if agreement:
+                    logger.normal('File info matches', key)
+
+                del shelfdict[key]
+                del dirdict[key]
+
+        keys = list(dirdict.keys())
+        keys.sort()
+        for key in keys:
+            logger.error('Missing shelf info for', key)
+
+        keys = list(shelfdict.keys())
+        keys.sort()
+        for key in keys:
+            logger.error('Shelf info for missing file', key)
+
+    except (Exception, KeyboardInterrupt) as e:
+        logger.exception(e)
+        raise
+
+    finally:
+        return logger.close()
+
+################################################################################
+
+def move_old_info(shelf_file, logger=None):
+    """Move a file to the /logs/ directory tree and append a time tag."""
+
+    if not os.path.exists(shelf_file): return
+
+    shelf_basename = os.path.basename(shelf_file)
+    (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename)
+
+    logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+    from_logged = False
+    for log_dir in LOGDIRS:
+        dest_template = log_dir + '/' + shelf_prefix + '_v???' 
+ shelf_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(shelf_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(shelf_file, dest) + + if not from_logged: + logger.info('Info shelf file moved from: ' + shelf_file) + from_logged = True + + logger.info('Info shelf file moved to', dest) + + python_file = shelf_file.rpartition('.')[0] + '.py' + dest = dest.rpartition('.')[0] + '.py' + shutil.copy(python_file, dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file does not exist + if os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Info shelf file already exists', info_path) + return + + # Check selection + if selection: + logger.error('File selection is disallowed for task "initialize"', + selection) + return + + # Generate info + (infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + + # Save info file + write_infodict(pdsdir, infodict, logger=logger) + +def reinitialize(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Warn if shelf file does not exist + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Generate info + (infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + if not infodict: + return + + # Move old file if necessary + if os.path.exists(info_path): + move_old_info(info_path, logger=logger) + + # Save info file + write_infodict(pdsdir, infodict, logger=logger) + +def validate(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Info shelf file does not exist', info_path) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + + # Validate + validate_infodict(pdsdir, dir_infodict, shelf_infodict, selection=selection, + logger=logger) + +def repair(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, latest_mtime) = generate_infodict(pdsdir, selection, + logger=logger) + + # For a single selection, use the old information + if selection: + key = 
list(dir_infodict.keys())[0] + value = dir_infodict[key] + dir_infodict = shelf_infodict.copy() + dir_infodict[key] = value + + # Compare + canceled = (dir_infodict == shelf_infodict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + info_pypath = info_path.replace('.pickle', '.py') + info_mtime = min(os.path.getmtime(info_path), + os.path.getmtime(info_pypath)) + if latest_mtime > info_mtime: + logger.info('!!! Info shelf file content is up to date', + info_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + dt = datetime.datetime.fromtimestamp(info_mtime) + logger.info('!!! Info shelf file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - info_mtime + if delta >= 86400/10: + logger.info('!!! Info shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Info shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(info_path) + os.utime(info_pypath) + logger.info('!!! Time tag on info shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + else: + logger.info(f'!!! Info shelf file is up to date; repair canceled', + info_path, force=True) + return + + # Move files and write new info + move_old_info(info_path, logger=logger) + write_infodict(pdsdir, dir_infodict, logger=logger) + +def update(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure info shelf file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, _) = generate_infodict(pdsdir, selection, shelf_infodict, + logger=logger) + + # Compare + canceled = (dir_infodict == shelf_infodict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Info shelf file content is complete; update canceled', + info_path, force=True) + return + + # Write checksum file + move_old_info(info_path, logger=logger) + write_infodict(pdsdir, dir_infodict, logger=logger) + +################################################################################ +################################################################################ + +if __name__ == '__main__': + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsinfoshelf: Create, maintain and validate shelf files ' + + 'containing basic information about each file.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an infoshelf file for a volume. Abort ' + + 'if the file already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an infoshelf file for a volume. Replace ' + + 'the file if it already exists. 
If a single ' + + 'file is specified, such as one archive file in ' + + 'a volume set, then only information about that ' + + 'file is re-initialized.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every file in a volume against the ' + + 'contents of its infoshelf file. If a single ' + + 'file is specified, such as an archive file in ' + + 'a volume set, then only information about that ' + + 'file is validated') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every file in a volume against the ' + + 'contents of its infoshelf file. If any file ' + + 'has changed, the infoshelf file is replaced. ' + + 'If a single file is specified, such as an ' + + 'archive file in a volume set, then only ' + + 'information about that file is repaired. If any '+ + 'of the files checked are newer than the shelf ' + + 'file, update the shelf file\'s modification ' + + 'date.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their information to the infoshelf file. ' + + 'Information about pre-existing files is not ' + + 'updated. If any of the files checked are newer ' + + 'than the shelf file, update the shelf file\'s ' + + 'modification date.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root of the volume or volume ' + + 'set. For a volume set, all the volume ' + + 'directories inside it are handled in sequence.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdsinfoshelf" subdirectory of each log root ' + + 'directory.' 
+ ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + parser.add_argument('--archives', '-a', default=False, action='store_true', + help='Instead of referring to a volume, refer to the ' + + 'the archive file for that volume.') + + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsinfoshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.PdsFile.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsinfoshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Prepare the list of paths + abspaths = [] + for path in args.volume: + + # Make sure path makes sense + path = os.path.abspath(path) + parts = path.partition('/holdings/') + if not parts[1]: + print('Not a holdings subdirectory: ' + path) + sys.exit(1) + + if parts[2].startswith('checksums-'): + print('No infoshelves for checksum files: ' + path) + sys.exit(1) + + # Convert to an archives path if necessary + if args.archives and not parts[2].startswith('archives-'): + path = parts[0] + '/holdings/archives-' + parts[2] + + # Convert to a list of absolute paths that exist (volsets or volumes) + try: + pdsf = pdsfile.PdsFile.from_abspath(path, must_exist=True) + abspaths.append(pdsf.abspath) + + except (ValueError, IOError): + # Allow a volume name to stand in for a .tar.gz archive + (dir, basename) = os.path.split(path) + pdsdir = pdsfile.PdsFile.from_abspath(dir) + if pdsdir.archives_ and '.' not in basename: + if pdsdir.voltype_ == 'volumes/': + basename += '.tar.gz' + else: + basename += '_%s.tar.gz' % pdsdir.voltype_[:-1] + + newpaths = glob.glob(os.path.join(dir, basename)) + if len(newpaths) == 0: + raise + + abspaths += newpaths + continue + else: + raise + + # Generate a list of tuples (pdsfile, selection) + info = [] + for path in abspaths: + pdsf = pdsfile.PdsFile.from_abspath(path) + + if pdsf.is_volset_dir: + # Info about archive directories is stored by volset + if pdsf.archives_: + info.append((pdsf, None)) + + # Others are checksumed by volume + else: + children = [pdsf.child(c) for c in pdsf.childnames] + info += [(c, None) for c in children if c.isdir] + # "if c.isdir" is False for volset level readme files + + elif pdsf.is_volume_dir: + # Shelve one volume + info.append((pdsf, None)) + + elif pdsf.isdir: + print('Invalid directory for an infoshelf: ' + pdsf.logical_path) + sys.exit(1) + + else: + pdsdir = pdsf.parent() + if pdsf.is_volume_file: + # Shelve one archive file + info.append((pdsdir, pdsf.basename)) + elif pdsdir.is_volume_dir: + # Shelve one top-level file in volume + info.append((pdsdir, pdsf.basename)) + else: + print('Invalid file for an infoshelf: ' + pdsf.logical_path) + sys.exit(1) + + # Open logger and loop through tuples... 
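+    # A typical invocation (hypothetical path) might look like:
+    #   pdsinfoshelf.py --repair holdings/volumes/COISS_2xxx/COISS_2001
+    # With --archives, the same path refers to its archives- tarball counterpart.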
+ logger.open(' '.join(sys.argv)) + try: + for (pdsdir, selection) in info: + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + if selection: + pdsf = pdsdir.child(os.path.basename(selection)) + else: + pdsf = pdsdir + + # Save logs in up to two places + if pdsf.volname: + logfiles = set([pdsf.log_path_for_volume('_info', + task=args.task, + dir='pdsinfoshelf'), + pdsf.log_path_for_volume('_info', + task=args.task, + dir='pdsinfoshelf', + place='parallel')]) + else: + logfiles = set([pdsf.log_path_for_volset('_info', + task=args.task, + dir='pdsinfoshelf'), + pdsf.log_path_for_volset('_info', + task=args.task, + dir='pdsinfoshelf', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_info() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(info) > 1: + logger.blankline() + + if selection: + logger.open('Task "' + args.task + '" for selection ' + + selection, pdsdir.abspath, handler=local_handlers) + else: + logger.open('Task "' + args.task + '" for', pdsdir.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsdir, selection) + + elif args.task == 'reinitialize': + if selection: # don't erase everything else! + update(pdsdir, selection) + else: + reinitialize(pdsdir, selection) + + elif args.task == 'validate': + validate(pdsdir, selection) + + elif args.task == 'repair': + repair(pdsdir, selection) + + else: # update + update(pdsdir, selection) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + print(sys.exc_info()[2]) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + status = 1 + + sys.exit(status) diff --git a/validation/pdslinkshelf.py b/validation/pdslinkshelf.py new file mode 100755 index 0000000..3874214 --- /dev/null +++ b/validation/pdslinkshelf.py @@ -0,0 +1,1718 @@ +#!/usr/bin/env python3 +################################################################################ +# # pdslinkshelf.py library and main program +# +# Syntax: +# pdslinkshelf.py --task path [path ...] +# +# Enter the --help option to see more information. 
+################################################################################ + +import argparse +import datetime +import glob +import os +import pickle +import re +import shutil +import sys + +import pdslogger +import pdsfile +import translator + +LOGNAME = 'pds.validation.links' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +# Holds log file directories temporarily, used by move_old_links() +LOGDIRS = [] + +REPAIRS = translator.TranslatorByRegex([ + + # COCIRS + ('.*/COCIRS_[01].*/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'DIAG.FMT' : 'UNCALIBR/DIAG.FMT', + 'FRV.FMT' : 'UNCALIBR/FRV.FMT', + 'GEO.FMT' : 'NAV_DATA/GEO.FMT', + 'HSK.FMT' : 'HSK_DATA/HSK.FMT', + 'IFGM.FMT' : 'UNCALIBR/IFGM.FMT', + 'IHSK.FMT' : 'UNCALIBR/IHSK.FMT', + 'ISPM.FMT' : 'APODSPEC/ISPM.FMT', + 'OBS.FMT' : 'UNCALIBR/OBS.FMT', + 'POI.FMT' : 'NAV_DATA/POI.FMT', + 'RIN.FMT' : 'NAV_DATA/RIN.FMT', + 'TAR.FMT' : 'NAV_DATA/TAR.FMT'})), + ('.*/COCIRS_[01].*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'DATASIS.TXT' : 'DOCUMENT/DATASIS.PDF', + 'VOLSYS.TXT' : 'DOCUMENT/VOLSYS.PDF'})), + ('.*/COCIRS_[01].*/DATASET\.CAT', 0, + translator.TranslatorByDict( + {'DATASIS.TXT' : 'DATASIS.PDF'})), + ('.*/COCIRS_[01].*/SOFTWARE/DOC/SDOCINFO\.TXT', 0, + translator.TranslatorByDict( + {'vanilla_guide.htm' : 'vanilla-guide.html', + 'vanilla_guide.pdf' : 'vanilla-guide.pdf'})), + ('.*/COCIRS_[01].*/DOCUMENT/DOCINFO\.TXT', 0, + translator.TranslatorByDict( + {'cirs_fov_overview.fig1.tiff' : 'cirs_fov_overview_fig1.tiff', + 'cirs_fov_overview.fig2.tiff' : 'cirs_fov_overview_fig2.tiff', + 'cirs_fov_overview.fig3.tiff' : 'cirs_fov_overview_fig3.tiff'})), + ('.*/COCIRS_[01].*/CUBE/.*\.(LBL|lbl)', 0, + translator.TranslatorByRegex([ + (r'([0-9A-Z_]+)\.DAT', 0, r'\1.tar.gz')])), + ('.*/COCIRS_[56].*/TUTORIAL\.TXT', 0, + translator.TranslatorByDict( + {'GEODATA.FMT' : '../DATA/GEODATA/GEODATA.FMT', + 'ISPMDATA.FMT' : '../DATA/ISPMDATA/ISPMDATA.FMT', + 'POIDATA.FMT' : '../DATA/POIDATA/POIDATA.FMT', + 'RINDATA.FMT' : '../DATA/RINDATA/RINDATA.FMT', + 'TARDATA.FMT' : '../DATA/TARDATA/TARDATA.FMT', + 'filename.FMT' : ''})), + ('.*/COCIRS_[56].*/BROWSE/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../../DATA/APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/TARDATA/\1'), + (r'(GEO[0-9]{10}_[0-9]{3}\.TAB)', 0, r'../../DATA/GEODATA/\1')])), + ('.*/COCIRS_[56].*/DATA/APODSPEC/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + ('.*/COCIRS_[56].*/DATA/ISPMDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + ('.*/COCIRS_[56].*/DATA/RINDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + 
('.*/COCIRS_[56].*/DATA/POIDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + ('.*/COCIRS_[56].*/DATA/TARDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1')])), + ('.*/COCIRS_[56].*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'REF.CAT' : 'CATALOG/CIRSREF.CAT'})), + + # COISS + ('.*/COISS_0.*\.lbl', 0, + translator.TranslatorByDict( + {'PREFIX8.FMT' : 'prefix.fmt'})), + ('.*/COISS_00.*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'calinfo.txt' : '../COISS_0011/calib/calinfo.txt', + 'extrinfo.txt' : '../COISS_0011/extras/extrinfo.txt'})), + ('.*/COISS_0.*/index\.lbl', 0, + translator.TranslatorByDict( + {'CUMINDEX.TAB' : 'index.tab'})), + ('.*/COISS_0011/calib/darkcurrent/wac_\w+_dark_parameters04222\.lbl', 0, + translator.TranslatorByRegex([ + (r'wac_(\w+)_dark_parameters04228\.xdr', 0, r'wac_\1_dark_parameters04222.xdr')])), + ('.*/COISS_[012].*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'Calds.CAT' : '../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'calds.cat' : '../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'Jupiterds.CAT' : '../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'jupiterds.cat' : '../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'Saturnds.CAT' : '../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', + 'saturnds.cat' : '../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', + 'calinfo.txt' : '../../COISS_0xxx/COISS_0011/calib/calinfo.txt', + 'calib.tar.gz' : '../../COISS_0xxx/COISS_0011/calib/calib.tar.gz', + 'in_flight_cal.tex' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.tex', + 'in_flight_cal.pdf' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.pdf', + 'in_flight_cal.lbl' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.lbl', + 'theoretical_basis.tex': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.tex', + 'theoretical_basis.pdf': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.pdf', + 'theoretical_basis.lbl': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.lbl', + 'theoretical_basis.ps' : '../../COISS_0xxx/COISS_0011/document/theoretical_basis.pdf', + 'cisscal.tar.gz' : '../../COISS_0xxx/COISS_0011/extras/cisscal.tar.gz'})), + ('.*/COISS_[012].*/archsis\.txt', 0, + translator.TranslatorByDict( + {'Calds.CAT' : '../../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'calds.cat' : '../../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'Jupiterds.CAT' : '../../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'jupiterds.cat' : '../../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'Saturnds.CAT' : '../../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', + 'saturnds.cat' : '../../../COISS_2xxx/COISS_2001/catalog/saturnds.cat'})), + + # COUVIS + ('.*/COUVIS_0.*/INDEX\.LBL', 0, + translator.TranslatorByDict( + {'CUBEDS.CAT' : '../CATALOG/SCUBEDS.CAT'})), + ('.*/COUVIS_0.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'INST.CAT' : 'CATALOG/UVISINST.CAT', + 'XCALDS.CAT' : 'CATALOG/SCALDS.CAT', + 'XCUBEDS.CAT' : 'CATALOG/SCUBEDS.CAT', + 'XSPECDS.CAT' : 'CATALOG/SSPECDS.CAT', + 'XSSBDS.CAT' : 'CATALOG/SSSBDS.CAT', + 'XWAVDS.CAT' : 
'CATALOG/SWAVDS.CAT'})), + ('.*/COUVIS_0.*/CATALOG/.*\.CAT', 0, + translator.TranslatorByDict( + {'SPECDS.CAT' : 'SSPECDS.CAT', + 'CUBEDS.CAT' : 'SCUBEDS.CAT'})), + ('.*/COUVIS_0.*/SOFTWARE/READERS/READERS_README.TXT', 0, + translator.TranslatorByDict( + {'CATALOG/CUBEDS.CAT' : '../../CATALOG/SCUBEDS.CAT'})), + ('.*/COUVIS_0.*/SOFTWARE/READERS/OLD.*/READERS_README.TXT', 0, + translator.TranslatorByDict( + {'CATALOG/CUBEDS.CAT' : '../../../CATALOG/SCUBEDS.CAT'})), + ('.*/COUVIS_8xxx/.*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'inst.cat' : 'catalog/uvisinst.cat'})), + ('.*/COUVIS_8xxx_v1.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'INST.CAT' : 'CATALOG/UVISINST.CAT'})), + ('.*/COUVIS_8xxx_v2.*/voldesc\.cat', 0, + translator.TranslatorByDict( + {'UVISINST.CAT' : 'catalog/inst.cat', + 'PROJREF.CAT' : ''})), + ('.*/COUVIS_8xxx_v1/.*/CATINFO\.TXT', re.I, + translator.TranslatorByDict( + {'INST.CAT' : 'UVISINST.CAT'})), + ('.*/COUVIS_8xxx(|_v2\.0)/.*/voldesc\.cat', re.I, + translator.TranslatorByDict( + {'PROJREF.CAT' : ''})), + ('.*/metadata/.*/COUVIS_0.*_index\.lbl', 0, + translator.TranslatorByDict( + {'CUBEDS.CAT' : ''})), + + # COVIMS + ('.*/COVIMS_0001/aareadme\.txt', 0, + translator.TranslatorByDict( + {'band_bin_center.fmt' : '../COVIMS_0002/label/band_bin_center.fmt', + 'core_description.fmt' : '../COVIMS_0002/label/core_description.fmt', + 'suffix_description.fmt': '../COVIMS_0002/label/suffix_description.fmt', + 'labinfo.txt' : '../COVIMS_0002/label/labinfo.txt'})), + ('.*/COVIMS_0.../aareadme\.txt', 0, + translator.TranslatorByDict( + {'caldoc.txt' : 'software/doc/caldoc.txt', + 'make_dark.sav' : 'software/bin/make_dark.sav', + 'ppvl_10_1.zip' : 'software/lib/ppvl_1_10.zip', + 'ppvl_1_10.zip' : 'software/lib/ppvl_1_10.zip', + 'libPPVL.a' : 'software/lib/ppvl_1_10/libPPVL.a', + 'Makefile' : 'software/lib/ppvl_1_10/Makefile', + 'Makefile.sun' : 'software/lib/ppvl_1_10/Makefile.sun', + 'PIRL_strings.c' : 'software/lib/ppvl_1_10/PIRL_strings.c', + 'PIRL_strings.h' : 'software/lib/ppvl_1_10/PIRL_strings.h', + 'PPVL.c' : 'software/lib/ppvl_1_10/PPVL.c', + 'PPVL.h' : 'software/lib/ppvl_1_10/PPVL.h', + 'PPVL-README' : 'software/lib/ppvl_1_10/PPVL-README', + 'PPVL_report.c' : 'software/lib/ppvl_1_10/PPVL_report.c', + 'PPVL_selections.c' : 'software/lib/ppvl_1_10/PPVL_selections.c', + 'PPVL_selections.h' : 'software/lib/ppvl_1_10/PPVL_selections.h', + 'RANLIB.csh' : 'software/lib/ppvl_1_10/RANLIB.csh', + 'README' : 'software/lib/ppvl_1_10/README', + 'PPVL.3' : 'software/lib/ppvl_1_10/doc/PPVL.3', + 'PPVL_selections.3' : 'software/lib/ppvl_1_10/doc/PPVL_selections.3', + 'PPVL_report.1' : 'software/lib/ppvl_1_10/doc/PPVL_report.1', + 'PPVL_get_PDS_EOL.3' : 'software/lib/ppvl_1_10/doc/PPVL_get_PDS_EOL.3', + 'bp_trans.c' : 'software/src/c/cube_prep/bp_trans.c', + 'cube_prep.c' : 'software/src/c/cube_prep/cube_prep.c', + 'error.h' : 'software/src/c/ir_bg/error.h', + 'fit.c' : 'software/src/c/ir_bg/fit.c', + 'ir_bg.c' : 'software/src/c/ir_bg/ir_bg.c', + 'ir_bg_sub.c' : 'software/src/c/ir_bg_sub/ir_bg_sub.c', + 'mark_saturated.c' : 'software/src/c/mark_saturated/mark_saturated.c', + 'make_dark.pro' : 'software/src/idl/make_dark.pro', + 'vims_cal_pipe.pl' : 'software/src/perl/vims_cal_pipe.pl', + 'cal_pipe2.pm' : 'software/src/perl/cal_pipe2/cal_pipe2.pm', + 'cal_occultation.pm' : 'software/src/perl/cal_pipe2/cal_occultation.pm', + 'cal_point.pm' : 'software/src/perl/cal_pipe2/cal_point.pm', + 'dark_vis.pm' : 'software/src/perl/cal_pipe2/dark_vis.pm', + 'flat_ir2.pm' : 
'software/src/perl/cal_pipe2/flat_ir2.pm', + 'flat_vis2.pm' : 'software/src/perl/cal_pipe2/flat_vis2.pm', + 'isis_geo.pm' : 'software/src/perl/cal_pipe2/isis_geo.pm', + 'solar_remove.pm' : 'software/src/perl/cal_pipe2/solar_remove.pm', + 'specific_energy.pm' : 'software/src/perl/cal_pipe2/specific_energy.pm'})), + ('.*/COVIMS_0001/data/.*\.lbl', 0, + translator.TranslatorByDict( + {'band_bin_center.fmt' : '../../../COVIMS_0002/label/band_bin_center.fmt', + 'core_description.fmt' : '../../../COVIMS_0002/label/core_description.fmt', + 'suffix_description.fmt': '../../../COVIMS_0002/label/suffix_description.fmt', + 'BAND_BIN_CENTER.FMT' : '../../../COVIMS_0002/label/band_bin_center.fmt', + 'CORE_DESCRIPTION.FMT' : '../../../COVIMS_0002/label/core_description.fmt', + 'SUFFIX_DESCRIPTION.FMT': '../../../COVIMS_0002/label/suffix_description.fmt'})), + ('.*/COVIMS_0001/document/archsis\.txt', 0, + translator.TranslatorByDict( + {'band_bin_center.fmt' : '../../COVIMS_0002/label/band_bin_center.fmt', + 'core_description.fmt' : '../../COVIMS_0002/label/core_description.fmt', + 'suffix_description.fmt': '../../COVIMS_0002/label/suffix_description.fmt', + 'BAND_BIN_CENTER.FMT' : '../../COVIMS_0002/label/band_bin_center.fmt', + 'CORE_DESCRIPTION.FMT' : '../../COVIMS_0002/label/core_description.fmt', + 'SUFFIX_DESCRIPTION.FMT': '../../COVIMS_0002/label/suffix_description.fmt'})), + ('.*/COVIMS_0.*/document/archsis\.txt', 0, + translator.TranslatorByDict( + {'suffix.cat' : ''})), + ('.*/COVIMS_0.*/errata\.txt', 0, + translator.TranslatorByDict( + {'center.fmt' : 'label/band_bin_center.fmt'})), + ('.*/COVIMS_0024/data/2008017T190718_2008017T201544/v1579292302_1\.lbl', 0, + translator.TranslatorByDict( + {"v1579292302.qub" : "v1579292302_1.qub"})), + ('.*/metadata/COVIMS.*/.*supplemental_index.lbl', 0, + translator.TranslatorByDict( + {'dpsis.txt': '../../../volumes/COVIMS_0xxx/COVIMS_0001/document/dpsis.txt'})), + ('.*/COVIMS_8xxx_v2.*/voldesc.cat', 0, + translator.TranslatorByDict( + {'PROJREF.CAT' : ''})), + + # EBROCC + ('.*/EBROCC_0001/INDEX/MCD_INDEX\.LBL', 0, + translator.TranslatorByDict( + {'LIC_INDEX.TAB' : 'MCD_INDEX.TAB'})), + ('.*/EBROCC_0001/INDEX/PAL_INDEX\.LBL', 0, + translator.TranslatorByDict( + {'LIC_INDEX.TAB' : 'PAL_INDEX.TAB'})), + ('.*/EBROCC_0001/SORCDATA/ESO1M/ES1_INGRESS_GEOMETRY\.LBL', 0, + translator.TranslatorByDict( + {'ES1_INGRESS_GEOMETRY.LBL': 'ES1_INGRESS_GEOMETRY.DAT'})), + + # GO + ('.*/GO_0xxx.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'ttds.cat' : '../GO_0020/CATALOG/TTDS.CAT'})), + ('.*/GO_0xxx_v1/GO_00(0[789]|1[0-6])/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'CATSTATUS.TXT' : 'DOCUMENT/CATSTAT.TXT'})), + ('.*/GO_0xxx.*/GO_0001/CATALOG/DATASET\.CAT', 0, + translator.TranslatorByRegex( + [(r'(\w\w\w[1-4][sf]_blm02\.img)', 0, r'../BLEMISH/#UPPER#\1'), + (r'(\w\w\w[sf]_cal0[1-5]\.dat)', 0, r'../SLOPE/#UPPER#\1'), + (r'([123][sf]\w+_dc0[1-5]\.dat)', 0, r'../DARK/#UPPER#\1'), + (r'calibration_so02.img', 0, r'../SHUTTER/CALIBRATION_SO02.IMG')])), + ('.*/GO_0xxx.*/GO_000[2-6]/CATALOG/DATASET\.CAT', 0, + translator.TranslatorByDict( + {'V_E1DS.CAT' : ''})), + ('.*/GO_0xxx.*/GO_0001/DOCUMENT/PDSLABEL\.TXT', 0, + translator.TranslatorByDict( + {'RLINEPRX.FMT' : '../../GO_0002/LABEL/RLINEPRX.FMT', + 'RTLMTAB.FMT' : '../../GO_0002/LABEL/RTLMTAB.FMT'})), + ('.*/GO_0xxx_v1/GO_0001/INDEX/CUMINDEX\.LBL', 0, + translator.TranslatorByDict( + {'IMGINDEX.TAB' : 'CUMINDEX.TAB'})), + ('.*/GO_0xxx_v1/GO_0001/INDEX/P1CUMINDEX\.LBL', 0, + translator.TranslatorByDict( + 
{'IMGINDEX.TAB' : 'P1CUMINDEX.TAB'})), + + # HST + ('.*/HSTJ.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'NST.CAT' : 'CATALOG/INST.CAT'})), + ('.*/HSTJ.*/CATINFO\.TXT', 0, + translator.TranslatorByDict( + {'NST.CAT' : 'INST.CAT'})), + ('.*/HSTJ.*_v.*/HSTJ1_0427/DATA/VISIT_02/.*\.LBL', 0, + translator.TranslatorByDict( + {'J96O02JLQ_FLT_WFC1.JPG': '', + 'J96O02JMQ_FLT_WFC1.JPG': '', + 'J96O02JLQ_FLT_WFC2.JPG': 'J96O02JLQ_FLT.JPG', + 'J96O02JMQ_FLT_WFC2.JPG': 'J96O02JMQ_FLT.JPG', + 'J96O02JOQ_FLT_WFC2.JPG': 'J96O02JOQ_FLT.JPG', + 'J96O02JQQ_FLT_WFC2.JPG': 'J96O02JQQ_FLT.JPG', + 'J96O02JSQ_FLT_WFC2.JPG': 'J96O02JSQ_FLT.JPG'})), + ('.*/HSTJx_xxxx.*_v.*/HSTJ1_2395/DATA/.*\.LBL', 0, + translator.TranslatorByDict( + {'JBNY02SOQ_FLT_WFC1.JPG': '', + 'JBNY02SOQ_FLT_WFC2.JPG': 'JBNY02SOQ_FLT.JPG', + 'JBNY02SQQ_FLT_WFC2.JPG': 'JBNY02SQQ_FLT.JPG', + 'JBNY02SSQ_FLT_WFC2.JPG': 'JBNY02SSQ_FLT.JPG', + 'JBNYA1T2Q_FLT_WFC2.JPG': 'JBNYA1T2Q_FLT.JPG', + 'JBNYA2SUQ_FLT_WFC2.JPG': 'JBNYA2SUQ_FLT.JPG'})), + + # JNOJIR + ('.*/JNOJIR.*/AAREADME.TXT', 0, + translator.TranslatorByDict( + {'PERSON.CAT' : 'JNO_JIRAM_PERSON.CAT', + 'DATAINFO.TXT' : ''})), + ('.*/JNOJIR.*/JIR_IMG_\w+_RESPONSIVITY_V03.LBL', 0, + translator.TranslatorByRegex( + [(r'(JIR_IMG_\w+_RESPONSIVITY)_V02\.DAT', 0, r'\1_V03.DAT')])), + ('.*/JNOJIR_20(2[789]|3\d)/DATA/JIR_\w+.LBL', 0, + translator.TranslatorByRegex( + [(r'(JIR_IMG_\w+_RESPONSIVITY)_V02\.DAT', 0, r'../CALIB/\1_V03.DAT')])), + # Embedded list comprehension + # Each links a SOURCE_PRODUCT_ID on JNOJIR_2nnn to the associated EDR in + # the parallel directory on JNOJIR_1nnn. Set up through volume _2049. + ] + [ + (f'.*/JNOJIR_xxxx/JNOJIR_20{nn:02d}/DATA/JIR_\w+.LBL', 0, + translator.TranslatorByRegex( + [(r'(JIR_\w+_EDR_20\w+)\.(DAT|IMG)', 0, + f'../../JNOJIR_10{nn:02d}/DATA/' + r'\1.\2')])) + for nn in range(0,50)] + [ + + # JNOJNC + ('.*/JNOJNC.*/(AAREADME|CATINFO).TXT', 0, + translator.TranslatorByDict( + {'JUNO_REF.CAT' : 'JUNO_PROJREF.CAT'})), + + # NHSP (and *SP_xxxx) + ('.*/NHSP_xxxx_v1.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'personel.cat' : 'CATALOG/PERSONNEL.CAT', + 'spiceds.cat' : 'CATALOG/SPICE_INST.CAT'})), + ('.*SP_xxxx.*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'dataset.cat' : 'catalog/spiceds.cat', + 'ckinfo.txt' : 'data/ck/ckinfo.txt', + 'ekinfo.txt' : 'data/ek/ekinfo.txt', + 'fkinfo.txt' : 'data/fk/fkinfo.txt', + 'ikinfo.txt' : 'data/ik/ikinfo.txt', + 'lskinfo.txt' : 'data/lsk/lskinfo.txt', + 'pckinfo.txt' : 'data/pck/pckinfo.txt', + 'sclkinfo.txt' : 'data/sclk/sclkinfo.txt', + 'spkinfo.txt' : 'data/spk/spkinfo.txt', + 'ckdoc.txt' : 'document/ck/ckdoc.txt', + 'ekdoc.txt' : 'document/ek/ekdoc.txt', + 'mkinfo.txt' : 'extras/mk/mkinfo.txt', + 'orbinfo.txt' : 'extras/orbnum/orbinfo.txt', + 'spkxinfo.txt' : 'extras/spkxtra/spkxinfo.txt', + 'covinfo.txt' : 'extras/spkxtra/covtab/covinfo.txt', + 'ckxtinfo.txt' : 'extras/ckxtra/ckxtinfo.txt', + 'navinfo.txt' : 'extras/ckxtra/cknav/navinfo.txt', + 'issinfo.txt' : 'extras/ckxtra/ckiss/issinfo.txt'})), + + # NHxxMV/NHxxLO + ('.*/NHxx.._xxxx_v1/NH(JU|LA).*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'PAYLOAD_SSR.LBL' : 'document/payload_ssr/payload_ssr.lbl', + 'RALPH_SSR.LBL' : 'document/ralph_ssr/ralph_ssr.lbl', + 'SOC_INST_ICD.LBL' : 'document/soc_inst_icd/soc_inst_icd.lbl'})), + ('.*/NHxx.._xxxx_v1/NH(JU|LA).*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'PAYLOAD_SSR.LBL' : 'DOCUMENT/PAYLOAD_SSR/PAYLOAD_SSR.LBL', + 'RALPH_SSR.LBL' : 'DOCUMENT/RALPH_SSR/RALPH_SSR.LBL', + 
'SOC_INST_ICD.LBL' : 'DOCUMENT/SOC_INST_ICD/SOC_INST_ICD.LBL'})), + ('.*/NHxxLO_xxxx.*/NH..LO_2001/data/\w+/.*\.lbl', 0, + translator.TranslatorByRegex( + [(r'cflat_grnd_SFA_(\w+\.fit)', 0, r'../../calib/cflat_grnd_sfa_\1'), + (r'(cflat|dead|delta|dsmear|hot|sap)_(\w+\.fit)', 0, r'../../calib/\1_\2')])), + ('.*/NHxxMV_xxxx.*/NH..MV_2001/data/\w+/.*\.lbl', 0, + translator.TranslatorByRegex( + [(r'(mc[0-3])_(flat_\w+\.fit)s', 0, r'../../calib/mcl/\1_\2'), + (r'(mp[12])_(flat_\w+\.fit)s', 0, r'../../calib/mp/\1_\2'), + (r'(mfr_flat_\w+\.fit)s', 0, r'../../calib/mfr/\1')])), + + # RPX + ('.*/RPX_0101.*/R_HARRIS\.LBL', 0, + translator.TranslatorByDict( + {'R_HARRIS.DF' : 'R_HARRIS.PDF'})), + ('.*/RPX_0101.*/F161225AB\.LBL', 0, + translator.TranslatorByDict( + {'F161225RB.GIF' : 'F161225AB.GIF'})), + ('.*/RPX_0201.*/T0808_F1498_CAL\.LBL', 0, + translator.TranslatorByDict( + {'T0808_F1497_CAL.IMG' : 'T0808_F1498_CAL.IMG'})), + ('.*/RPX_0401/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'INSTHOST.CAT' : 'CATALOG/HOST.CAT'})), + + # Any VG + ('.*/VG.*/CATALOG/CATINFO\.TXT', 0, + translator.TranslatorByDict( + {'VGnNINST.CAT' : 'VG1NINST.CAT', + 'VGnHOST.CAT' : 'VG1HOST.CAT'})), + + # VG_20xx (IRIS) + ('.*/VG_2001/.*/VG2_SAT\.LBL', 0, + translator.TranslatorByDict( + {'IRIS_ROWFMT.FMT' : '../JUPITER/IRISHEDR.FMT'})), + ('.*/VG_2001/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'IRISHEDR.FMT' : 'JUPITER/IRISHEDR.FMT', + 'IRISTRGP.FMT' : 'JUPITER/CALIB/IRISTRGP.FMT'})), + + # VG_28xx (ring profiles) + ('.*/VG_28[0-9]{2}/.*INFO\.TXT', 0, + translator.TranslatorByDict( + {'RS1SINST.CAT' : 'VG1SINST.CAT', + 'RS2UINST.CAT' : 'VG2UINST.CAT'})), + ('.*/VG_28xx/VG_2801/CALIB/PS2C01\.LBL', 0, + translator.TranslatorByDict( + {'PS1C01.TAB' : 'PS2C01.TAB'})), + ('.*/VG_28xx/VG_2801/JITTER/PS1J01\.LBL', 0, + translator.TranslatorByDict( + {'PS1J02.TAB' : 'PS1J01.TAB'})), + ('.*/VG_28xx/VG_2801/JITTER/PU2J02\.LBL', 0, + translator.TranslatorByDict( + {'PU2J01.TAB' : 'PU2J02.TAB'})), + ('.*/VG_280./.*/L3GUIDE\.TXT', 0, + translator.TranslatorByDict( + {'RTLMTAB.FMT' : ''})), + ('.*/VG_2802/EDITDATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'INST.CAT' : '../CATALOG/VG1INST.CAT'})), + ('.*/VG_2802/EDITDATA/US3D01P\.LBL', 0, + translator.TranslatorByDict( + {'US3D01I.DAT' : 'US3D01P.DAT'})), + ('.*/VG_2802/SORCDATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'BETAPER.VOY' : 'BETPER.VOY', + 'BETAPER.LBL' : 'BETPER.LBL'})), + ('.*/VG_2803.*/RS.R1BFV\.LBL', 0, + translator.TranslatorByDict( + {'RS_R1BFT.FMT' : 'RS_R1BFV.FMT'})), + + # VGn_9xxx (RSS) + ('.*/VG[12]_9.*/CHECKSUMS.TXT', 0, # any file referenced in CHECKSUMS.TXT + # already has a full path; don't search + translator.TranslatorByRegex([(r'(.*)', 0, r'\1')])), + ('.*/VG[12]_9.*/ERRATA.TXT', 0, + translator.TranslatorByDict( + {'_PERSON.CAT' : 'CATALOG/VG_RSS_PERSON.CAT'})), + ('.*/VG1_9050/CATALOG/CATINFO.TXT', 0, + translator.TranslatorByDict( + {'MISSION.CAT' : 'VG_MISSION.CAT', + 'INST_HOST.CAT' : 'VG1_INST_HOST.CAT', + 'INST.CAT' : 'VG1_RSS_INST.CAT', + 'DS.CAT' : 'VG1_SAT_RSS_DS.CAT', + 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', + 'REF.CAT' : 'VG1_S_RSS_REF.CAT', + 'TARGET.CAT' : 'VG_SAT_TARGET.CAT', + 'VG1_SAT_TARGET.CAT' : 'VG_SAT_TARGET.CAT'})), + ('.*/VG1_9056/CATALOG/CATINFO.TXT', 0, + translator.TranslatorByDict( + {'MISSION.CAT' : 'VG_MISSION.CAT', + 'INSTHOST.CAT' : 'VG1_INST_HOST.CAT', + 'INST.CAT' : 'VG1_RSS_INST.CAT', + 'DS.CAT' : 'VG1_SSA_RSS_DS.CAT', + 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', + 'REF.CAT' : 
'VG1_SSA_RSS_REF.CAT', + 'TARGET.CAT' : 'VG_TITAN_TARGET.CAT'})), + ('.*/VG2_9065/CATALOG/CATINFO.TXT', 0, + translator.TranslatorByDict( + {'MISSION.CAT' : 'VG_MISSION.CAT', + 'INSTHOST.CAT' : 'VG2_INST_HOST.CAT', + 'INST.CAT' : 'VG2_RSS_INST.CAT', + 'DS.CAT' : 'VG2_S_RSS_DS.CAT', + 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', + 'REF.CAT' : 'VG2_S_RSS_REF.CAT', + 'TARGET.CAT' : 'VG_SAT_TARGET.CAT'})), + + # VGIRIS + ('.*/VGIRIS_0001/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : 'DATA/JUPITER_VG1/JUPITER_ASCII.FMT', + 'JUPITER_LSB.FMT' : 'DATA/JUPITER_VG1/JUPITER_LSB.FMT', + 'JUPITER_MSB.FMT' : 'DATA/JUPITER_VG1/JUPITER_MSB.FMT', + 'SATURN_ASCII.FMT' : '', + 'SATURN_LSB.FMT' : '', + 'SATURN_MSB.FMT' : '', + 'VGnINST.CAT' : 'CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : 'CATALOG/VG1HOST.CAT'})), + ('.*/VGIRIS_0001/DATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : 'JUPITER_VG1/JUPITER_ASCII.FMT', + 'JUPITER_LSB.FMT' : 'JUPITER_VG1/JUPITER_LSB.FMT', + 'JUPITER_MSB.FMT' : 'JUPITER_VG1/JUPITER_MSB.FMT', + 'SATURN_ASCII.FMT' : '', + 'SATURN_LSB.FMT' : '', + 'SATURN_MSB.FMT' : '', + 'VGnINST.CAT' : '../CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : '../CATALOG/VG1HOST.CAT'})), + ('.*/VGIRIS_0002/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : '', + 'JUPITER_LSB.FMT' : '', + 'JUPITER_MSB.FMT' : '', + 'SATURN_ASCII.FMT' : 'DATA/SATURN_VG1/SATURN_ASCII.FMT', + 'SATURN_LSB.FMT' : 'DATA/SATURN_VG1/SATURN_LSB.FMT', + 'SATURN_MSB.FMT' : 'DATA/SATURN_VG1/SATURN_MSB.FMT', + 'VGnINST.CAT' : 'CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : 'CATALOG/VG1HOST.CAT'})), + ('.*/VGIRIS_0002/DATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : '', + 'JUPITER_LSB.FMT' : '', + 'JUPITER_MSB.FMT' : '', + 'SATURN_ASCII.FMT' : 'SATURN_VG1/SATURN_ASCII.FMT', + 'SATURN_LSB.FMT' : 'SATURN_VG1/SATURN_LSB.FMT', + 'SATURN_MSB.FMT' : 'SATURN_VG1/SATURN_MSB.FMT', + 'VGnINST.CAT' : '../CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : '../CATALOG/VG1HOST.CAT'})), + + # VGISS + ('.*/VGISS.*/BROWSE/C34801XX/C3480139_.*\.LBL', 0, + translator.TranslatorByDict( + {'C3480140_CALIB.JPG' : 'C3480139_CALIB.JPG', + 'C3480140_CLEANED.JPG' : 'C3480139_CLEANED.JPG', + 'C3480140_GEOMED.JPG' : 'C3480139_GEOMED.JPG', + 'C3480140_RAW.JPG' : 'C3480139_RAW.JPG'})), + ('.*/VGISS.*/BROWSE/C43892XX/C4389208_.*\.LBL', 0, + translator.TranslatorByDict( + {'C4389209_CALIB.JPG' : 'C4389208_CALIB.JPG', + 'C4389209_CLEANED.JPG' : 'C4389208_CLEANED.JPG', + 'C4389209_GEOMED.JPG' : 'C4389208_GEOMED.JPG', + 'C4389209_RAW.JPG' : 'C4389208_RAW.JPG'})), +]) + +KNOWN_MISSING_LABELS = translator.TranslatorByRegex([ + (r'.*/document/.*', re.I, 'missing'), + (r'.*/COCIRS_.*\.VAR', 0, 'missing'), + (r'.*/COCIRS_.*VANILLA.*', re.I, 'missing'), + (r'.*/COCIRS_0209/DATA/NAV_DATA/RIN02101300.DAT', 0, 'missing'), + (r'.*/COCIRS_0602/DATA/UNCALIBR/FIFM06021412.DAT', 0, 'missing'), + (r'.*/COISS_00.*/document/report/.*', 0, 'missing'), + (r'.*/COISS_0011/calib.*\.tab', 0, 'missing'), + (r'.*/COISS_0011/calib/calib.tar.gz', 0, 'missing'), + (r'.*/COISS_0011/extras/.*\.pro', 0, 'missing'), + (r'.*/COISS_0011/extras/cisscal.*', 0, 'missing'), + (r'.*/CO(ISS|VIMS)_.*/extras/.*\.(tiff|png|jpg|jpeg|jpeg_small)', + 0, 'missing'), + (r'.*/COSP_xxxx.*\.(pdf|zip|tm|orb)', 0, 'missing'), + (r'.*/COUVIS_.*/SOFTWARE/.*\.(PRO|pro|DAT|IDL|JAR|SAV)',0, 'missing'), + (r'.*/COUVIS_.*/CALIB/.*\.DOC', 0, 'missing'), + (r'.*/COUVIS_0xxx.*/SOFTWARE/CALIB/VERSION_4/t.t', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/index/index.csv', 
0, 'missing'), + (r'.*/COVIMS_0xxx.*/software/.*', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/calib/example.*', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/calib/.*\.(tab|qub|cub|bin|lbl)', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/browse/.*\.pdf', 0, 'missing'), + (r'.*/COVIMS_0xxx.*\.(lbl|qub)-old_V[0-9]+', 0, 'missing'), + (r'.*/GO_0xxx_v1/GO_0001/CATALOG/REF.CAT.BAK', 0, 'missing'), + (r'.*/GO_0xxx.*/GO_0001/SOFTWARE/GALSOS2.EXE', 0, 'missing'), + (r'.*/GO_0xxx_v1/GO_0016/AAREADME.SL9', 0, 'missing'), + (r'.*/JNOJNC_0xxx.*/EXTRAS/.*\.PNG', 0, 'missing'), + (r'.*/NH.*/browse/.*\.jpg', 0, 'missing'), + (r'.*/NH.*/index/newline', 0, 'missing'), + (r'.*/NHxxMV.*/calib/.*\.png', 0, 'missing'), + (r'.*/NHSP_xxxx.*/DATASET.HTML', 0, 'missing'), + (r'.*/RPX.*/UNZIP532.*', 0, 'missing'), + (r'.*/RPX_xxxx/RPX_0201/CALIB/.*/(-180|128)', 0, 'missing'), + (r'.*/VG.*/VG..NESR\.DAT', 0, 'missing'), + (r'.*/VG_0xxx.*/CUMINDEX.TAB', 0, 'missing'), + (r'.*/VG_0xxx.*/SOFTWARE/.*', 0, 'missing'), + (r'.*/VG._9xxx.*/SOFTWARE/.*', 0, 'missing'), + (r'.*/VG2_9065/BROWSE/C0SR01AA.LOG', 0, 'missing'), + +# These files have internal PDS3 labels, so these are not errors + (r'.*/COISS_3xxx.*\.IMG', 0, 'unneeded'), + (r'.*/COUVIS_.*/SOFTWARE/.*\.txt_.*', 0, 'unneeded'), + (r'.*/VG_.*\.(IMQ|IRQ|IBG)', 0, 'unneeded'), + (r'.*/VG_0xxx.*/(AAREADME.VMS|VTOC.SYS|IMGINDEX.DBF)', 0, 'unneeded'), +]) + +# Match pattern for any file name, but possibly things that are not file names +PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?' + +# Match pattern for the file name in anything of the form "keyword = filename" +TARGET_REGEX1 = re.compile(r'^ *\^?\w+ *= *\(?\{? *' + PATTERN, re.I) + +# Match pattern for a file name on a line by itself +TARGET_REGEX2 = re.compile(r'^ *,? *' + PATTERN, re.I) + +# Match pattern for one or more file names embedded in a row of a text file. +# A file name begins with a letter, followed by any number of letters, digits, +# underscore or dash. Unless the name is "Makefile", it must have one or more +# extensions, each containing one or more characters. It can also have any +# number of directory prefixes separate by slashes. + +LINK_REGEX = re.compile(r'(?:|.*?[^/@\w\.])/?(?:\.\./)*(([A-Z0-9][-\w]+/)*' + + r'(makefile\.?|[A-Z0-9][\w-]*(\.[\w-]+)+))', re.I) + +EXTS_WO_LABELS = set(['.LBL', '.CAT', '.TXT', '.FMT', '.SFD']) + +################################################################################ + +class LinkInfo(object): + """Used internally to describe a link within a specified record of a file. + """ + + def __init__(self, recno, linkname, is_target): + + self.recno = recno # record number + self.linktext = linkname # substring within this record that looks + # like a link. + self.linkname = linkname # link text after possible repair for known + # errors. + self.is_target = is_target # True if, based on the local context, this + # might be a target of a label file + self.target = '' # abspath to target of link, if any. + # If not blank, this file must exist. 
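+
+    # Illustrative example (hypothetical file name): a label record such as
+    #   ^TABLE = "ABC0123.TAB"
+    # found in record 5 would be recorded as LinkInfo(5, 'ABC0123.TAB', True);
+    # .target is filled in later, once the referenced file is located on disk.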
+ + def remove_path(self): + """Remove any leading directory path from this LinkInfo object.""" + + if '/' in self.linktext: + self.linktext = self.linktext.rpartition('/')[2] + self.linkname = self.linktext + + def __str__(self): + return ('%d %s %s %s' % (self.recno, self.linktext, str(self.is_target), + self.target or '[' + self.linkname + ']')) + +def generate_links(dirpath, old_links={}, + limits={'info':-1, 'debug':500, 'ds_store':10}, logger=None): + """Generate a dictionary keyed by the absolute file path for files in the + given directory tree, which must correspond to a volume. + + Keys ending in .LBL, .CAT and .TXT return a list of tuples + (recno, link, target) + for each link found. Here, + recno = record number in file; + link = the text of the link; + target = absolute path to the target of the link. + + Other keys return a single string, which indicates the absolute path to the + label file describing this file. + + Unlabeled files not ending in .LBL, .CAT or .TXT return an empty string. + + Also return the latest modification date among all the files checked. + """ + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Finding link shelf files', dirpath, limits) + + try: + + linkinfo_dict = old_links.copy() # abspath: list of LinkInfo objects + label_dict = {k:v for k,v in old_links.items() if isinstance(v,str)} + # abspath: label for this file + abspaths = [] # list of all abspaths + + latest_mtime = 0. + + # Walk the directory tree, one subdirectory "root" at a time... + for (root, dirs, files) in os.walk(dirpath): + + local_basenames = [] # Tracks the basenames in this directory + local_basenames_uc = [] # Same as above, but upper case + for basename in files: + abspath = os.path.join(root, basename) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + + if basename == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store file skipped', abspath) + continue + + if basename.startswith('._'): # skip dot_underscore files + logger.dot_underscore('dot_underscore file skipped', + abspath) + continue + + if basename.startswith('.'): # skip invisible files + logger.invisible('Invisible file skipped', abspath) + continue + + abspaths.append(abspath) + local_basenames.append(basename) + local_basenames_uc.append(basename.upper()) + + # Update linkinfo_dict, searching each relevant file for possible links. + # If the linking file is a label and the target file has a matching + # name, update the label_dict entry for the target. + candidate_labels = {} # {target: list of possible label basenames} + for basename in local_basenames: + + abspath = os.path.join(root, basename) + if abspath in linkinfo_dict: # for update op, skip existing links + continue + + basename_uc = basename.upper() + + # Only check LBL, CAT, TXT, etc. 
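+                # (specifically, extensions in EXTS_WO_LABELS: .LBL, .CAT,
+                # .TXT, .FMT and .SFD; only these files are scanned for links)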
+ ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext not in EXTS_WO_LABELS: + continue + + # Get list of link info for all possible linked filenames + logger.debug('*** REVIEWING', abspath) + linkinfo_list = read_links(abspath, logger=logger) + + # Apply repairs + repairs = REPAIRS.all(abspath) + for info in linkinfo_list: + for repair in repairs: + linkname = repair.first(info.linktext) + if linkname is None: + + # Attempt repair with leading directory path removed + if '/' in info.linktext: + info.remove_path() + linkname = repair.first(info.linktext) + + if linkname is None: + continue # no repair found + + info.linkname = linkname + if linkname == '': + logger.info('Ignoring link "%s"' % + info.linktext, abspath, force=True) + else: + logger.info('Repairing link "%s"->"%s"' % + (info.linktext, linkname), + abspath, force=True) + + # Validate non-local targets of repairs + if '/' in linkname: + target = os.path.join(root, linkname) + if os.path.exists(target): + info.target = os.path.abspath(target) + else: + logger.error('Target of repaired link is missing', + target) + + break # apply only one repair per found link + + # Validate or remove other targets + new_linkinfo_list = [] + baseroot_uc = basename_uc.partition('.')[0] + ltest = len(baseroot_uc) + for info in linkinfo_list: + if info.target: # Non-local, repaired links have targets + new_linkinfo_list.append(info) + continue + + # A blank linkname is from a repair; indicates to ignore + if info.linkname == '': + continue + + # Ignore self-references + linkname_uc = info.linkname.upper() + if linkname_uc == basename_uc: + continue + + # Check for target inside this directory + try: + match_index = local_basenames_uc.index(linkname_uc) + except ValueError: + match_index = None + + # If not found, maybe it is a non-local reference (.FMT perhaps) + if match_index is None: + + # It's easy to pick up floats as link candidates; ignore + try: + _ = float(info.linkname) + continue # Yup, it's just a float + except ValueError: + pass + + if info.linkname[-1] in ('e', 'E'): + try: + _ = float(info.linkname[:-1]) + continue # Float with exponent + except ValueError: + pass + + # Also ignore format specifications (e.g., "F10.3") + if info.linkname[0] in ('F', 'E', 'G'): + try: + _ = float(info.linkname[1:]) + continue # Format + except ValueError: + pass + + # Search non-locally + if '/' in info.linkname: + nonlocal_target = locate_link_with_path(abspath, + info.linkname) + else: + nonlocal_target = locate_nonlocal_link(abspath, + info.linkname) + + # Report the outcome + if nonlocal_target: + logger.debug('Located "%s"' % info.linkname, + nonlocal_target) + info.target = nonlocal_target + new_linkinfo_list.append(info) + continue + + if linkname_uc.endswith('.FMT'): + logger.error('Unable to locate .FMT file "%s"' % + info.linkname, abspath) + elif linkname_uc.endswith('.CAT'): + logger.error('Unable to locate .CAT file "%s"' % + info.linkname, abspath) + else: + logger.debug('Substring "%s" is not a link, ignored' % + info.linkname, abspath) + + continue + + # Save the match + info.linkname = local_basenames[match_index] # update case + info.target = os.path.join(root, info.linkname) + new_linkinfo_list.append(info) + + # Could this be the label? 
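+                    # (For example, a link text "ABC0123.TAB" found inside
+                    # "ABC0123.LBL" marks that label as the label of the .TAB
+                    # file; the names here are hypothetical.)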
+ if ext != '.LBL': # nope + continue + + # If names match up to '.LBL', then yes + if (len(linkname_uc) > ltest and + linkname_uc[:ltest] == baseroot_uc and + linkname_uc[ltest] == '.'): + label_dict[info.target] = abspath + logger.debug('Label identified for %s' % info.linkname, + abspath) + continue + + # Otherwise, then maybe + if info.is_target: + if info.linkname in candidate_labels: + if basename not in candidate_labels[info.linkname]: + candidate_labels[info.linkname].append(basename) + else: + candidate_labels[info.linkname] = [basename] + + logger.debug('Candidate label found for ' + + info.linkname, abspath) + + linkinfo_dict[abspath] = new_linkinfo_list + + # Identify labels for files + for basename in local_basenames: + + basename_uc = basename.upper() + ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext in (".LBL", ".FMT"): # these can't have labels + continue + + abspath = os.path.join(root, basename) + if abspath in label_dict: + continue # label already found + + # Maybe we already know the label is missing + test = KNOWN_MISSING_LABELS.first(abspath) + if test == 'unneeded': + logger.debug('Label is not neeeded', abspath) + continue + + if test == 'missing': + logger.debug('Label is known to be missing', abspath) + continue + + # Determine if a label is required + label_is_required = (ext not in EXTS_WO_LABELS) + + # Get the list of candidate labels in this directory + candidates = candidate_labels.get(basename, []) + + # Determine if the obvious label file exists + label_guess_uc = basename_uc.partition('.')[0] + '.LBL' + if label_guess_uc in local_basenames_uc: + k = local_basenames_uc.index(label_guess_uc) + obvious_label_basename = local_basenames[k] + else: + obvious_label_basename = '' + + # Simplest case... + if obvious_label_basename in candidates: + if not label_is_required: + logger.debug('Unnecessary label found', abspath, force=True) + + label_dict[abspath] = os.path.join(root, obvious_label_basename) + continue + + # More cases... + if not label_is_required: + continue # leave abspath out of label_dict + + # Report a phantom label + if obvious_label_basename: + logger.error('Label %s does not point to file' % + local_basenames[k], abspath) + + if len(candidates) == 1: + logger.debug('Label found as ' + candidates[0], abspath, + force=True) + label_dict[abspath] = os.path.join(root, candidates[0]) + continue + + # or errors... + label_dict[abspath] = "" + if len(candidates) == 0: + logger.error('Label is missing', abspath) + else: + logger.error('Ambiguous label found as %s' % candidates[0], + abspath, force=True) + for candidate in candidates[1:]: + logger.debug('Alternative label found as %s' % candidate, + abspath, force=True) + + # Merge the dictionaries + # There are cases where a file can have both a list of links and a label. + # This occurs when a .TXT or .CAT file has a label, even though it didn't + # need one. In the returned dictionary, link lists take priority. 
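+        # Sketch of the merged dictionary (paths are hypothetical):
+        #   link_dict['.../CATALOG/CATINFO.TXT'] -> [(recno, linktext, target), ...]
+        #   link_dict['.../DATA/ABC0123.TAB']    -> '.../DATA/ABC0123.LBL'  (its label)
+        #   link_dict['.../BROWSE/ABC0123.JPG']  -> ''  (no links and no label)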
+ link_dict = {} + for key in abspaths: + if key in linkinfo_dict: + # If this is a new entry, it's a list of LinkInfo objects + # If this was copied from old_links, it's already a list of tuples + values = linkinfo_dict[key] + if isinstance(values, list): + new_list = [] + for item in values: + if isinstance(item, LinkInfo): + new_list.append((item.recno, item.linktext, item.target)) + else: + new_list.append(item) + link_dict[key] = new_list + else: + link_dict[key] = values + elif key in label_dict: + link_dict[key] = label_dict[key] + else: + link_dict[key] = '' + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + return (link_dict, latest_mtime) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +def read_links(abspath, logger=None): + """Return a list of LinkInfo objects for anything linked or labeled by this + file. + """ + + with open(abspath, 'r', encoding='latin-1') as f: + recs = f.readlines() + + links = [] + multiple_targets = False + for recno,rec in enumerate(recs): + + while True: + + # Search for the target of a link + is_target = True + matchobj = TARGET_REGEX1.match(rec) + if matchobj: + subrec = rec[:matchobj.end()] + if '(' in subrec or '{' in subrec: + multiple_targets = True + + # ... on the same line or the next line + elif multiple_targets: + matchobj = TARGET_REGEX2.match(rec) + + # If not found, search for any other referenced file name or path + if not matchobj: + if ')' in rec or '}' in rec: + multiple_targets = False + + is_target = False + matchobj = LINK_REGEX.match(rec) + if matchobj: + multiple_targets = False + + # No more matches in this record + if not matchobj: + break + + linktext = matchobj.group(1) + links.append(LinkInfo(recno, linktext, is_target)) + + rec = rec[matchobj.end():] + + return links + +def locate_nonlocal_link(abspath, filename): + """Return the absolute path associated with a link in a PDS file. This is + done by searching up the tree and also by looking inside the LABEL, + CATALOG and INCLUDE directories if they exist.""" + + filename_uc = filename.upper() + + parts = abspath.split('/')[:-1] + + # parts are [..., 'holdings', 'volumes', volset, volname, ...] + # Therefore, if 'holdings' is in parts[:-3], then there's a volname in this + # path. + while 'holdings' in parts[:-3]: + testpath = '/'.join(parts) + basenames = os.listdir(testpath) + basenames_uc = [b.upper() for b in basenames] + try: + k = basenames_uc.index(filename_uc) + return testpath + '/' + basenames[k] + except ValueError: + pass + + for dirname in ['LABEL', 'CATALOG', 'INCLUDE', 'INDEX', 'DOCUMENT', + 'DATA', 'CALIB', 'EXTRAS', 'SOFTWARE']: + try: + k = basenames_uc.index(dirname) + subnames = os.listdir(testpath + '/' + basenames[k]) + subupper = [s.upper() for s in subnames] + try: + kk = subupper.index(filename_uc) + return testpath + '/' + basenames[k] + '/' + subnames[kk] + except ValueError: + pass + except ValueError: + pass + + parts = parts[:-1] + + return '' + +def locate_link_with_path(abspath, filename): + """Return the absolute path associated with a link that contains a leading + directory path. 
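+
+    For example (hypothetical link text), "LABEL/GEO.FMT" is resolved by first
+    locating a "LABEL" directory via locate_nonlocal_link(), then matching each
+    remaining path component against the directory listing, case-insensitively.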
+ """ + + parts = filename.split('/') + link_path = locate_nonlocal_link(abspath, parts[0]) + if not link_path: + return '' + + for part in parts[1:]: + basenames = os.listdir(link_path) + if part in basenames: + link_path += '/' + part + else: + basenames_uc = [b.upper() for b in basenames] + part_uc = part.upper() + if part_uc in basenames_uc: + k = basenames_uc.index(part_uc) + link_path += '/' + basenames[k] + else: + return '' + + return link_path + +################################################################################ + +def load_links(dirpath, limits={}, logger=None): + """Load link dictionary from a shelf file, converting interior paths to + absolute paths.""" + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + + dirpath_ = dirpath.rstrip('/') + '/' + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Reading link shelf file for', dirpath, limits) + + try: + (link_path, lskip) = pdsdir.shelf_path_and_lskip('link') + prefix_ = pdsdir.volume_abspath() + '/' + + logger.info('Link shelf file', link_path) + + if not os.path.exists(link_path): + raise IOError('File not found: ' + link_path) + + # Read the shelf file and convert to a dictionary + with open(link_path, 'rb') as f: + interior_dict = pickle.load(f) + + # Convert interior paths to absolute paths + link_dict = {} + for (key, values) in interior_dict.items(): + long_key = dirpath_ + key + + if isinstance(values, list): + new_list = [] + for (recno, basename, interior_path) in values: + abspath = dirpath_ + str(interior_path) + if '../' in abspath: + abspath = os.path.abspath(abspath) + + new_list.append((recno, str(basename), abspath)) + + link_dict[long_key] = new_list + else: + values = str(values) + if values == '': + link_dict[long_key] = '' + else: + link_dict[long_key] = dirpath_ + values + + return link_dict + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def write_linkdict(dirpath, link_dict, limits={}, logger=None): + """Write a new link shelf file for a directory tree.""" + + # Initialize + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing link shelf file for', dirpath, limits) + + try: + (link_path, lskip) = pdsdir.shelf_path_and_lskip('link') + logger.info('Link shelf file', link_path) + + # Create a dictionary using interior paths instead of absolute paths + interior_dict = {} + prefix = (dirpath + '/')[:lskip] + for (key, values) in link_dict.items(): + if isinstance(values, list): + new_list = [] + for (basename, recno, link_abspath) in values: + if link_abspath[:lskip] == prefix: + new_list.append((basename, recno, link_abspath[lskip:])) + else: # link outside this volume + link = pdsfile.PdsFile.from_abspath(link_abspath) + if (link.category_ == pdsdir.category_ and + link.volset == pdsdir.volset and + link.suffix == pdsdir.suffix): + link_relpath = '../' + link.volname_ + link.interior + elif link.category_ == pdsdir.category_: + link_relpath = ('../../' + link.volset_ + + link.volname_ + link.interior) + else: + link_relpath = ('../../../' + link.category_ + + link.volset_ + + link.volname_ + link.interior) + new_list.append((basename, recno, link_relpath)) + + interior_dict[key[lskip:]] = new_list + 
else: + interior_dict[key[lskip:]] = values[lskip:] + + # Create parent directory if necessary + parent = os.path.split(link_path)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + # Write the shelf + with open(link_path, 'wb') as f: + pickle.dump(interior_dict, f) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + logger.open('Writing Python dictionary', dirpath) + try: + # Determine the maximum length of the file path and basename + len_key = 0 + len_base = 0 + for (key, value) in interior_dict.items(): + len_key = max(len_key, len(key)) + if isinstance(value, list): + tuples = value + for (recno, basename, interior_path) in tuples: + len_base = max(len_base, len(basename)) + + len_key = min(len_key, 60) + + # Write the python dictionary version + python_path = link_path.rpartition('.')[0] + '.py' + name = os.path.basename(python_path) + parts = name.split('_') + name = '_'.join(parts[:2]) + '_links' + keys = list(interior_dict.keys()) + keys.sort() + + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for valtype in (list, str): + for key in keys: + if not isinstance(interior_dict[key], valtype): continue + + f.write(' "%s"' % key) + if len(key) < len_key: + f.write((len_key - len(key)) * ' ') + f.write(': ') + tuple_indent = max(len(key),len_key) + 7 + + values = interior_dict[key] + if isinstance(values, str): + f.write('"%s",\n' % values) + elif len(values) == 0: + f.write('[],\n') + else: + f.write('[') + for k in range(len(values)): + (recno, basename, interior_path) = values[k] + f.write('(%4d, ' % recno) + f.write('"%s, ' % (basename + '"' + + (len_base-len(basename)) * ' ')) + f.write('"%s")' % interior_path) + + if k < len(values) - 1: + f.write(',\n' + tuple_indent * ' ') + else: + f.write('],\n') + + f.write('}\n\n') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_links(dirpath, dirdict, shelfdict, limits={}, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.PdsFile.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Validating link shelf file for', dirpath, limits=limits) + + try: + keys = list(dirdict.keys()) + for key in keys: + if key in shelfdict: + dirinfo = dirdict[key] + shelfinfo = shelfdict[key] + + if type(dirinfo) == list: + dirinfo.sort() + + if type(shelfinfo) == list: + shelfinfo.sort() + + if dirinfo != shelfinfo: + logger.error('Link target mismatch', key) + + del shelfdict[key] + del dirdict[key] + + keys = list(dirdict.keys()) + keys.sort() + for key in keys: + logger.error('Missing link shelf file entry for', key) + + keys = list(shelfdict.keys()) + keys.sort() + for key in keys: + logger.error('Link shelf file entry found for missing file', key) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + return logger.close() + +################################################################################ + +def move_old_links(shelf_file, logger=None): + """Move a file to the /logs/ directory tree and append a time tag.""" + + if not os.path.exists(shelf_file): return + + shelf_basename = os.path.basename(shelf_file) + (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename) + + if logger is None: 
+ logger = pdslogger.PdsLogger.get_logger(LOGNAME) + + from_logged = False + for log_dir in LOGDIRS: + dest_template = log_dir + '/' + shelf_prefix + '_v???' + shelf_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(shelf_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(shelf_file, dest) + + if not from_logged: + logger.info('Link shelf file moved from: ' + shelf_file) + from_logged = True + + logger.info('Link shelf file moved to ' + dest) + + python_src = shelf_file.rpartition('.')[0] + '.py' + python_dest = dest.rpartition('.')[0] + '.py' + shutil.copy(python_src, python_dest) + + pickle_src = shelf_file.rpartition('.')[0] + '.pickle' + pickle_dest = dest.rpartition('.')[0] + '.pickle' + shutil.copy(pickle_src, pickle_dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file does not exist + if os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Link shelf file already exists', link_path) + return + + # Generate link info + (link_dict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Move old file if necessary + if os.path.exists(link_path): + move_old_links(link_path, logger=logger) + + # Save link files + write_linkdict(pdsdir.abspath, link_dict, logger=logger) + +def reinitialize(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Warn if shelf file does not exist + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + initialize(pdsdir, logger=logger) + return + + # Generate link info + (link_dict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Move old file if necessary + if os.path.exists(link_path): + move_old_links(link_path, logger=logger) + + # Save link files + write_linkdict(pdsdir.abspath, link_dict, logger=logger) + +def validate(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Link shelf file does not exist', link_path) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Validate + validate_links(pdsdir.abspath, dir_linkdict, shelf_linkdict, logger=logger) + +def repair(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, latest_mtime) = generate_links(pdsdir.abspath, logger=logger) + + # Compare + canceled = (dir_linkdict == shelf_linkdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) 
+ + link_pypath = link_path.replace('.pickle', '.py') + link_mtime = min(os.path.getmtime(link_path), + os.path.getmtime(link_pypath)) + if latest_mtime > link_mtime: + logger.info('!!! Link shelf file content is up to date', + link_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + dt = datetime.datetime.fromtimestamp(link_mtime) + logger.info('!!! Link shelf file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - link_mtime + if delta >= 86400/10: + logger.info('!!! Link shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Link shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(link_path) + os.utime(link_pypath) + logger.info('!!! Time tag on link shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + else: + logger.info(f'!!! Link shelf file is up to date; repair canceled', + link_path, force=True) + return + + # Move files and write new links + move_old_links(link_path, logger=logger) + write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger) + +def update(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure link shelf file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + initialize(pdsdir, logger=logger) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, + latest_mtime) = generate_links(pdsdir.abspath, shelf_linkdict, + logger=logger) + + # Compare + canceled = (dir_linkdict == shelf_linkdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Link shelf file content is complete; update canceled', + link_path, force=True) + return + + # Move files and write new links + move_old_links(link_path, logger=logger) + write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger) + +################################################################################ + +if __name__ == '__main__': + + # Set up parser + parser = argparse.ArgumentParser( + description='pdslinkshelf: Create, maintain and validate shelves of ' + + 'links between files.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create a link shelf file for a volume. Abort ' + + 'if the checksum file already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create a link shelf file for a volume. Replace ' + + 'the file if it already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every link in a volume directory tree ' + + 'against its link shelf file.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every link in a volume directory tree ' + + 'against its link shelf file. If any ' + + 'disagreement is found, replace the shelf ' + + 'file; otherwise leave it unchanged. 
If any of ' + + 'the files checked are newer than the link shelf '+ + 'file, update shelf file\'s modification date') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their links to the link shelf file. Links of ' + + 'pre-existing files are not checked.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root directory of a volume.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdslinkshelf" subdirectory of each log root ' + + 'directory.' + ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdslinkshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.PdsFile.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdslinkshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of file paths before logging + paths = [] + for path in args.volume: + + if not os.path.exists(path): + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.PdsFile.from_abspath(path) + + if pdsf.checksums_: + print('No link shelf files for checksum files: ' + path) + sys.exit(1) + + if pdsf.archives_: + print('No link shelf files for archive files: ' + path) + sys.exit(1) + + if pdsf.is_volset_dir: + paths += [os.path.join(path, c) for c in pdsf.childnames] + + else: + paths.append(os.path.abspath(path)) + + # Loop through tuples... 
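+    # (At this point each entry in paths is the absolute path of a single
+    # volume directory; volset-level inputs were expanded into their child
+    # volumes above.)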
+ logger.open(' '.join(sys.argv)) + try: + for path in paths: + + pdsdir = pdsfile.PdsFile.from_abspath(path) + if not pdsdir.isdir: # skip volset-level readme files + continue + + # Save logs in up to two places + logfiles = set([pdsdir.log_path_for_volume('_links', + task=args.task, + dir='pdslinkshelf'), + pdsdir.log_path_for_volume('_links', + task=args.task, + dir='pdslinkshelf', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_links() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(paths) > 1: + logger.blankline() + + logger.open('Task "' + args.task + '" for', path, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsdir) + + elif args.task == 'reinitialize': + reinitialize(pdsdir) + + elif args.task == 'validate': + validate(pdsdir) + + elif args.task == 'repair': + repair(pdsdir) + + else: # update + update(pdsdir) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) diff --git a/validation/re-validate.py b/validation/re-validate.py new file mode 100755 index 0000000..fc65395 --- /dev/null +++ b/validation/re-validate.py @@ -0,0 +1,818 @@ +#!/usr/bin/env python3 +################################################################################ +# re-validate.py +# +# Syntax: +# re-validate.py path [path ...] +# +# Enter the --help option to see more information. 
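+#
+# Example (the volume path shown is hypothetical):
+#   re-validate.py /holdings/volumes/COISS_2xxx/COISS_2001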
+################################################################################
+
+import sys
+import os
+import glob
+import argparse
+import datetime
+import socket
+from smtplib import SMTP
+
+import pdslogger
+import pdsfile
+import pdschecksums
+import pdsarchives
+import pdsinfoshelf
+import pdslinkshelf
+import pdsdependency
+
+LOGNAME = 'pds.validation.re-validate'
+LOGROOT_ENV = 'PDS_LOG_ROOT'
+
+SERVER = 'list.seti.org'
+FROM_ADDR = "PDS Administrator "
+REPORT_SUBJ = "Re-validate report from " + socket.gethostname()
+REPORT_SUBJ_W_ERRORS = "Re-validate report with ERRORs from " + \
+                       socket.gethostname()
+ERROR_REPORT_SUBJ = "Re-validate ERROR report from " + socket.gethostname()
+
+################################################################################
+# Function to validate one volume
+################################################################################
+
+def validate_one_volume(pdsdir, voltypes, tests, args, logger):
+    """Validates one volume."""
+
+    tests_performed = 0
+
+    # Open logger for this volume
+    logfiles = set([pdsdir.log_path_for_volume('_re-validate',
+                                               dir='re-validate'),
+                    pdsdir.log_path_for_volume('_re-validate',
+                                               dir='re-validate',
+                                               place='parallel')])
+
+    local_handlers = []
+    for logfile in logfiles:
+        logfile = logfile.replace('/volumes/','/')  # this subdir not needed
+        local_handlers.append(pdslogger.file_handler(logfile))
+        logdir = os.path.split(logfile)[0]
+        logdir = os.path.split(logdir)[0]
+
+        # These handlers are only used if they don't already exist
+        warning_handler = pdslogger.warning_handler(logdir)
+        error_handler = pdslogger.error_handler(logdir)
+        local_handlers += [warning_handler, error_handler]
+
+    logger.blankline()
+    logger.open('Re-validate ' + pdsdir.abspath, handler=local_handlers)
+    try:
+
+        logger.info('Last modification', pdsdir.date)
+        logger.info('Volume types', str(voltypes)[1:-1].replace("'",""))
+        logger.info('Tests', str(tests)[1:-1].replace("'",""))
+        logger.blankline()
+
+        # Checksums and archives for each voltype...
+        for voltype in voltypes:
+            abspath = pdsdir.abspath.replace('/volumes/',
+                                             '/' + voltype + '/')
+            if not os.path.exists(abspath):
+                continue
+
+            temp_pdsdir = pdsfile.PdsFile.from_abspath(abspath)
+            if args.checksums:
+                logger.open('Checksum re-validation for', abspath)
+                try:
+                    pdschecksums.validate(temp_pdsdir, logger=logger)
+                finally:
+                    tests_performed += 1
+                    logger.close()
+
+            if args.archives:
+                logger.open('Archive re-validation for', abspath)
+                try:
+                    pdsarchives.validate(temp_pdsdir, logger=logger)
+                finally:
+                    tests_performed += 1
+                    logger.close()
+
+        # Checksums for each 'archive-' + voltype...
+        if args.checksums and args.archives:
+            for voltype in voltypes:
+                abspath = pdsdir.abspath.replace('/volumes/',
+                                                 '/archives-' + voltype + '/')
+                abspath += '*.tar.gz'
+                abspath = glob.glob(abspath)
+                if not abspath:
+                    continue
+
+                abspath = abspath[0]    # there should only be one
+
+                (prefix, basename) = os.path.split(abspath)
+                temp_pdsdir = pdsfile.PdsFile.from_abspath(prefix)
+                logger.open('Checksum re-validation for', abspath)
+                try:
+                    pdschecksums.validate(temp_pdsdir, basename, logger)
+                finally:
+                    tests_performed += 1
+                    logger.close()
+
+        # Infoshelves and linkshelves for each voltype...
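+        # (Infoshelf files are re-checked for every voltype present; linkshelf
+        # files are only re-checked for the 'volumes', 'calibrated' and
+        # 'metadata' trees, as tested below.)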
+ for voltype in voltypes: + abspath = pdsdir.abspath.replace('/volumes/', + '/' + voltype + '/') + if not os.path.exists(abspath): + continue + + temp_pdsdir = pdsfile.PdsFile.from_abspath(abspath) + if args.infoshelves: + logger.open('Infoshelf re-validatation for', abspath) + try: + pdsinfoshelf.validate(temp_pdsdir, logger=logger) + finally: + tests_performed += 1 + logger.close() + + if (args.linkshelves and + voltype in ('volumes', 'calibrated', 'metadata')): + logger.open('Linkshelf re-validatation for', abspath) + try: + pdslinkshelf.validate(temp_pdsdir, logger=logger) + finally: + tests_performed += 1 + logger.close() + + # Infoshelves for each 'archive-' + voltype... + if args.infoshelves and args.archives: + for voltype in voltypes: + abspath = pdsdir.abspath.replace('/volumes/', + '/archives-' + voltype + '/') + abspath += '*.tar.gz' + abspath = glob.glob(abspath) + if not abspath: + continue + + abspath = abspath[0] # there should only be one + + (prefix, basename) = os.path.split(abspath) + temp_pdsdir = pdsfile.PdsFile.from_abspath(prefix) + logger.open('Infoshelf re-validatation for', abspath) + try: + pdsinfoshelf.validate(temp_pdsdir, basename, logger) + finally: + tests_performed += 1 + logger.close() + + # Dependencies + if args.dependencies: + if args.timeless: + logger.open('Timeless dependency re-validation for', abspath) + else: + logger.open('Dependency re-validation for', abspath) + try: + pdsdependency.test(pdsdir, logger=logger, + check_newer=(not args.timeless)) + finally: + tests_performed += 1 + logger.close() + + except Exception as e: + logger.exception(e) + + finally: + if tests_performed == 1: + logger.info('1 re-validation test performed', pdsdir.abspath) + else: + logger.info('%d re-validation tests performed' % tests_performed, + pdsdir.abspath) + (fatal, errors, warnings, tests) = logger.close() + + return (logfile, fatal, errors) + +################################################################################ +# Log and volume management for batch mode +################################################################################ + +def volume_abspath_from_log(log_path): + """Return the absolute path within the holdings directory of the PDS volume + described by this validation log. + """ + + with open(log_path) as f: + rec = f.readline() + + parts = rec.split('|') + return parts[-1].strip().split(' ')[-1] + + +def key_from_volume_abspath(abspath): + """Return 'volset/volname' from this absolute path. + """ + + parts = abspath.split('/') + return '/'.join(parts[-2:]) + + +def key_from_log_path(log_path): + """Return 'volset/volname' from this log path. + """ + + parts = abspath.split('/') + volname = parts[-1].split('_re-validate_')[0] + + return parts[-2] + '/' + volname + + +def get_log_info(log_path): + """Return info from the log: + (start, elapsed, modtime, abspath, had_error, had_fatal). 
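+
+    Here start is the timestamp of the first log record, elapsed is the
+    "Elapsed time" string recorded at the end of the run (a missing elapsed
+    time is treated as a fatal run), modtime is the volume's last-modification
+    date, abspath is the volume's absolute path, and had_error/had_fatal
+    indicate whether any ERROR or FATAL records appear in the log.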
+ """ + + with open(log_path) as f: + recs = f.readlines() + + if not recs: + raise ValueError('Empty log file: ' + log_path) + + parts = recs[0].split('|') + if len(parts) < 2: + raise ValueError('Empty log file: ' + log_path) + + start_time = parts[0].rstrip() + if parts[1].strip() != LOGNAME: + raise ValueError('Not a re-validate log file') + + abspath = parts[-1].strip().split(' ')[-1] + + if len(recs) < 1: + raise ValueError('Not a re-validate log file') + + if 'Last modification' not in recs[1]: + raise ValueError('Missing modification time') + + modtime = recs[1].split('modification:')[-1].strip() + + error = False + fatal = False + elapsed = None + for rec in recs: + error |= ('| ERROR |' in rec) + fatal |= ('| FATAL |' in rec) + + k = rec.find('Elapsed time = ') + if k >= 0: + elapsed = rec[k + len('Elapsed time = '):].strip() + + if elapsed is None: + fatal = True + + return (start_time, elapsed, modtime, abspath, error, fatal) + + +def get_all_log_info(logroot): + """Return a list of info about the latest version of every log file, + skipping those that recorded a FATAL error. Each log file is described by + the tuple: + (start, elapsed, modtime, abspath, had_error, had_fatal). + Also return a dictionary that provides the complete list of existing log + files, in chronological order, keyed by volset/volname. + """ + + # Create a dictionary keyed by volset/volname that returns the chronological + # list of all associated log paths + logs_for_volset_volume = {} + for (root, dirs, files) in os.walk(logroot): + files = list(files) + files.sort() + for file in files: + if not file.endswith('.log'): + continue + parts = file.split('_re-validate_') + if len(parts) != 2: + continue + key = os.path.basename(root) + '/' + parts[0] + if key not in logs_for_volset_volume: + logs_for_volset_volume[key] = [] + logs_for_volset_volume[key].append(os.path.join(root, file)) + + # Create a list containing info about the last log path that did not + # produce a FATAL error. + info_list = [] + for key, log_paths in logs_for_volset_volume.items(): + for log_path in log_paths[::-1]: + try: + info = get_log_info(log_path) + except ValueError: + continue + + # On rare occasions when the holdings tree has been reorganized, the + # the log path and internal volume path can disagree. 
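+            # Such logs are skipped; if no remaining log matches, the volume
+            # is treated as never having been validated.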
+ test = key_from_volume_abspath(info[3]) # info[3] is the abspath + if test != key: + continue + + if not info[-1]: # info[-1] is had_fatal + info_list.append(info) + break + + return (info_list, logs_for_volset_volume) + + +def get_volume_info(holdings): + """Return a list of tuples (volume abspath, modtime) for every volume in + the given holdings directory.""" + + path = os.path.join(holdings, 'volumes/*_*/*_*') + abspaths = glob.glob(path) + + info_list = [] + for abspath in abspaths: + pdsdir = pdsfile.PdsFile.from_abspath(abspath) + info_list.append((abspath, pdsdir.date)) + + return info_list + + +def find_modified_volumes(holdings_info, log_info): + """Compare the information in the holdings info and log info; return a tuple + (modified_holdings, current_log_info, missing_keys).""" + + # Create a dictionary of log info organized by volset/volume + # Also create the set (modtime, volset/volume) for each log volume + log_dict = {} + log_modtimes = set() + for info in log_info: + (start, elapsed, modtime, abspath, had_error, had_fatal) = info + key = key_from_volume_abspath(abspath) + log_dict[key] = info + log_modtimes.add((modtime, key)) + + # Create a dictionary of holdings info organized by volset/volume + # Also create the set (modtime, volset/volname) for each holdings volume + holdings_dict = {} + holdings_modtimes = set() + for (abspath, modtime) in holdings_info: + parts = abspath.split('/') + key = parts[-2] + '/' + parts[-1] + holdings_dict[key] = (abspath, modtime) + holdings_modtimes.add((modtime, key)) + + # Determine the set of entries that have been modified since their last + # validation + modified_holdings = holdings_modtimes - log_modtimes + + # Update content to an ordered list of tuples (abspath, modtime) + modified_holdings = list(modified_holdings) + modified_holdings.sort() # from oldest to newest + modified_holdings = [holdings_dict[info[1]] for info in modified_holdings] + + # Delete these keys from the log info dictionary + for (_, key) in modified_holdings: + if key in log_dict: + del log_dict[key] + + # Identify previously logged volumes not found in holdings + # Delete these from the log dictionary + missing_keys = [key for key in log_dict if key not in holdings_dict] + for key in missing_keys: + del log_dict[key] + + # If a log file is from a holdings directory tree not currently being + # validated, redirect this validation to the correct directory tree. 
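+    # The stored abspath is rewritten in place so that the later validation
+    # pass operates on the copy of the volume inside one of the holdings
+    # trees given on the command line.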
+ for key, info in log_dict.items(): + old_path = info[3] + new_path = holdings_dict[key][0] + if new_path != old_path: + info = list(info) + info[3] = new_path + log_dict[key] = tuple(info) + + # Sort the remaining logged volumes from oldest to newest + current_log_info = list(log_dict.values()) + current_log_info.sort() + + return (modified_holdings, current_log_info, missing_keys) + + +def send_email(to_addr, subject, message): + smtp = SMTP() + smtp.connect(SERVER, 25) + date = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") + + if type(to_addr) == str: + to_addr = [to_addr] + + to_addr_in_msg = ','.join(to_addr) + + msg = ("From: %s\nTo: %s\nSubject: %s\nDate: %s\n\n%s" \ + % (FROM_ADDR, to_addr_in_msg, subject, date, message)) + + for addr in to_addr: + smtp.sendmail(FROM_ADDR, addr, msg) + + smtp.quit() + +################################################################################ +# Executable program +################################################################################ + +# Set up parser +parser = argparse.ArgumentParser( + description='re-validate: Perform various validation tasks on an online ' + + 'volume or volumes.') + +parser.add_argument('volume', nargs='*', type=str, + help='Paths to volumes or volume sets for validation. ' + + 'In batch mode, provide the path to the holdings ' + + 'directory.') + +parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the log ' + + 'files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, logs are written to the ' + + '"logs" directory parallel to "holdings". Logs are ' + + 'created inside the "re-validate" subdirectory of ' + + 'each log root directory.') + +parser.add_argument('--batch', '-b', action='store_true', + help='Operate in batch mode. In this mode, the program ' + + 'searches the existing logs and the given holdings ' + + 'directories and validates any new volumes found. ' + + 'Afterward, it validates volumes starting with the ' + + 'ones with the oldest logs. Use --minutes to limit ' + + 'the duration of the run.') + +parser.add_argument('--minutes', type=int, default=60, + help='In batch mode, this is the rough upper limit of ' + + 'the duration of the run. The program will iterate ' + + 'through available volumes but will not start a new ' + + 'one once the time limit in minutes has been reached.') + +parser.add_argument('--batch-status', action='store_true', + help='Prints a summary of what the program would do now ' + + 'if run in batch mode.') + +parser.add_argument('--email', type=str, action='append', default=[], + metavar='ADDR', + help='Email address to which to send a report when a ' + + 'batch job completes. Repeat for multiple recipients.') + +parser.add_argument('--error-email', type=str, action='append', default=[], + metavar='ADDR', + help='Email address to which to send an error report ' + + 'when a batch job completes. If no errors are ' + + 'found, no message is sent. 
Repeat for multiple ' + + 'recipients.') + +parser.add_argument('--quiet', '-q', action='store_true', + help='Do not log to the terminal.') + +parser.add_argument('--checksums', '-C', action='store_true', + help='Validate MD5 checksums.') + +parser.add_argument('--archives', '-A', action='store_true', + help='Validate archive files.') + +parser.add_argument('--info', '-I', action='store_true', + help='Validate infoshelves.') + +parser.add_argument('--links', '-L', action='store_true', + help='Validate linkshelves.') + +parser.add_argument('--dependencies', '-D', action='store_true', + help='Validate dependencies.') + +parser.add_argument('--full', '-F', action='store_true', + help='Perform the full set of validation tests ' + + '(checksums, archives, infoshelves, linkshelves, ' + + 'dependencies). This is the default.') + +parser.add_argument('--timeless', '-T', action='store_true', + help='Suppress "newer modification date" tests for ' + + 'dependencies. These tests are unnecessary during a ' + + 'full validation because the contents of archive, ' + + 'checksum and shelf files are also checked, so the ' + + 'dates on these files are immaterial.') + +parser.add_argument('--volumes', '-v', action='store_true', + help='Check volume directories.') + +parser.add_argument('--calibrated', '-c', action='store_true', + help='Check calibrated directories.') + +parser.add_argument('--diagrams', '-d', action='store_true', + help='Check diagram directories.') + +parser.add_argument('--metadata', '-m', action='store_true', + help='Check metadata directories.') + +parser.add_argument('--previews', '-p', action='store_true', + help='Check preview directories.') + +parser.add_argument('--all', '-a', action='store_true', + help='Check all directories and files related to the ' + + 'selected volume(s), i.e., those in volumes/, ' + + 'calibrated/, diagrams/, metadata/, and previews/, ' + + 'plus their checksums and archives. 
This is the ' + + 'default.') + +# Parse and validate the command line +args = parser.parse_args() + +# Interpret file types +voltypes = [] +if args.volumes: + voltypes += ['volumes'] +if args.calibrated: + voltypes += ['calibrated'] +if args.diagrams: + voltypes += ['diagrams'] +if args.metadata: + voltypes += ['metadata'] +if args.calibrated: + voltypes += ['previews'] + +if voltypes == [] or args.all: + voltypes = ['volumes', 'calibrated', 'diagrams', 'metadata', 'previews'] + +# Determine which tests to perform +checksums = args.checksums +archives = args.archives +infoshelves = args.info +linkshelves = args.links +dependencies = args.dependencies + +if args.full or not (checksums or archives or infoshelves or linkshelves or + dependencies): + checksums = True + archives = True + infoshelves = True + linkshelves = True + dependencies = True + +dependencies &= ('volumes' in voltypes) +linkshelves &= (('volumes' in voltypes or 'metadata' in voltypes or + 'calibrated' in voltypes)) + +args.checksums = checksums +args.archives = archives +args.infoshelves = infoshelves +args.linkshelves = linkshelves +args.dependencies = dependencies + +tests = [] +if checksums : tests.append('checksums') +if archives : tests.append('archives') +if infoshelves : tests.append('infoshelves') +if linkshelves : tests.append('linkshelves') +if dependencies: tests.append('dependencies') + +args.timeless = args.timeless and args.dependencies + +# Define the logging directory +if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + +# Initialize the logger +new_limits = {'info':10, 'normal':10, 'override':False} +logger = pdslogger.PdsLogger(LOGNAME, limits=new_limits) + +# Place to search for existing logs in batch mode +pdsfile.PdsFile.set_log_root(args.log) + +if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + +if args.log: + path = os.path.join(args.log, 're-validate') + logger.add_handler(pdslogger.warning_handler(path)) + logger.add_handler(pdslogger.error_handler(path)) + +######################################## +# Interactive mode +######################################## + +if not args.batch and not args.batch_status: + + # Stop if a volume or volume set doesn't exist + if not args.volume: + print('Missing volume path') + sys.exit(1) + + for volume in args.volume: + if not os.path.exists(volume): + print('Volume path not found: ' + volume) + sys.exit(1) + + # Convert to PdsFile objects; expand volume sets; collect holdings paths + pdsdirs = [] + roots = set() + for volume in args.volume: + abspath = os.path.abspath(volume) + pdsdir = pdsfile.PdsFile.from_abspath(abspath) + if pdsdir.category_ != 'volumes/' or pdsdir.interior: + print('Not a volume path: ', pdsdir.abspath) + sys.exit(1) + + logger.add_root(pdsdir.root_) + + if pdsdir.volname: + pdsdirs.append(pdsdir) + else: + for name in pdsdir.childnames: + pdsdirs.append(pdsdir.child(name)) + + # Main loop + logger.open(' '.join(sys.argv)) + try: + # For each volume... 
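+        # Each volume is validated with its own set of log files; the return
+        # value is ignored here because the overall exit status is derived
+        # from the shared logger's totals in the finally block below.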
+ for pdsdir in pdsdirs: + _ = validate_one_volume(pdsdir, voltypes, tests, args, logger) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + status = 1 if (fatal or errors) else 0 + + sys.exit(status) + +######################################## +# Batch mode +######################################## + +else: + + if not args.volume: + print('No holdings path identified') + sys.exit(1) + + holdings_abspaths = [] + for holdings in args.volume: + if not os.path.exists(holdings): + print('Holdings path not found: ' + holdings) + sys.exit(1) + + holdings = holdings.rstrip('/') + holdings = os.path.realpath(holdings) + holdings = os.path.abspath(holdings) + if not holdings.endswith('/holdings'): + print('Not a holdings directory: ' + holdings) + sys.exit(1) + + if holdings not in holdings_abspaths: + holdings_abspaths.append(holdings) + + logger.add_root(holdings_abspaths) + holdings_abspaths = set(holdings_abspaths) + + # Read the existing logs + (log_info, logs_for_volset_volname) = get_all_log_info(args.log) + + # Read the current holdings + holdings_info = [] + for holdings in args.volume: + holdings_info += get_volume_info(holdings) + + # Define an ordered list of tasks + (modified_holdings, + current_logs, + missing_keys) = find_modified_volumes(holdings_info, log_info) + + # Report missing volumes + for key in missing_keys: + # Determine if this volset/volname has ever appeared in any of the + # holdings directory trees + holdings_for_key = set() + for log_path in logs_for_volset_volname[key]: + volume_abspath = volume_abspath_from_log(log_path) + if volume_abspath == '': # if log file is empty + continue + + holdings_abspath = volume_abspath.split('/volumes')[0] + holdings_for_key.add(holdings_abspath) + + # If not, ignore + if not (holdings_abspaths & holdings_for_key): + continue + + # Report error + holdings_for_key = list(holdings_for_key) + holdings_for_key.sort() + for holdings_abspath in holdings_for_key: + logger.error('Missing volume', + os.path.join(holdings_abspath + '/volumes', key)) + + # Print info in trial run mode + if args.batch_status: + fmt = '%4d %20s%-11s modified %s, not previously validated' + line_number = 0 + for (abspath, date) in modified_holdings: + pdsdir = pdsfile.PdsFile.from_abspath(abspath) + line_number += 1 + print(fmt % (line_number, pdsdir.volset_, pdsdir.volname, + date[:10])) + + fmt ='%4d %20s%-11s modified %s, last validated %s, duration %s%s' + for info in current_logs: + (start, elapsed, date, abspath, had_error, had_fatal) = info + pdsdir = pdsfile.PdsFile.from_abspath(abspath) + error_text = ', error logged' if had_error else '' + line_number += 1 + print(fmt % (line_number, pdsdir.volset_, pdsdir.volname, + date[:10], start[:10], elapsed[:-7], error_text)) + + sys.exit() + + # Start batch processing + # info = (abspath, mod_date, prev_validation, had_errors) + info = [(p[0], p[1], None, False) for p in modified_holdings] + \ + [(p[3], p[2], p[0], p[4]) for p in current_logs] + start = datetime.datetime.now() + + batch_messages = [] + error_messages = [] + batch_prefix = ('Batch re-validate started at %s on %s\n' % + (start.strftime("%Y-%m-%d %H:%M:%S"), + ','.join(args.volume))) + print(batch_prefix) + + # Main loop + logger.open(' '.join(sys.argv)) + try: + + # For each volume... 
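+        # Volumes modified since their last validation are handled first
+        # (oldest modification first), followed by previously validated
+        # volumes from stalest to freshest log; the loop stops starting new
+        # volumes once the --minutes budget has elapsed.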
+ for (abspath, mod_date, prev_validation, had_errors) in info: + pdsdir = pdsfile.PdsFile.from_abspath(abspath) + if prev_validation is None: + ps = 'not previously validated' + else: + ps = 'last validated %s' % prev_validation[:10] + batch_message = '%20s%-11s modified %s, %s' % \ + (pdsdir.volset_, pdsdir.volname, mod_date[:10], ps) + print(batch_message) + + (log_path, + fatal, errors) = validate_one_volume(pdsdir, voltypes, tests, + args, logger) + error_message = '' + if fatal or errors: + stringlist = ['***** '] + if fatal: + stringlist += ['Fatal = ', str(fatal), '; '] + if errors: + stringlist += ['Errors = ', str(errors), '; '] + stringlist.append(log_path) + error_message = ''.join(stringlist) + + print(error_message) + + batch_messages.append(batch_message) + + if error_message: + batch_messages.append(error_message) + + error_messages.append(batch_message) + error_messages.append(error_message) + + now = datetime.datetime.now() + if (now - start).seconds > args.minutes*60: + break + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + status = 1 if (fatal or errors) else 0 + + now = datetime.datetime.now() + batch_suffix = ('\nTimeout at %s after %d minutes' % + (now.strftime("%Y-%m-%d %H:%M:%S"), + int((now - start).seconds/60. + 0.5))) + print(batch_suffix) + + if args.email: + if error_messages: + subj = REPORT_SUBJ_W_ERRORS + else: + subj = REPORT_SUBJ + + full_message = [batch_prefix] + batch_messages + [batch_suffix] + send_email(args.email, subj, '\n'.join(full_message)) + + if error_messages and args.error_email: + full_message = [batch_prefix] + error_messages + [batch_suffix] + send_email(args.error_email, ERROR_REPORT_SUBJ, + '\n'.join(full_message)) + +# sys.exit(status) + sys.exit(0) # In batch mode, don't cancel the launchdaemon. + # Does this help?? + diff --git a/validation/shelf-consistency-check.py b/validation/shelf-consistency-check.py new file mode 100755 index 0000000..fec81da --- /dev/null +++ b/validation/shelf-consistency-check.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +################################################################################ +# # shelf-consistency-check.py +# +# Syntax: +# shelf-consistency-check.py [--verbose] shelf_root [shelf_root ...] +# +# Confirm that every info shelf file has a corresponding directory in holdings/. +################################################################################ + +import os, sys + +paths = sys.argv[1:] + +# Look for --verbose option +if '--verbose' in paths: + paths.remove('--verbose') + verbose = True +else: + verbose = False + +# Traverse each directory tree... +errors = 0 +tests = 0 +for path in paths: + for root, dirs, files in os.walk(path): + + # Ignore anything not inside a shelves directory + if 'shelves' not in root: continue + if root.endswith('shelves'): continue + + # Confirm it is one of the expected subdirectories + tail = root.partition('shelves/')[-1] + tail = tail.partition('/')[0] + if tail not in ('info', 'links', 'index'): + print('*** Not a valid shelves directory: ' + root) + errors += 1 + tests += 1 + continue + + # Check each file... 
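+        # Each shelf file is mapped back to the holdings tree: an index shelf
+        # must correspond to a .lbl label file, while info and link shelves
+        # (after stripping the trailing underscore suffix) must correspond to
+        # an existing directory.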
+ for name in files: + shelf_path = os.path.join(root, name) + tests += 1 + + if name == '.DS_Store': + continue + + # Check the file extension + if not (name.endswith('.py') or name.endswith('.pickle')): + print('*** Extraneous file found: ' + shelf_path) + errors += 1 + continue + + # Convert to the associated holdings path + holdings_path = shelf_path.replace('shelves/' + tail, 'holdings') + holdings_path = holdings_path.rpartition('.')[0] + + # For index shelves, make sure the holdings label file exists + if tail == 'index': + if not os.path.exists(holdings_path + '.lbl'): + print('*** Extraneous shelf: ' + shelf_path) + error += 1 + continue + + if verbose: + print(holdings_path) + + # For info and link shelves, make sure the holdings directory exists + else: + holdings_path = holdings_path.rpartition('_')[0] + if not os.path.exists(holdings_path): + print('*** Extraneous shelf: ' + shelf_path) + errors += 1 + continue + + if verbose: + print(holdings_path) + +# Summarize +print('Tests performed: %d' % tests) +print('Errors found: %d' % errors) + +if errors: + sys.exit(1) + +################################################################################ From ab669f96b60cc18086d0401ebd9f918809754314 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Mon, 19 Aug 2024 12:48:51 -0700 Subject: [PATCH 02/21] Add command line tool to show opus products output with the given absolute path of a filespec. --- utility/show_opus_products.py | 83 +++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 utility/show_opus_products.py diff --git a/utility/show_opus_products.py b/utility/show_opus_products.py new file mode 100644 index 0000000..6dbb565 --- /dev/null +++ b/utility/show_opus_products.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +import argparse +from ast import literal_eval +from json import loads, dumps +from pdsfile import (Pds3File, + Pds4File) +from pdsfile.pds3file.tests.helper import PDS3_HOLDINGS_DIR +from pdsfile.pds4file.tests.helper import PDS4_HOLDINGS_DIR +import pprint +import sys + +# Set up parser +parser = argparse.ArgumentParser( + description="""show_opus_products: show the output of opus products for the given + absolute path of the file.""") + +parser.add_argument('--abspath', type=str, default='', required=True, + help='The absolute path of the file') + +parser.add_argument('--raw', '-r', action='store_true', + help='Show the raw output of opus products.') + +args = parser.parse_args() +abspath = args.abspath +display_raw = args.raw + +# print help screen if no abspath is given +if len(sys.argv) == 1 or not abspath or 'holdings' not in abspath: + parser.print_help() + parser.exit() + +is_pds3 = True +if '/holdings/' in abspath: + Pds3File.use_shelves_only(True) + Pds3File.preload(PDS3_HOLDINGS_DIR) +else: + Pds4File.use_shelves_only(False) + Pds4File.preload(PDS4_HOLDINGS_DIR) + is_pds3 = False + +if is_pds3: + pdsf_inst = Pds3File.from_abspath(abspath) + +else: + pdsf_inst = Pds4File.from_abspath(abspath) + +if not pdsf_inst.exists: + print(f"The istantiated pdsfile doesn't exist! 
Please double check the path.") + parser.exit() + +opus_prod = pdsf_inst.opus_products() +res = {} + +for prod_category, prod_list in opus_prod.items(): + pdsf_list = [] + for pdsf in prod_list: + pdsf_list.append(pdsf[0].logical_path) + + if not display_raw: + opus_type = prod_category[2] + res[opus_type] = pdsf_list + else: + res[prod_category] = pdsf_list + json_format_res = dumps({str(k): v for k, v in res.items()}, indent=2) + +print('======= OPUS PRODUCTS OUTPUT =======') +if not display_raw: + # pprint dictionary with opus type as the key and its corresponding product list as + # the value + pprint.pp(res, width=90) +else: + # print raw opus products ouput with cusomized format + print('{') + for prod_category, prod_list in res.items(): + print(f" ('{prod_category[0]}',") + print(' ',f'{prod_category[1]},') + print(' ',f"'{prod_category[2]}',") + print(' ',f"'{prod_category[3]}',") + print(' ',f'{prod_category[4]}): [') + for prod in prod_list: + print(' ',f"'{prod}',") + print(' ],') + print('}') From 4ffbc454471a2f64ba1caa3614b529bb04d5f5c2 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Wed, 18 Sep 2024 11:24:15 -0700 Subject: [PATCH 03/21] Create pds4 directory to store maintenance files for pds4. --- holdings_maintenance/pds3/pdsinfoshelf.py | 2 +- holdings_maintenance/pds4/pds4checksums.py | 859 ++++++++++++++++++++ holdings_maintenance/pds4/pds4infoshelf.py | 896 +++++++++++++++++++++ 3 files changed, 1756 insertions(+), 1 deletion(-) create mode 100755 holdings_maintenance/pds4/pds4checksums.py create mode 100755 holdings_maintenance/pds4/pds4infoshelf.py diff --git a/holdings_maintenance/pds3/pdsinfoshelf.py b/holdings_maintenance/pds3/pdsinfoshelf.py index 7396946..cbf4a12 100755 --- a/holdings_maintenance/pds3/pdsinfoshelf.py +++ b/holdings_maintenance/pds3/pdsinfoshelf.py @@ -662,7 +662,7 @@ def main(): 'than the shelf file, update the shelf file\'s ' + 'modification date.') - parser.add_argument('volume', nargs='+', type=str, + parser.add_argument('--volume', nargs='+', type=str, help='The path to the root of the volume or volume ' + 'set. For a volume set, all the volume ' + 'directories inside it are handled in sequence.') diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py new file mode 100755 index 0000000..cd99d31 --- /dev/null +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -0,0 +1,859 @@ +#!/usr/bin/env python3 +################################################################################ +# pdschecksums.py library and main program +# +# Syntax: +# pdschecksums.py --task path [path ...] +# +# Enter the --help option to see more information. 
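+#
+# Typical invocations (illustrative only; the holdings path is an assumption):
+#
+#   pds4checksums.py --initialize /path/to/holdings/volumes/VOLSET_xxxx/VOL_0001
+#   pds4checksums.py --validate --archives /path/to/holdings/volumes/VOLSET_xxxx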
+################################################################################ + +import argparse +import datetime +import glob +import hashlib +import os +import shutil +import sys + +import pdslogger +import pdsfile + +# Holds log file directories temporarily, used by move_old_checksums() +LOGDIRS = [] + +LOGNAME = 'pds.validation.checksums' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ + +# From http://stackoverflow.com/questions/3431825/- +# generating-an-md5-checksum-of-a-file + +def hashfile(fname, blocksize=65536): + f = open(fname, 'rb') + hasher = hashlib.md5() + buf = f.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = f.read(blocksize) + return hasher.hexdigest() + +################################################################################ + +def generate_checksums(pdsdir, selection=None, oldpairs=[], regardless=True, + limits={'normal':-1}, logger=None): + """Generate a list of tuples (abspath, checksum) recursively from the given + directory tree. + + If a selection is specified, it is interpreted as the basename of a file, + and only that file is processed. + + The optional oldpairs is a list of (abspath, checksum) pairs. For any file + that already has a checksum in the shortcut list, the checksum is copied + from this list rather than re-calculated. This list is merged with the + selection if a selection is identified. + + If regardless is True, then the checksum of a selection is calculated + regardless of whether it is already in abspairs. + + Also return the latest modification date among all the files checked. + """ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Generating MD5 checksums', dirpath, limits=limits) + + latest_mtime = 0. + try: + md5_dict = {} + for (abspath, hex) in oldpairs: + md5_dict[abspath] = hex + + newtuples = [] + for (path, dirs, files) in os.walk(dirpath): + for file in files: + abspath = os.path.join(path, file) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + + if selection and file != selection: + continue + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' 
in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + if regardless and selection: + md5 = hashfile(abspath) + newtuples.append((abspath, md5, file)) + logger.normal('Selected MD5=%s' % md5, abspath) + + elif abspath in md5_dict: + newtuples.append((abspath, md5_dict[abspath], file)) + logger.debug('MD5 copied', abspath) + + else: + md5 = hashfile(abspath) + newtuples.append((abspath, md5, file)) + logger.normal('MD5=%s' % md5, abspath) + + if selection: + if len(newtuples) == 0: + logger.error('File selection not found', selection) + return ({}, latest_mtime) + + if len(newtuples) > 1: + logger.error('Multiple copies of file selection found', + selection) + return ({}, latest_mtime) + + # Add new values to dictionary + for (abspath, md5, _) in newtuples: + md5_dict[abspath] = md5 + + # Restore original order, old keys then new + old_keys = [p[0] for p in oldpairs] + + newpairs = [] + for key in old_keys: + newpairs.append((key, md5_dict[key])) + del md5_dict[key] + + for (key, new_md5, new_file) in newtuples: + if key in md5_dict: # if not already copied to list of pairs + newpairs.append((key, md5_dict[key])) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return (newpairs, latest_mtime) + +################################################################################ + +def read_checksums(check_path, selection=None, limits={}, logger=None): + + """Return a list of tuples (abspath, checksum) from a checksum file. + + If a selection is specified, then only the checksum with this file name + is returned.""" + + check_path = os.path.abspath(check_path) + pdscheck = pdsfile.Pds3File.from_abspath(check_path) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdscheck.root_) + logger.open('Reading MD5 checksums', check_path, limits=limits) + + try: + logger.info('MD5 checksum file', check_path) + + if not os.path.exists(check_path): + logger.error('MD5 checksum file not found', check_path) + return [] + + prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1] + + # Read the pairs + abspairs = [] + with open(check_path, 'r') as f: + for rec in f: + hexval = rec[:32] + filepath = rec[34:].rstrip() + + if selection and os.path.basename(filepath) != selection: + continue + + basename = os.path.basename(filepath) + if basename == '.DS_Store': + logger.error('.DS_Store found in checksum file', filepath) + continue + + if basename.startswith('._'): + logger.error('._* file found in checksum file', filepath) + continue + + if basename[0] == '.': + logger.invisible('Checksum for invisible file', filepath) + + abspairs.append((prefix_ + filepath, hexval)) + logger.debug('Read', filepath) + + if selection and len(abspairs) == 0: + logger.error('File selection not found', selection) + return [] + + except Exception as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return abspairs + +################################################################################ + +def checksum_dict(dirpath, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds3File.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.info('Loading checksums for', dirpath, force=True) + + check_path = 
pdsdir.checksum_path_and_lskip()[0] + abspairs = read_checksums(check_path, logger=logger) + + pair_dict = {} + for (abspath, checksum) in abspairs: + pair_dict[abspath] = checksum + + logger.info('Checksum load completed', dirpath, force=True) + return pair_dict + +################################################################################ + +def write_checksums(check_path, abspairs, + limits={'dot_':-1, 'ds_store':-1, 'invisible':100}, + logger=None): + """Write a checksum table containing the given pairs (abspath, checksum).""" + + check_path = os.path.abspath(check_path) + pdscheck = pdsfile.Pds3File.from_abspath(check_path) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdscheck.root_) + logger.open('Writing MD5 checksums', check_path, limits=limits) + + try: + # Create parent directory if necessary + parent = os.path.split(check_path)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1] + lskip = len(prefix_) + + # Write file + f = open(check_path, 'w') + for pair in abspairs: + (abspath, hex) = pair + + if abspath.endswith('/.DS_Store'): # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if '/._' in abspath: # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + f.write('%s %s\n' % (hex, abspath[lskip:])) + logger.debug('Written', abspath) + + f.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_pairs(pairs1, pairs2, selection=None, limits={}, logger=None): + """Validate the first checksum list against the second. + + If a selection is specified, only a file with that basename is checked.""" + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.open('Validating checksums', limits=limits) + + success = True + try: + md5_dict = {} + for (abspath, hex) in pairs2: + md5_dict[abspath] = hex + + for (abspath, hex) in pairs1: + if selection and selection != os.path.basename(abspath): + continue + + if abspath not in md5_dict: + logger.error('Missing checksum', abspath) + success = False + + elif hex != md5_dict[abspath]: + del md5_dict[abspath] + logger.error('Checksum mismatch', abspath) + success = False + + else: + del md5_dict[abspath] + logger.normal('Validated', abspath) + + if not selection: + abspaths = list(md5_dict.keys()) + abspaths.sort() + for abspath in abspaths: + logger.error('Extra file', abspath) + success = False + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + logger.close() + return success + +################################################################################ + +def move_old_checksums(check_path, logger=None): + """Appends a version number to an existing checksum file and moves it to + the associated log directory.""" + + if not os.path.exists(check_path): return + + check_basename = os.path.basename(check_path) + (check_prefix, check_ext) = os.path.splitext(check_basename) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + from_logged = False + for log_dir in LOGDIRS: + dest_template = log_dir + '/' + check_prefix + '_v???' 
+ check_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(check_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(check_path, dest) + + if not from_logged: + logger.info('Checksum file moved from: ' + check_path) + from_logged = True + + logger.info('Checksum file moved to', dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file does not exist + if os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Checksum file already exists', check_path) + return False + + # Check selection + if selection: + raise ValueError('File selection is disallowed for task ' + + '"initialize": ' + selection) + + # Generate checksums + (pairs, _) = generate_checksums(pdsdir, logger=logger) + if not pairs: + return False + + # Write new checksum file + write_checksums(check_path, pairs, logger=logger) + return True + +def reinitialize(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Warn if checksum file does not exist + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Re-initialize just the selection; preserve others + if selection: + oldpairs = read_checksums(check_path, logger=logger) + if not oldpairs: + return False + else: + oldpairs = [] + + # Generate new checksums + (pairs, _) = generate_checksums(pdsdir, selection, oldpairs, + regardless=True, logger=logger) + if not pairs: + return False + + # Write new checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, pairs, logger=logger) + return True + +def validate(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Checksum file does not exist', check_path) + return False + + # Read checksum file + md5pairs = read_checksums(check_path, selection, logger=logger) + if not md5pairs: + return False + + # Generate checksums + (dirpairs, _) = generate_checksums(pdsdir, selection, logger=logger) + if not dirpairs: + return False + + # Validate + return validate_pairs(dirpairs, md5pairs, selection, logger=logger) + +def repair(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Read checksums file + md5pairs = read_checksums(check_path, logger=logger) 
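+    # If the checksum file could not be read or contains no entries, there is
+    # nothing reliable to repair against, so give up here.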
+ if not md5pairs: + return False + + # Generate new checksums + if selection: + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, selection, md5pairs, + regardless=True, logger=logger) + else: + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, logger=logger) + + if not dirpairs: + return False + + # Compare checksums + md5pairs.sort() + dirpairs.sort() + canceled = (dirpairs == md5pairs) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + check_mtime = os.path.getmtime(check_path) + if latest_mtime > check_mtime: + logger.info('!!! Checksum file content is up to date', + check_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + check_mtime = os.path.getmtime(check_path) + dt = datetime.datetime.fromtimestamp(check_mtime) + logger.info('!!! Checksum file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - check_mtime + if delta >= 86400/10: + logger.info('!!! Checksum file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Checksum file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(check_path) + logger.info('!!! Time tag on checksum file set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + else: + logger.info('!!! Checksum file is up to date; repair canceled', + check_path, force=True) + return True + + # Write checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, dirpairs, logger=logger) + return True + +def update(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Read checksums file + md5pairs = read_checksums(check_path, logger=logger) + if not md5pairs: + return False + + # Generate new checksums if necessary + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, selection, md5pairs, + regardless=False, logger=logger) + if not dirpairs: + return False + + # Compare checksums + md5pairs.sort() + dirpairs.sort() + canceled = (dirpairs == md5pairs) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Checksum file content is complete; update canceled', + check_path) + return True + + # Write checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, dirpairs, logger=logger) + return True + +################################################################################ +# Executable program +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdschecksums: Create, maintain and validate MD5 ' + + 'checksum files for PDS volumes and volume sets.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an MD5 checksum file for a volume or ' + + 'volume set. 
Abort if the checksum file ' + + 'already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an MD5 checksum file for a volume or ' + + 'volume set. Replace the checksum file if it ' + + 'already exists. If a single file is specified, ' + + 'such as one archive file in a volume set, only ' + + 'single checksum is re-initialized.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every file in a volume directory tree ' + + 'against its MD5 checksum. If a single file ' + + 'is specified, such as one archive file in a ' + + 'volume set, only that single checksum is ' + + 'validated.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every file in a volume directory tree ' + + 'against its MD5 checksum. If any disagreement ' + + 'is found, the checksum file is replaced; ' + + 'otherwise it is unchanged. If a single file is ' + + 'specified, such as one archive file of a ' + + 'volume set, then only that single checksum is ' + + 'repaired. If any of the files checked are newer' + + 'than the checksum file, update shelf file\'s ' + + 'modification date') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their MD5 checksums to the checksum file. ' + + 'Checksums of pre-existing files are not checked.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root directory of a volume or ' + + 'volume set. For a volume set, all the volume ' + + 'directories inside it are handled in sequence. ' + + 'Note that, for archive directories, checksums ' + + 'are grouped into one file for the entire ' + + 'volume set.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". 
Logs are created inside the ' + + '"pdschecksums" subdirectory of each log root ' + + 'directory.') + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + parser.add_argument('--archives', '-a', default=False, action='store_true', + help='Instead of referring to a volume, refer to the ' + + 'the archive file for that volume.') + + parser.add_argument('--infoshelf', '-i', dest='infoshelf', + default=False, action='store_true', + help='After a successful run, also execute the ' + + 'equivalent pdsinfoshelf command.') + + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdschecksums error: Missing task') + sys.exit(1) + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds3File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdschecksums') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Prepare the list of paths + abspaths = [] + for path in args.volume: + + # Make sure path makes sense + path = os.path.abspath(path) + parts = path.partition('/holdings/') + if not parts[1]: + print('Not a holdings subdirectory: ' + path) + sys.exit(1) + + if parts[2].startswith('checksums-'): + print('No checksums for checksum files: ' + path) + sys.exit(1) + + # Convert to an archives path if necessary + if args.archives and not parts[2].startswith('archives-'): + path = parts[0] + '/holdings/archives-' + parts[2] + + # Convert to a list of absolute paths that exist (volsets or volumes) + try: + pdsf = pdsfile.Pds3File.from_abspath(path, must_exist=True) + abspaths.append(pdsf.abspath) + + except (ValueError, IOError): + # Allow a volume name to stand in for a .tar.gz archive + (dir, basename) = os.path.split(path) + pdsdir = pdsfile.Pds3File.from_abspath(dir) + if pdsdir.archives_ and '.' not in basename: + if pdsdir.voltype_ == 'volumes/': + basename += '.tar.gz' + else: + basename += '_%s.tar.gz' % pdsdir.voltype_[:-1] + + newpaths = glob.glob(os.path.join(dir, basename)) + if len(newpaths) == 0: + raise + + abspaths += newpaths + continue + else: + raise + + # Generate a list of tuples (pdsfile, selection) + info = [] + for path in abspaths: + pdsf = pdsfile.Pds3File.from_abspath(path) + + if pdsf.is_volset_dir: + # Archive directories are checksumed by volset + if pdsf.archives_: + info.append((pdsf, None)) + + # Others are checksumed by volume + else: + children = [pdsf.child(c) for c in pdsf.childnames] + info += [(c, None) for c in children if c.isdir] + # "if c.isdir" is False for volset level readme files + + elif pdsf.is_volume_dir: + # Checksum one volume + info.append((pdsf, None)) + + elif pdsf.isdir: + print('Invalid directory for checksumming: ' + pdsf.logical_path) + sys.exit(1) + + else: + pdsdir = pdsf.parent() + if pdsf.is_volume_file: + # Checksum one archive file + info.append((pdsdir, pdsf.basename)) + elif pdsdir.is_volume_dir: + # Checksum one top-level file in volume + info.append((pdsdir, pdsf.basename)) + else: + print('Invalid file for checksumming: ' + pdsf.logical_path) + sys.exit(1) + + # Begin logging and loop through tuples... 
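+    # Each tuple pairs a volume-level directory with an optional single-file
+    # selection; the requested task runs on each in turn, with dedicated log
+    # handlers opened and closed around it.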
+ logger.open(' '.join(sys.argv)) + try: + for (pdsdir, selection) in info: + path = pdsdir.abspath + + if selection: + pdsf = pdsdir.child(os.path.basename(selection)) + else: + pdsf = pdsdir + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Save logs in up to two places + if pdsf.volname: + logfiles = set([pdsf.log_path_for_volume('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_volume('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) + else: + logfiles = set([pdsf.log_path_for_volset('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_volset('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_checksums() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(info) > 1: + logger.blankline() + + if selection: + logger.open('Task "' + args.task + '" for selection ' + + selection, path, handler=local_handlers) + else: + logger.open('Task "' + args.task + '" for', path, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + proceed = initialize(pdsdir, selection) + + elif args.task == 'reinitialize': + if selection: # don't erase everything else! + proceed = update(pdsdir, selection) + else: + proceed = reinitialize(pdsdir, selection) + + elif args.task == 'validate': + proceed = validate(pdsdir, selection) + + elif args.task == 'repair': + proceed = repair(pdsdir, selection) + + else: # update + proceed = update(pdsdir, selection) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + proceed = False + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + proceed = False + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + proceed = False + + # If everything went well, execute pdsinfoshelf too + if proceed and args.infoshelf: + new_list = [a.replace('pdschecksums', 'pdsinfoshelf') for a in sys.argv] + new_list = [a for a in new_list if a not in ('--infoshelf', '-i')] + status = os.system(' '.join(new_list)) + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py new file mode 100755 index 0000000..904f43e --- /dev/null +++ b/holdings_maintenance/pds4/pds4infoshelf.py @@ -0,0 +1,896 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsinfoshelf.py library and main program +# +# Syntax: +# pdsinfoshelf.py --task path [path ...] +# +# Enter the --help option to see more information. 
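+#
+# A typical invocation might look like the following (illustrative only; the
+# task flag is assumed to mirror pds4checksums.py, and the path is a
+# placeholder):
+#
+#   pds4infoshelf.py --validate /path/to/holdings/volumes/VOLSET_xxxx/VOL_0001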
+################################################################################ + +import argparse +import datetime +import glob +import os +from pathlib import Path +import pickle +import shutil +import sys +from PIL import Image + +import pdslogger +import pdsfile + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(REPO_ROOT)) + +from holdings_maintenance.pds3 import pdschecksums + +# Holds log file directories temporarily, used by move_old_info() +LOGDIRS = [] + +LOGNAME = 'pds.validation.fileinfo' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +PREVIEW_EXTS = set(['.jpg', '.png', '.gif', '.tif', '.tiff', + '.jpeg', '.jpeg_small']) + + +################################################################################ + +def generate_infodict(pdsdir, selection, old_infodict={}, + limits={'normal':-1}, logger=None): + """Generate a dictionary keyed by absolute file path for each file in the + directory tree. Value returned is a tuple (nbytes, child_count, modtime, + checksum, preview size). + + If a selection is specified, it is interpreted as the basename of a file, + and only that file is processed. + + The optional old_infodict overrides information found in the directory. + This dictionary is merged with the new information assembled. However, if + a selection is specified, information about the selection is always updated. + + Also return the latest modification date among all the files checked. + """ + + ### Internal function + + def get_info_for_file(abspath): + + nbytes = os.path.getsize(abspath) + children = 0 + mtime = os.path.getmtime(abspath) + dt = datetime.datetime.fromtimestamp(mtime) + modtime = dt.strftime('%Y-%m-%d %H:%M:%S.%f') + try: + checksum = checkdict[abspath] + except KeyError: + logger.error('Missing entry in checksum file', abspath) + checksum = '' + + size = (0,0) + ext = os.path.splitext(abspath)[1] + if ext.lower() in PREVIEW_EXTS: + try: + im = Image.open(abspath) + size = im.size + im.close() + except Exception: + logger.error('Preview size not found', abspath) + + return (nbytes, children, modtime, checksum, size) + + def get_info(abspath, infodict, old_infodict, checkdict): + """Info about the given abspath.""" + + if os.path.isdir(abspath): + nbytes = 0 + children = 0 + modtime = '' + + files = os.listdir(abspath) + for file in files: + absfile = os.path.join(abspath, file) + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', absfile) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', absfile) + continue + + if '/.' 
in abspath: # flag invisible files + logger.invisible('Invisible file', absfile) + + info = get_info(absfile, infodict, old_infodict, checkdict) + nbytes += info[0] + children += 1 + modtime = max(modtime, info[2]) + + info = (nbytes, children, modtime, '', (0,0)) + + elif abspath in old_infodict: + info = old_infodict[abspath] + + else: + info = get_info_for_file(abspath) + logger.normal('File info generated', abspath) + + infodict[abspath] = info + return info + + ################################ + # Begin executable code + ################################ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + + if selection: + logger.open('Generating file info for selection "%s"' % selection, + dirpath, limits) + else: + logger.open('Generating file info', dirpath, limits) + + try: + # Load checksum dictionary + checkdict = pdschecksums.checksum_dict(dirpath, logger=logger) +# Removed... because we can't ignore empty directories +# if not checkdict: +# return ({}, 0.) + + # Generate info recursively + infodict = {} + if selection: + root = os.path.join(dirpath, selection) + else: + root = pdsdir.abspath + + info = get_info(root, infodict, old_infodict, checkdict) + latest_modtime = info[2] + + # Merge dictionaries + merged = old_infodict.copy() + + if selection: + merged[root] = infodict[root] + + else: + for (key, value) in infodict.items(): + if key not in merged: + info = infodict[key] + merged[key] = info + latest_modtime = max(latest_modtime, info[2]) + + if not merged: + logger.info('No files found') + latest_modtime = '' + else: + logger.info('Latest holdings file modification date = ' + + latest_modtime[:19], force=True) + + # We also have to check the modtime of the checksum file! + check_path = pdsdir.checksum_path_and_lskip()[0] + timestamp = os.path.getmtime(check_path) + check_datetime = datetime.datetime.fromtimestamp(timestamp) + check_modtime = check_datetime.strftime('%Y-%m-%d %H:%M:%S.%f') + logger.info('Checksum file modification date = ' + check_modtime[:19], + check_path, force=True) + if check_modtime > latest_modtime: + latest_modtime = check_modtime + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return (merged, latest_modtime) + +################################################################################ + +def load_infodict(pdsdir, logger=None): + + dirpath = pdsdir.abspath + dirpath_ = dirpath.rstrip('/') + '/' + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Reading info shelf file for', dirpath_[:-1]) + + try: + (info_path, lskip) = pdsdir.shelf_path_and_lskip('info') + logger.info('Info shelf file', info_path) + + if not os.path.exists(info_path): + logger.error('Info shelf file not found', info_path) + return {} + + # Read the shelf file and convert to a dictionary + with open(info_path, 'rb') as f: + shelf = pickle.load(f) + + infodict = {} + for (key,info) in shelf.items(): + # Remove a 'null' checksum indicated by a string of dashes + # (Directories do not have checksums.) 
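+            # A placeholder checksum (e.g. a run of "-" characters written for
+            # a directory entry) is replaced with an empty string so that
+            # downstream comparisons treat the entry as having no checksum.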
+ if info[3] and info[3][0] == '-': + info = info[:3] + ('',) + info[4:] + + if key == '': + infodict[dirpath_[:-1]] = info + else: + infodict[dirpath_[:lskip] + key] = info + + return infodict + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def write_infodict(pdsdir, infodict, limits={}, logger=None): + """Write a new info shelf file for a directory tree.""" + + # Initialize + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing info file info for', dirpath, limits=limits) + + try: + (info_path, lskip) = pdsdir.shelf_path_and_lskip('info') + logger.info('Info shelf file', info_path) + + # Create parent directory if necessary + parent = os.path.split(info_path)[0] + if not os.path.exists(parent): + logger.info('Creating parent directory', parent) + os.makedirs(parent) + + # Write the pickle file + pickle_dict = {} + for (key, values) in infodict.items(): + short_key = key[lskip:] + pickle_dict[short_key] = values + + with open(info_path, 'wb') as f: + pickle.dump(pickle_dict, f) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + logger.open('Writing Python dictionary', dirpath, limits=limits) + try: + # Determine the maximum length of the file path + len_path = 0 + for (abspath, values) in infodict.items(): + len_path = max(len_path, len(abspath)) + + len_path -= lskip + + # Write the python dictionary version + python_path = info_path.rpartition('.')[0] + '.py' + name = os.path.basename(python_path) + parts = name.split('_') + name = '_'.join(parts[:2]) + '_info' + abspaths = list(infodict.keys()) + abspaths.sort() + + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for abspath in abspaths: + path = abspath[lskip:] + (nbytes, children, modtime, checksum, size) = infodict[abspath] + f.write(' "%s: ' % (path + '"' + (len_path-len(path)) * ' ')) + f.write('(%11d, %3d, ' % (nbytes, children)) + f.write('"%s", ' % modtime) + f.write('"%-33s, ' % (checksum + '"')) + f.write('(%4d,%4d)),\n' % size) + + f.write('}\n\n') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_infodict(pdsdir, dirdict, shelfdict, selection, + limits={'normal': 0}, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + + if selection: + logger.open('Validating file info for selection %s' % selection, + pdsdir.abspath, limits=limits) + else: + logger.open('Validating file info for', pdsdir.abspath, limits=limits) + + # Prune the shelf dictionary if necessary + if selection: + keys = list(shelfdict.keys()) + full_path = os.path.join(pdsdir.abspath, selection) + for key in keys: + if key != full_path: + del shelfdict[key] + + try: + keys = list(dirdict.keys()) + for key in keys: + if key in shelfdict: + dirinfo = dirdict[key] + shelfinfo = shelfdict[key] + + (bytes1, count1, modtime1, checksum1, size1) = dirinfo + (bytes2, count2, modtime2, checksum2, size2) = shelfinfo + + # Truncate modtimes to seconds + modtime1 = modtime1.rpartition('.')[0] + modtime2 = modtime2.rpartition('.')[0] + + agreement = True + if bytes1 != bytes2: + logger.error('File size mismatch %d %d' 
%
+                                 (bytes1, bytes2), key)
+                    agreement = False
+
+                if count1 != count2:
+                    logger.error('Child count mismatch %d %d' %
+                                 (count1, count2), key)
+                    agreement = False
+
+                if modtime1 != modtime2:
+                    logger.error('Modification time mismatch "%s" "%s"' %
+                                 (modtime1, modtime2), key)
+                    agreement = False
+
+                if checksum1 != checksum2:
+                    logger.error('Checksum mismatch', key)
+                    agreement = False
+
+                if size1 != size2:
+                    logger.error('Display size mismatch', key)
+                    agreement = False
+
+                if agreement:
+                    logger.normal('File info matches', key)
+
+                del shelfdict[key]
+                del dirdict[key]
+
+        keys = list(dirdict.keys())
+        keys.sort()
+        for key in keys:
+            logger.error('Missing shelf info for', key)
+
+        keys = list(shelfdict.keys())
+        keys.sort()
+        for key in keys:
+            logger.error('Shelf info for missing file', key)
+
+    except (Exception, KeyboardInterrupt) as e:
+        logger.exception(e)
+        raise
+
+    finally:
+        return logger.close()
+
+################################################################################
+
+def move_old_info(shelf_file, logger=None):
+    """Move a file to the /logs/ directory tree and append a time tag."""
+
+    if not os.path.exists(shelf_file): return
+
+    shelf_basename = os.path.basename(shelf_file)
+    (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename)
+
+    logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+    from_logged = False
+    for log_dir in LOGDIRS:
+        dest_template = log_dir + '/' + shelf_prefix + '_v???' + shelf_ext
+        version_paths = glob.glob(dest_template)
+
+        max_version = 0
+        lskip = len(shelf_ext)
+        for version_path in version_paths:
+            version = int(version_path[-lskip-3:-lskip])
+            max_version = max(max_version, version)
+
+        new_version = max_version + 1
+        dest = dest_template.replace('???', '%03d' % new_version)
+        shutil.copy(shelf_file, dest)
+
+        if not from_logged:
+            logger.info('Info shelf file moved from: ' + shelf_file)
+            from_logged = True
+
+        logger.info('Info shelf file moved to', dest)
+
+        python_file = shelf_file.rpartition('.')[0] + '.py'
+        dest = dest.rpartition('.')[0] + '.py'
+        shutil.copy(python_file, dest)
+
+################################################################################
+# Simplified functions to perform tasks
+################################################################################
+
+def initialize(pdsdir, selection=None, logger=None):
+
+    info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+    # Make sure file does not exist
+    if os.path.exists(info_path):
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        logger.error('Info shelf file already exists', info_path)
+        return
+
+    # Check selection
+    if selection:
+        logger.error('File selection is disallowed for task "initialize"',
+                     selection)
+        return
+
+    # Generate info
+    (infodict, _) = generate_infodict(pdsdir, selection, logger=logger)
+
+    # Save info file
+    write_infodict(pdsdir, infodict, logger=logger)
+
+def reinitialize(pdsdir, selection=None, logger=None):
+
+    info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+    # Warn if shelf file does not exist
+    if not os.path.exists(info_path):
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        if selection:
+            logger.error('Info shelf file does not exist', info_path)
+        else:
+            logger.warn('Info shelf file does not exist; initializing',
+                        info_path)
+            initialize(pdsdir, selection=selection, logger=logger)
+        return
+
+    # Generate info
+    (infodict, _) = generate_infodict(pdsdir, selection, logger=logger)
+    if not infodict:
+        return
+
+    # Move old file if necessary
+    if 
os.path.exists(info_path): + move_old_info(info_path, logger=logger) + + # Save info file + write_infodict(pdsdir, infodict, logger=logger) + +def validate(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Info shelf file does not exist', info_path) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + + # Validate + validate_infodict(pdsdir, dir_infodict, shelf_infodict, selection=selection, + logger=logger) + +def repair(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, latest_modtime) = generate_infodict(pdsdir, selection, + logger=logger) + latest_iso = latest_modtime.replace(' ', 'T') + latest_datetime = datetime.datetime.fromisoformat(latest_iso) + + # For a single selection, use the old information + if selection: + key = list(dir_infodict.keys())[0] + value = dir_infodict[key] + dir_infodict = shelf_infodict.copy() + dir_infodict[key] = value + + # Compare + canceled = (dir_infodict == shelf_infodict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + info_pypath = info_path.replace('.pickle', '.py') + timestamp = min(os.path.getmtime(info_path), + os.path.getmtime(info_pypath)) + info_datetime = datetime.datetime.fromtimestamp(timestamp) + info_iso = info_datetime.isoformat(timespec='microseconds') + + if latest_iso > info_iso: + logger.info('!!! Info shelf file content is up to date', + info_path, force=True) + logger.info('!!! Latest holdings file modification date', + latest_iso, force=True) + logger.info('!!! Info shelf file modification date', + info_iso, force=True) + + delta = (latest_datetime - info_datetime).total_seconds() + if delta >= 86400/10: + logger.info('!!! Info shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Info shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(info_path) + os.utime(info_pypath) + logger.info('!!! Time tag on info shelf files set to', + dt.strftime('%Y-%m-%dT%H:%M:%S'), force=True) + else: + logger.info('!!! 
Info shelf file is up to date; repair canceled', + info_path, force=True) + return + + # Move files and write new info + move_old_info(info_path, logger=logger) + write_infodict(pdsdir, dir_infodict, logger=logger) + +def update(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure info shelf file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, _) = generate_infodict(pdsdir, selection, shelf_infodict, + logger=logger) + + # Compare + canceled = (dir_infodict == shelf_infodict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Info shelf file content is complete; update canceled', + info_path, force=True) + return + + # Write checksum file + move_old_info(info_path, logger=logger) + write_infodict(pdsdir, dir_infodict, logger=logger) + +################################################################################ +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsinfoshelf: Create, maintain and validate shelf files ' + + 'containing basic information about each file.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an infoshelf file for a bundle. Abort ' + + 'if the file already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an infoshelf file for a bundle. Replace ' + + 'the file if it already exists. If a single ' + + 'file is specified, such as one archive file in ' + + 'a bundle set, then only information about that ' + + 'file is re-initialized.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every file in a bundle against the ' + + 'contents of its infoshelf file. If a single ' + + 'file is specified, such as an archive file in ' + + 'a bundle set, then only information about that ' + + 'file is validated') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every file in a bundle against the ' + + 'contents of its infoshelf file. If any file ' + + 'has changed, the infoshelf file is replaced. ' + + 'If a single file is specified, such as an ' + + 'archive file in a bundle set, then only ' + + 'information about that file is repaired. If any '+ + 'of the files checked are newer than the shelf ' + + 'file, update the shelf file\'s modification ' + + 'date.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their information to the infoshelf file. ' + + 'Information about pre-existing files is not ' + + 'updated. 
If any of the files checked are newer ' + + 'than the shelf file, update the shelf file\'s ' + + 'modification date.') + + parser.add_argument('--bundle', nargs='+', type=str, + help='The path to the root of the bundle or bundle ' + + 'set. For a bundle set, all the bundle ' + + 'directories inside it are handled in sequence.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdsinfoshelf" subdirectory of each log root ' + + 'directory.' + ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + parser.add_argument('--archives', '-a', default=False, action='store_true', + help='Instead of referring to a bundle, refer to the ' + + 'the archive file for that bundle.') + + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsinfoshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsinfoshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Prepare the list of paths + abspaths = [] + for path in args.bundle: + + # Make sure path makes sense + path = os.path.abspath(path) + parts = path.partition('/pds4-holdings/') + if not parts[1]: + print('Not a holdings subdirectory: ' + path) + sys.exit(1) + + if parts[2].startswith('checksums-'): + print('No infoshelves for checksum files: ' + path) + sys.exit(1) + + # Convert to an archives path if necessary + if args.archives and not parts[2].startswith('archives-'): + path = parts[0] + '/holdings/archives-' + parts[2] + + # Convert to a list of absolute paths that exist (bundlsets or bundles) + try: + pdsf = pdsfile.Pds4File.from_abspath(path, must_exist=True) + abspaths.append(pdsf.abspath) + + except (ValueError, IOError): + # Allow a bundle name to stand in for a .tar.gz archive + (dir, basename) = os.path.split(path) + pdsdir = pdsfile.Pds4File.from_abspath(dir) + if pdsdir.archives_ and '.' 
not in basename: + if pdsdir.voltype_ == 'bundles/': + basename += '.tar.gz' + else: + basename += '_%s.tar.gz' % pdsdir.voltype_[:-1] + + newpaths = glob.glob(os.path.join(dir, basename)) + if len(newpaths) == 0: + raise + + abspaths += newpaths + continue + else: + raise + + # Generate a list of tuples (pdsfile, selection) + info = [] + for path in abspaths: + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.is_bundleset_dir: + # Info about archive directories is stored by bundleset + if pdsf.archives_: + info.append((pdsf, None)) + + # Others are checksumed by bundle + else: + children = [pdsf.child(c) for c in pdsf.childnames] + info += [(c, None) for c in children if c.isdir] + # "if c.isdir" is False for bundleset level readme files + + elif pdsf.is_bundle_dir: + # Shelve one bundle + info.append((pdsf, None)) + + elif pdsf.isdir: + print('Invalid directory for an infoshelf: ' + pdsf.logical_path) + sys.exit(1) + + else: + pdsdir = pdsf.parent() + if pdsf.is_bundle_file: + # Shelve one archive file + info.append((pdsdir, pdsf.basename)) + elif pdsdir.is_bundle_dir: + # Shelve one top-level file in bundle + info.append((pdsdir, pdsf.basename)) + else: + print('Invalid file for an infoshelf: ' + pdsf.logical_path) + sys.exit(1) + + # Open logger and loop through tuples... + logger.open(' '.join(sys.argv)) + try: + for (pdsdir, selection) in info: + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + if selection: + pdsf = pdsdir.child(os.path.basename(selection)) + else: + pdsf = pdsdir + + # Save logs in up to two places + if pdsf.bundlename: + logfiles = set([pdsf.log_path_for_bundle('_info', + task=args.task, + dir='pdsinfoshelf'), + pdsf.log_path_for_bundle('_info', + task=args.task, + dir='pdsinfoshelf', + place='parallel')]) + else: + logfiles = set([pdsf.log_path_for_bundleset('_info', + task=args.task, + dir='pdsinfoshelf'), + pdsf.log_path_for_bundleset('_info', + task=args.task, + dir='pdsinfoshelf', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_info() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(info) > 1: + logger.blankline() + + if selection: + logger.open('Task "' + args.task + '" for selection ' + + selection, pdsdir.abspath, handler=local_handlers) + else: + logger.open('Task "' + args.task + '" for', pdsdir.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsdir, selection) + + elif args.task == 'reinitialize': + if selection: # don't erase everything else! 
+ update(pdsdir, selection) + else: + reinitialize(pdsdir, selection) + + elif args.task == 'validate': + validate(pdsdir, selection) + + elif args.task == 'repair': + repair(pdsdir, selection) + + else: # update + update(pdsdir, selection) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + print(sys.exc_info()[2]) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + status = 1 + + sys.exit(status) + +if __name__ == '__main__': + main() From cd66002283a893ff65ec33f8bcdfd2c4a8aed130 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Wed, 18 Sep 2024 16:59:18 -0700 Subject: [PATCH 04/21] Update pds4/pds4checksums.py so that checksum files can be created under checksums-bundles directory for pds4 --- holdings_maintenance/pds3/pdschecksums.py | 20 +++++----- holdings_maintenance/pds4/pds4checksums.py | 46 ++++++++++++---------- pdsfile/pdsfile.py | 10 ++++- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/holdings_maintenance/pds3/pdschecksums.py b/holdings_maintenance/pds3/pdschecksums.py index cd99d31..2a16513 100755 --- a/holdings_maintenance/pds3/pdschecksums.py +++ b/holdings_maintenance/pds3/pdschecksums.py @@ -614,7 +614,7 @@ def main(): 'their MD5 checksums to the checksum file. ' + 'Checksums of pre-existing files are not checked.') - parser.add_argument('volume', nargs='+', type=str, + parser.add_argument('--volume', nargs='+', type=str, help='The path to the root directory of a volume or ' + 'volume set. For a volume set, all the volume ' + 'directories inside it are handled in sequence. ' + @@ -768,21 +768,21 @@ def main(): # Save logs in up to two places if pdsf.volname: - logfiles = set([pdsf.log_path_for_volume('_md5', + logfiles = set([pdsf.log_path_for_bundle('_md5', task=args.task, dir='pdschecksums'), - pdsf.log_path_for_volume('_md5', + pdsf.log_path_for_bundle('_md5', task=args.task, dir='pdschecksums', place='parallel')]) else: - logfiles = set([pdsf.log_path_for_volset('_md5', - task=args.task, - dir='pdschecksums'), - pdsf.log_path_for_volset('_md5', - task=args.task, - dir='pdschecksums', - place='parallel')]) + logfiles = set([pdsf.log_path_for_bundleset('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_bundleset('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) # Create all the handlers for this level in the logger local_handlers = [] diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py index cd99d31..38d08be 100755 --- a/holdings_maintenance/pds4/pds4checksums.py +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -155,7 +155,7 @@ def read_checksums(check_path, selection=None, limits={}, logger=None): is returned.""" check_path = os.path.abspath(check_path) - pdscheck = pdsfile.Pds3File.from_abspath(check_path) + pdscheck = pdsfile.Pds4File.from_abspath(check_path) logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) logger.replace_root(pdscheck.root_) @@ -213,7 +213,7 @@ def read_checksums(check_path, selection=None, limits={}, logger=None): def checksum_dict(dirpath, logger=None): dirpath = os.path.abspath(dirpath) - pdsdir = pdsfile.Pds3File.from_abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) logger.replace_root(pdsdir.root_) @@ -237,7 +237,7 @@ def write_checksums(check_path, abspairs, """Write 
a checksum table containing the given pairs (abspath, checksum).""" check_path = os.path.abspath(check_path) - pdscheck = pdsfile.Pds3File.from_abspath(check_path) + pdscheck = pdsfile.Pds4File.from_abspath(check_path) logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) logger.replace_root(pdscheck.root_) @@ -614,7 +614,7 @@ def main(): 'their MD5 checksums to the checksum file. ' + 'Checksums of pre-existing files are not checked.') - parser.add_argument('volume', nargs='+', type=str, + parser.add_argument('--bundle', nargs='+', type=str, help='The path to the root directory of a volume or ' + 'volume set. For a volume set, all the volume ' + 'directories inside it are handled in sequence. ' + @@ -661,7 +661,7 @@ def main(): # Initialize the logger logger = pdslogger.PdsLogger(LOGNAME) - pdsfile.Pds3File.set_log_root(args.log) + pdsfile.Pds4File.set_log_root(args.log) if not args.quiet: logger.add_handler(pdslogger.stdout_handler) @@ -676,11 +676,11 @@ def main(): # Prepare the list of paths abspaths = [] - for path in args.volume: + for path in args.bundle: # Make sure path makes sense path = os.path.abspath(path) - parts = path.partition('/holdings/') + parts = path.partition('/pds4-holdings/') if not parts[1]: print('Not a holdings subdirectory: ' + path) sys.exit(1) @@ -695,13 +695,13 @@ def main(): # Convert to a list of absolute paths that exist (volsets or volumes) try: - pdsf = pdsfile.Pds3File.from_abspath(path, must_exist=True) + pdsf = pdsfile.Pds4File.from_abspath(path, must_exist=True) abspaths.append(pdsf.abspath) except (ValueError, IOError): # Allow a volume name to stand in for a .tar.gz archive (dir, basename) = os.path.split(path) - pdsdir = pdsfile.Pds3File.from_abspath(dir) + pdsdir = pdsfile.Pds4File.from_abspath(dir) if pdsdir.archives_ and '.' 
not in basename: if pdsdir.voltype_ == 'volumes/': basename += '.tar.gz' @@ -720,9 +720,9 @@ def main(): # Generate a list of tuples (pdsfile, selection) info = [] for path in abspaths: - pdsf = pdsfile.Pds3File.from_abspath(path) + pdsf = pdsfile.Pds4File.from_abspath(path) - if pdsf.is_volset_dir: + if pdsf.is_bundleset_dir: # Archive directories are checksumed by volset if pdsf.archives_: info.append((pdsf, None)) @@ -758,6 +758,10 @@ def main(): try: for (pdsdir, selection) in info: path = pdsdir.abspath + print('xxxxxxxxxxxx') + print(path) + if '_support' in path: + continue if selection: pdsf = pdsdir.child(os.path.basename(selection)) @@ -767,22 +771,22 @@ def main(): check_path = pdsdir.checksum_path_and_lskip()[0] # Save logs in up to two places - if pdsf.volname: - logfiles = set([pdsf.log_path_for_volume('_md5', + if pdsf.bundlename: + logfiles = set([pdsf.log_path_for_bundle('_md5', task=args.task, dir='pdschecksums'), - pdsf.log_path_for_volume('_md5', + pdsf.log_path_for_bundle('_md5', task=args.task, dir='pdschecksums', place='parallel')]) else: - logfiles = set([pdsf.log_path_for_volset('_md5', - task=args.task, - dir='pdschecksums'), - pdsf.log_path_for_volset('_md5', - task=args.task, - dir='pdschecksums', - place='parallel')]) + logfiles = set([pdsf.log_path_for_bundleset('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_bundleset('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) # Create all the handlers for this level in the logger local_handlers = [] diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py index 4b93144..268780c 100644 --- a/pdsfile/pdsfile.py +++ b/pdsfile/pdsfile.py @@ -271,7 +271,9 @@ class PdsFile(object): r'(|_md5\.txt|\.tar\.gz))$') BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') + BUNDLENAME_REGEX = re.compile(r'^(([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))|([a-zA-z\_].+))$') + # BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') + BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') @@ -4832,7 +4834,11 @@ def checksum_path_and_lskip(self): raise ValueError('No checksums of checksum files: ' + self.logical_path) - if self.voltype_ == 'volumes/': + print('=================') + print(self.voltype_) + print(self.archives_) + print(self.bundlename) + if self.voltype_ == 'volumes/' or self.voltype_ == 'bundles/': suffix = '' else: suffix = '_' + self.voltype_[:-1] From ddbbb2a0e1d19746ce0105533f9e412418022e81 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Thu, 19 Sep 2024 16:11:17 -0700 Subject: [PATCH 05/21] Update pds4/pds4infoshelf.py to create info sheleves files under _infoshelf-bundles --- holdings_maintenance/pds4/pds4checksums.py | 4 ++-- holdings_maintenance/pds4/pds4infoshelf.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py index 38d08be..4661d16 100755 --- a/holdings_maintenance/pds4/pds4checksums.py +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -758,8 +758,8 @@ def main(): try: for (pdsdir, selection) in info: path = pdsdir.abspath - print('xxxxxxxxxxxx') - print(path) + + # skip _support dirctory if '_support' in path: continue diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py index 904f43e..84ea2ec 100755 --- 
a/holdings_maintenance/pds4/pds4infoshelf.py +++ b/holdings_maintenance/pds4/pds4infoshelf.py @@ -24,7 +24,7 @@ REPO_ROOT = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(REPO_ROOT)) -from holdings_maintenance.pds3 import pdschecksums +from holdings_maintenance.pds4 import pds4checksums # Holds log file directories temporarily, used by move_old_info() LOGDIRS = [] @@ -138,7 +138,7 @@ def get_info(abspath, infodict, old_infodict, checkdict): try: # Load checksum dictionary - checkdict = pdschecksums.checksum_dict(dirpath, logger=logger) + checkdict = pds4checksums.checksum_dict(dirpath, logger=logger) # Removed... because we can't ignore empty directories # if not checkdict: # return ({}, 0.) @@ -801,6 +801,10 @@ def main(): try: for (pdsdir, selection) in info: + # skip _support dirctory + if '_support' in pdsdir.abspath: + continue + info_path = pdsdir.shelf_path_and_lskip('info')[0] if selection: From 69bfd7a3bb65b1f6bb40f4157db58032522f5123 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Thu, 19 Sep 2024 16:25:13 -0700 Subject: [PATCH 06/21] Removed the debug print statement in the pdsfile.py --- holdings_maintenance/pds4/pds4checksums.py | 2 +- pdsfile/pdsfile.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py index 4661d16..b78e333 100755 --- a/holdings_maintenance/pds4/pds4checksums.py +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -691,7 +691,7 @@ def main(): # Convert to an archives path if necessary if args.archives and not parts[2].startswith('archives-'): - path = parts[0] + '/holdings/archives-' + parts[2] + path = parts[0] + '/pds4-holdings/archives-' + parts[2] # Convert to a list of absolute paths that exist (volsets or volumes) try: diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py index f5487eb..02cb117 100644 --- a/pdsfile/pdsfile.py +++ b/pdsfile/pdsfile.py @@ -4834,10 +4834,6 @@ def checksum_path_and_lskip(self): raise ValueError('No checksums of checksum files: ' + self.logical_path) - print('=================') - print(self.voltype_) - print(self.archives_) - print(self.bundlename) if self.voltype_ == 'volumes/' or self.voltype_ == 'bundles/': suffix = '' else: From bc740478f10b3f749098239f2c9a614128af2071 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Sat, 21 Sep 2024 03:30:44 -0700 Subject: [PATCH 07/21] Create pds4/pds4indexshelf.py & pds4/pds4linkshelf.py --- holdings_maintenance/pds4/pds4indexshelf.py | 499 ++++++ holdings_maintenance/pds4/pds4infoshelf.py | 2 +- holdings_maintenance/pds4/pds4linkshelf.py | 1721 +++++++++++++++++++ pdsfile/pds3file/__init__.py | 3 + pdsfile/pds4file/__init__.py | 3 + 5 files changed, 2227 insertions(+), 1 deletion(-) create mode 100755 holdings_maintenance/pds4/pds4indexshelf.py create mode 100755 holdings_maintenance/pds4/pds4linkshelf.py diff --git a/holdings_maintenance/pds4/pds4indexshelf.py b/holdings_maintenance/pds4/pds4indexshelf.py new file mode 100755 index 0000000..177e419 --- /dev/null +++ b/holdings_maintenance/pds4/pds4indexshelf.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsindexshelf.py library and main program +# +# Syntax: +# pdsindexshelf.py --task index_path.csv [index_path.csv ...] +# +# Enter the --help option to see more information. 
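The shelf files this script maintains map each filename key in an index table to the list of row numbers where that key appears (see generate_indexdict() below). A minimal sketch of reading one back; the shelf path and the filename key are hypothetical, and a real path comes from pdsf.indexshelf_abspath:

    import pickle

    # Hypothetical path; an actual path is given by pdsf.indexshelf_abspath
    SHELF = '/pds4-holdings/_indexshelf-metadata/example_bundle/example_index.pickle'

    with open(SHELF, 'rb') as f:
        index_dict = pickle.load(f)

    # 'u0_example_01' is a hypothetical filename key from the index table;
    # the value is the list of row indices in the .csv where it appears.
    rows = index_dict.get('u0_example_01', [])
    print(rows)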
+################################################################################ + +import argparse +import datetime +import glob +import os +import pickle +import sys + +import pdslogger +import pdsfile +import pdstable + +LOGNAME = 'pds.validation.indexshelf' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ + +def generate_indexdict(pdsf, logger=None): + """Generate a dictionary keyed by row key for each row in the given table. + The value returned is a list containing all the associated row indices. + """ + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Tabulating index rows for', pdsf.abspath) + + try: + table = pdstable.PdsTable(pdsf.label_abspath, + filename_keylen=pdsf.filename_keylen) + + table.index_rows_by_filename_key() # fills in table.filename_keys + childnames = table.filename_keys + index_dict = {c:table.row_indices_by_filename_key(c) + for c in childnames} + + logger.info('Rows tabulated', str(len(index_dict)), force=True) + + latest_mtime = max(os.path.getmtime(pdsf.abspath), + os.path.getmtime(pdsf.label_abspath)) + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Latest index file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (OSError, ValueError) as e: + logger.error(str(e)) + raise e + + finally: + _ = logger.close() + + return (index_dict, latest_mtime) + +################################################################################ + +def write_indexdict(pdsf, index_dict, logger=None): + """Write a new shelf file for the rows of this index.""" + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Writing index shelf file info for', pdsf.abspath) + + try: + pdsfile.Pds4File.close_all_shelves() # prevents using a cached shelf file + + shelf_path = pdsf.indexshelf_abspath + logger.info('Index shelf file', shelf_path) + + # Create parent directory if necessary + parent = os.path.split(shelf_path)[0] + if not os.path.exists(parent): + logger.info('Creating parent directory', parent) + os.makedirs(parent) + + # Write the pickle file + with open(shelf_path, 'wb') as f: + pickle.dump(index_dict, f) + + # Write the Python file + python_path = shelf_path.rpartition('.')[0] + '.py' + logger.info('Writing Python file', python_path) + + # Determine the maximum length of the keys + len_path = 0 + for key in index_dict: + len_path = max(len_path, len(key)) + + name = os.path.basename(shelf_path).rpartition('.')[0] + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for key in index_dict: + f.write(' "%s: ' % (key + '"' + (len_path-len(key)) * ' ')) + + rows = index_dict[key] + if len(rows) == 1: + f.write('%d,\n' % rows[0]) + else: + f.write('(') + for row in rows[:-1]: + f.write('%d, ' % row) + f.write('%d),\n' % rows[-1]) + + f.write('}\n\n') + + logger.info('Two files written') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def load_indexdict(pdsf, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Reading index shelf file for', pdsf.abspath) + + try: + shelf_path = pdsf.indexshelf_abspath + logger.info('Index shelf file', shelf_path) + + if not os.path.exists(shelf_path): + 
logger.error('Index shelf file not found', shelf_path) + return {} + + with open(shelf_path, 'rb') as f: + index_dict = pickle.load(f) + + logger.info('Shelf records loaded', str(len(index_dict))) + + except pickle.PickleError as e: + logger.exception(e) + raise + + finally: + logger.close() + + return index_dict + +################################################################################ + +def validate_infodict(pdsf, tabdict, shelfdict, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.info('Validating index file for', pdsf.abspath) + + if tabdict == shelfdict: + logger.info('Validation complete') + else: + logger.error('Validation failed for', pdsf.abspath) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsf, logger=None): + + shelf_path = pdsf.indexshelf_abspath + + # Make sure file does not exist + if os.path.exists(pdsf.indexshelf_abspath): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Index shelf file already exists', shelf_path) + return + + # Generate info + (index_dict, _) = generate_indexdict(pdsf, logger=logger) + if index_dict is None: + return + + # Save info file + write_indexdict(pdsf, index_dict, logger=logger) + +def reinitialize(pdsf, logger=None): + + shelf_path = pdsf.indexshelf_abspath + + # Warn if shelf file does not exist + if not os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Index shelf file does not exist; initializing', shelf_path) + initialize(pdsf, logger=logger) + return + + # Generate info + (index_dict, _) = generate_indexdict(pdsf, logger=logger) + if not index_dict: + return + + # Save info file + write_indexdict(pdsf, index_dict, logger=logger) + +def validate(pdsf, logger=None): + + shelf_path = pdsf.indexshelf_abspath + + # Make sure file exists + if not os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Index shelf file does not exist', shelf_path) + return + + (table_indexdict, _) = generate_indexdict(pdsf, logger=logger) + if table_indexdict is None: + return + + shelf_indexdict = load_indexdict(pdsf, logger=logger) + if not shelf_indexdict: + return + + # Validate + validate_infodict(pdsf, table_indexdict, shelf_indexdict, + logger=logger) + +def repair(pdsf, logger=None, op='repair'): + + shelf_path = pdsf.indexshelf_abspath + + # Make sure file exists + if not os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Index shelf file does not exist; initializing', + shelf_path) + initialize(pdsf, logger=logger) + return + + (table_indexdict, latest_mtime) = generate_indexdict(pdsf, logger=logger) + if not table_indexdict: + return + + shelf_indexdict = load_indexdict(pdsf, logger=logger) + if not shelf_indexdict: + return + + # Compare + canceled = (table_indexdict == shelf_indexdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + shelf_pypath = shelf_path.replace('.pickle', '.py') + shelf_mtime = min(os.path.getmtime(shelf_path), + os.path.getmtime(shelf_pypath)) + if latest_mtime > shelf_mtime: + logger.info('!!! Index shelf file content is up to date', + shelf_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! 
Index file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + dt = datetime.datetime.fromtimestamp(shelf_mtime) + logger.info('!!! Index shelf file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - shelf_mtime + if delta >= 86400/10: + logger.info('!!! Index shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Index shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(shelf_path) + os.utime(shelf_pypath) + logger.info('!!! Time tag on index shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + else: + logger.info('!!! Index shelf file is up to date; repair canceled', + shelf_path, force=True) + + return + + # Write new info + write_indexdict(pdsf, table_indexdict, logger=logger) + +def update(pdsf, selection=None, logger=None): + + shelf_path = pdsf.indexshelf_abspath + if os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Index shelf file exists; not updated', pdsf.abspath) + + else: + initialize(pdsf, logger) + +################################################################################ +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsindexshelf: Create, maintain and validate shelf files ' + + 'containing row lookup information for index files.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an indexshelf file for an index or for ' + + 'an entire metadata directory. Abort if the file '+ + 'already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an indexshelf file for an index or for ' + + 'an entire metadata directory. Replace any files '+ + 'that already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate an indexshelf file or metadata ' + + 'directory.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate an index shelf file; replace only if ' + + 'necessary. If the shelf file content is correct '+ + 'but it is older than either the file or the ' + + 'label, update the shelf file\'s modification ' + + 'date.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a metadata directory for any new index ' + + 'files and add create an index shelf file for ' + + 'each one. Existing index shelf files are not ' + + 'checked.') + + parser.add_argument('--table', nargs='+', type=str, + help='Path to an index file or metadata directory.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". 
Logs are created inside the "index" '+ + 'subdirectory of each log root directory.') + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsindexshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsindexshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of Pds4File objects before logging + pdsfiles = [] + + for path in args.table: + + if not os.path.exists(path): + + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.isdir: + if not '/metadata/' in path: + print('Not a metadata directory: ' + path) + sys.exit(1) + + tables = glob.glob(os.path.join(path, '*.csv')) + if not tables: + tables = glob.glob(os.path.join(path, '*/*.csv')) + + if not tables: + print('No .csv files in directory: ' + path) + sys.exit(1) + + pdsfiles += pdsfile.Pds4File.pdsfiles_for_abspaths(tables) + + else: + if not '/metadata/' in path: + print('Not a metadata file: ' + path) + sys.exit(1) + if not path.endswith('.csv'): + print('Not a table file: ' + path) + sys.exit(1) + + pdsfiles.append(pdsf) + + # Open logger and loop through tables... 
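Before the task loop below, the --table arguments have been expanded into one Pds4File object per .csv table. A minimal sketch of the same expansion for a single hypothetical metadata directory, using only calls that appear above:

    import glob
    import os
    import pdsfile

    # Hypothetical metadata directory; each *.csv inside it becomes one table
    METADATA_DIR = '/pds4-holdings/metadata/example_bundleset/example_bundle'

    tables = glob.glob(os.path.join(METADATA_DIR, '*.csv'))
    pdsfiles = pdsfile.Pds4File.pdsfiles_for_abspaths(tables)
    for pdsf in pdsfiles:
        print(pdsf.abspath)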
+ logger.open(' '.join(sys.argv)) + try: + for pdsf in pdsfiles: + + # Save logs in up to two places + logfiles = [pdsf.log_path_for_index(task=args.task, + dir='pdsindexshelf'), + pdsf.log_path_for_index(task=args.task, + dir='pdsindexshelf', + place='parallel')] + if logfiles[0] == logfiles[1]: + logfiles = logfiles[:-1] + + # Create all the handlers for this level in the logger + local_handlers = [] + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = (logfile.rpartition('/pdsindexshelf/')[0] + + '/pdsindexshelf') + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(pdsfiles) > 1: + logger.blankline() + + logger.open('Task "' + args.task + '" for', pdsf.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsf) + + elif args.task == 'reinitialize': + reinitialize(pdsf) + + elif args.task == 'validate': + validate(pdsf) + + elif args.task == 'repair': + repair(pdsf) + + else: # update + update(pdsf) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + print(sys.exc_info()[2]) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py index 84ea2ec..4814356 100755 --- a/holdings_maintenance/pds4/pds4infoshelf.py +++ b/holdings_maintenance/pds4/pds4infoshelf.py @@ -734,7 +734,7 @@ def main(): # Convert to an archives path if necessary if args.archives and not parts[2].startswith('archives-'): - path = parts[0] + '/holdings/archives-' + parts[2] + path = parts[0] + '/pds4-holdings/archives-' + parts[2] # Convert to a list of absolute paths that exist (bundlsets or bundles) try: diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py new file mode 100755 index 0000000..1b9b093 --- /dev/null +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -0,0 +1,1721 @@ +#!/usr/bin/env python3 +################################################################################ +# # pdslinkshelf.py library and main program +# +# Syntax: +# pdslinkshelf.py --task path [path ...] +# +# Enter the --help option to see more information. 
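The REPAIRS table defined below is a translator.TranslatorByRegex in which each entry is a (label-path regex, re flags, translator) tuple; the inner translator maps a broken file reference found in a matching label to the relative path that should be linked instead. A minimal sketch of one such entry, using hypothetical volume and file names:

    import translator

    EXAMPLE_REPAIR = translator.TranslatorByRegex([
        # For any AAREADME.TXT under a hypothetical EXAMPLE_0xxx volume set,
        # redirect the stale reference 'OLDNAME.CAT' to 'CATALOG/NEWNAME.CAT'
        (r'.*/EXAMPLE_0xxx/.*/AAREADME\.TXT', 0,
         translator.TranslatorByDict(
            {'OLDNAME.CAT': 'CATALOG/NEWNAME.CAT'})),
    ])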
+################################################################################ + +import argparse +import datetime +import glob +import os +import pickle +import re +import shutil +import sys + +import pdslogger +import pdsfile +import translator + +LOGNAME = 'pds.validation.links' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +# Holds log file directories temporarily, used by move_old_links() +LOGDIRS = [] + +REPAIRS = translator.TranslatorByRegex([ + + # COCIRS + (r'.*/COCIRS_[01].*/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'DIAG.FMT' : 'UNCALIBR/DIAG.FMT', + 'FRV.FMT' : 'UNCALIBR/FRV.FMT', + 'GEO.FMT' : 'NAV_DATA/GEO.FMT', + 'HSK.FMT' : 'HSK_DATA/HSK.FMT', + 'IFGM.FMT' : 'UNCALIBR/IFGM.FMT', + 'IHSK.FMT' : 'UNCALIBR/IHSK.FMT', + 'ISPM.FMT' : 'APODSPEC/ISPM.FMT', + 'OBS.FMT' : 'UNCALIBR/OBS.FMT', + 'POI.FMT' : 'NAV_DATA/POI.FMT', + 'RIN.FMT' : 'NAV_DATA/RIN.FMT', + 'TAR.FMT' : 'NAV_DATA/TAR.FMT'})), + (r'.*/COCIRS_[01].*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'DATASIS.TXT' : 'DOCUMENT/DATASIS.PDF', + 'VOLSYS.TXT' : 'DOCUMENT/VOLSYS.PDF'})), + (r'.*/COCIRS_[01].*/DATASET\.CAT', 0, + translator.TranslatorByDict( + {'DATASIS.TXT' : 'DATASIS.PDF'})), + (r'.*/COCIRS_[01].*/SOFTWARE/DOC/SDOCINFO\.TXT', 0, + translator.TranslatorByDict( + {'vanilla_guide.htm' : 'vanilla-guide.html', + 'vanilla_guide.pdf' : 'vanilla-guide.pdf'})), + (r'.*/COCIRS_[01].*/DOCUMENT/DOCINFO\.TXT', 0, + translator.TranslatorByDict( + {'cirs_fov_overview.fig1.tiff' : 'cirs_fov_overview_fig1.tiff', + 'cirs_fov_overview.fig2.tiff' : 'cirs_fov_overview_fig2.tiff', + 'cirs_fov_overview.fig3.tiff' : 'cirs_fov_overview_fig3.tiff'})), + (r'.*/COCIRS_[01].*/CUBE/.*\.(LBL|lbl)', 0, + translator.TranslatorByRegex([ + (r'([0-9A-Z_]+)\.DAT', 0, r'\1.tar.gz')])), + (r'.*/COCIRS_[56].*/TUTORIAL\.TXT', 0, + translator.TranslatorByDict( + {'GEODATA.FMT' : '../DATA/GEODATA/GEODATA.FMT', + 'ISPMDATA.FMT' : '../DATA/ISPMDATA/ISPMDATA.FMT', + 'POIDATA.FMT' : '../DATA/POIDATA/POIDATA.FMT', + 'RINDATA.FMT' : '../DATA/RINDATA/RINDATA.FMT', + 'TARDATA.FMT' : '../DATA/TARDATA/TARDATA.FMT', + 'filename.FMT' : ''})), + (r'.*/COCIRS_[56].*/BROWSE/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../../DATA/APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/TARDATA/\1'), + (r'(GEO[0-9]{10}_[0-9]{3}\.TAB)', 0, r'../../DATA/GEODATA/\1')])), + (r'.*/COCIRS_[56].*/DATA/APODSPEC/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + (r'.*/COCIRS_[56].*/DATA/ISPMDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + (r'.*/COCIRS_[56].*/DATA/RINDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + 
(r'.*/COCIRS_[56].*/DATA/POIDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), + (r'.*/COCIRS_[56].*/DATA/TARDATA/.*\.LBL', 0, + translator.TranslatorByRegex([ + (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), + (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), + (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), + (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1')])), + (r'.*/COCIRS_[56].*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'REF.CAT' : 'CATALOG/CIRSREF.CAT'})), + + # COISS + (r'.*/COISS_0.*\.lbl', 0, + translator.TranslatorByDict( + {'PREFIX8.FMT' : 'prefix.fmt'})), + (r'.*/COISS_00.*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'calinfo.txt' : '../COISS_0011/calib/calinfo.txt', + 'extrinfo.txt' : '../COISS_0011/extras/extrinfo.txt'})), + (r'.*/COISS_0.*/index\.lbl', 0, + translator.TranslatorByDict( + {'CUMINDEX.TAB' : 'index.tab'})), + (r'.*/COISS_0011/calib/darkcurrent/wac_\w+_dark_parameters04222\.lbl', 0, + translator.TranslatorByRegex([ + (r'wac_(\w+)_dark_parameters04228\.xdr', 0, r'wac_\1_dark_parameters04222.xdr')])), + (r'.*/COISS_[012].*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'Calds.CAT' : '../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'calds.cat' : '../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'Jupiterds.CAT' : '../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'jupiterds.cat' : '../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'Saturnds.CAT' : '../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', + 'saturnds.cat' : '../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', + 'calinfo.txt' : '../../COISS_0xxx/COISS_0011/calib/calinfo.txt', + 'calib.tar.gz' : '../../COISS_0xxx/COISS_0011/calib/calib.tar.gz', + 'in_flight_cal.tex' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.tex', + 'in_flight_cal.pdf' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.pdf', + 'in_flight_cal.lbl' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.lbl', + 'theoretical_basis.tex': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.tex', + 'theoretical_basis.pdf': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.pdf', + 'theoretical_basis.lbl': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.lbl', + 'theoretical_basis.ps' : '../../COISS_0xxx/COISS_0011/document/theoretical_basis.pdf', + 'cisscal.tar.gz' : '../../COISS_0xxx/COISS_0011/extras/cisscal.tar.gz'})), + (r'.*/COISS_[012].*/archsis\.txt', 0, + translator.TranslatorByDict( + {'Calds.CAT' : '../../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'calds.cat' : '../../../COISS_0xxx/COISS_0001/catalog/calds.cat', + 'Jupiterds.CAT' : '../../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'jupiterds.cat' : '../../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', + 'Saturnds.CAT' : '../../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', + 'saturnds.cat' : '../../../COISS_2xxx/COISS_2001/catalog/saturnds.cat'})), + + # COUVIS + (r'.*/COUVIS_0.*/INDEX\.LBL', 0, + translator.TranslatorByDict( + {'CUBEDS.CAT' : '../CATALOG/SCUBEDS.CAT'})), + (r'.*/COUVIS_0.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'INST.CAT' : 'CATALOG/UVISINST.CAT', + 'XCALDS.CAT' : 'CATALOG/SCALDS.CAT', + 'XCUBEDS.CAT' : 'CATALOG/SCUBEDS.CAT', + 'XSPECDS.CAT' : 'CATALOG/SSPECDS.CAT', + 'XSSBDS.CAT' : 'CATALOG/SSSBDS.CAT', + 'XWAVDS.CAT' 
: 'CATALOG/SWAVDS.CAT'})), + (r'.*/COUVIS_0.*/CATALOG/.*\.CAT', 0, + translator.TranslatorByDict( + {'SPECDS.CAT' : 'SSPECDS.CAT', + 'CUBEDS.CAT' : 'SCUBEDS.CAT'})), + (r'.*/COUVIS_0.*/SOFTWARE/READERS/READERS_README.TXT', 0, + translator.TranslatorByDict( + {'CATALOG/CUBEDS.CAT' : '../../CATALOG/SCUBEDS.CAT'})), + (r'.*/COUVIS_0.*/SOFTWARE/READERS/OLD.*/READERS_README.TXT', 0, + translator.TranslatorByDict( + {'CATALOG/CUBEDS.CAT' : '../../../CATALOG/SCUBEDS.CAT'})), + (r'.*/COUVIS_8xxx/.*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'inst.cat' : 'catalog/uvisinst.cat'})), + (r'.*/COUVIS_8xxx_v1.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'INST.CAT' : 'CATALOG/UVISINST.CAT'})), + (r'.*/COUVIS_8xxx_v2.*/voldesc\.cat', 0, + translator.TranslatorByDict( + {'UVISINST.CAT' : 'catalog/inst.cat', + 'PROJREF.CAT' : ''})), + (r'.*/COUVIS_8xxx_v1/.*/CATINFO\.TXT', re.I, + translator.TranslatorByDict( + {'INST.CAT' : 'UVISINST.CAT'})), + (r'.*/COUVIS_8xxx(|_v2\.0)/.*/voldesc\.cat', re.I, + translator.TranslatorByDict( + {'PROJREF.CAT' : ''})), + (r'.*/metadata/.*/COUVIS_0.*_index\.lbl', 0, + translator.TranslatorByDict( + {'CUBEDS.CAT' : ''})), + + # COVIMS + (r'.*/COVIMS_0001/aareadme\.txt', 0, + translator.TranslatorByDict( + {'band_bin_center.fmt' : '../COVIMS_0002/label/band_bin_center.fmt', + 'core_description.fmt' : '../COVIMS_0002/label/core_description.fmt', + 'suffix_description.fmt': '../COVIMS_0002/label/suffix_description.fmt', + 'labinfo.txt' : '../COVIMS_0002/label/labinfo.txt'})), + (r'.*/COVIMS_0.../aareadme\.txt', 0, + translator.TranslatorByDict( + {'caldoc.txt' : 'software/doc/caldoc.txt', + 'make_dark.sav' : 'software/bin/make_dark.sav', + 'ppvl_10_1.zip' : 'software/lib/ppvl_1_10.zip', + 'ppvl_1_10.zip' : 'software/lib/ppvl_1_10.zip', + 'libPPVL.a' : 'software/lib/ppvl_1_10/libPPVL.a', + 'Makefile' : 'software/lib/ppvl_1_10/Makefile', + 'Makefile.sun' : 'software/lib/ppvl_1_10/Makefile.sun', + 'PIRL_strings.c' : 'software/lib/ppvl_1_10/PIRL_strings.c', + 'PIRL_strings.h' : 'software/lib/ppvl_1_10/PIRL_strings.h', + 'PPVL.c' : 'software/lib/ppvl_1_10/PPVL.c', + 'PPVL.h' : 'software/lib/ppvl_1_10/PPVL.h', + 'PPVL-README' : 'software/lib/ppvl_1_10/PPVL-README', + 'PPVL_report.c' : 'software/lib/ppvl_1_10/PPVL_report.c', + 'PPVL_selections.c' : 'software/lib/ppvl_1_10/PPVL_selections.c', + 'PPVL_selections.h' : 'software/lib/ppvl_1_10/PPVL_selections.h', + 'RANLIB.csh' : 'software/lib/ppvl_1_10/RANLIB.csh', + 'README' : 'software/lib/ppvl_1_10/README', + 'PPVL.3' : 'software/lib/ppvl_1_10/doc/PPVL.3', + 'PPVL_selections.3' : 'software/lib/ppvl_1_10/doc/PPVL_selections.3', + 'PPVL_report.1' : 'software/lib/ppvl_1_10/doc/PPVL_report.1', + 'PPVL_get_PDS_EOL.3' : 'software/lib/ppvl_1_10/doc/PPVL_get_PDS_EOL.3', + 'bp_trans.c' : 'software/src/c/cube_prep/bp_trans.c', + 'cube_prep.c' : 'software/src/c/cube_prep/cube_prep.c', + 'error.h' : 'software/src/c/ir_bg/error.h', + 'fit.c' : 'software/src/c/ir_bg/fit.c', + 'ir_bg.c' : 'software/src/c/ir_bg/ir_bg.c', + 'ir_bg_sub.c' : 'software/src/c/ir_bg_sub/ir_bg_sub.c', + 'mark_saturated.c' : 'software/src/c/mark_saturated/mark_saturated.c', + 'make_dark.pro' : 'software/src/idl/make_dark.pro', + 'vims_cal_pipe.pl' : 'software/src/perl/vims_cal_pipe.pl', + 'cal_pipe2.pm' : 'software/src/perl/cal_pipe2/cal_pipe2.pm', + 'cal_occultation.pm' : 'software/src/perl/cal_pipe2/cal_occultation.pm', + 'cal_point.pm' : 'software/src/perl/cal_pipe2/cal_point.pm', + 'dark_vis.pm' : 'software/src/perl/cal_pipe2/dark_vis.pm', + 'flat_ir2.pm' 
: 'software/src/perl/cal_pipe2/flat_ir2.pm', + 'flat_vis2.pm' : 'software/src/perl/cal_pipe2/flat_vis2.pm', + 'isis_geo.pm' : 'software/src/perl/cal_pipe2/isis_geo.pm', + 'solar_remove.pm' : 'software/src/perl/cal_pipe2/solar_remove.pm', + 'specific_energy.pm' : 'software/src/perl/cal_pipe2/specific_energy.pm'})), + (r'.*/COVIMS_0001/data/.*\.lbl', 0, + translator.TranslatorByDict( + {'band_bin_center.fmt' : '../../../COVIMS_0002/label/band_bin_center.fmt', + 'core_description.fmt' : '../../../COVIMS_0002/label/core_description.fmt', + 'suffix_description.fmt': '../../../COVIMS_0002/label/suffix_description.fmt', + 'BAND_BIN_CENTER.FMT' : '../../../COVIMS_0002/label/band_bin_center.fmt', + 'CORE_DESCRIPTION.FMT' : '../../../COVIMS_0002/label/core_description.fmt', + 'SUFFIX_DESCRIPTION.FMT': '../../../COVIMS_0002/label/suffix_description.fmt'})), + (r'.*/COVIMS_0001/document/archsis\.txt', 0, + translator.TranslatorByDict( + {'band_bin_center.fmt' : '../../COVIMS_0002/label/band_bin_center.fmt', + 'core_description.fmt' : '../../COVIMS_0002/label/core_description.fmt', + 'suffix_description.fmt': '../../COVIMS_0002/label/suffix_description.fmt', + 'BAND_BIN_CENTER.FMT' : '../../COVIMS_0002/label/band_bin_center.fmt', + 'CORE_DESCRIPTION.FMT' : '../../COVIMS_0002/label/core_description.fmt', + 'SUFFIX_DESCRIPTION.FMT': '../../COVIMS_0002/label/suffix_description.fmt'})), + (r'.*/COVIMS_0.*/document/archsis\.txt', 0, + translator.TranslatorByDict( + {'suffix.cat' : ''})), + (r'.*/COVIMS_0.*/errata\.txt', 0, + translator.TranslatorByDict( + {'center.fmt' : 'label/band_bin_center.fmt'})), + (r'.*/COVIMS_0024/data/2008017T190718_2008017T201544/v1579292302_1\.lbl', 0, + translator.TranslatorByDict( + {"v1579292302.qub" : "v1579292302_1.qub"})), + (r'.*/metadata/COVIMS.*/.*supplemental_index.lbl', 0, + translator.TranslatorByDict( + {'dpsis.txt': '../../../volumes/COVIMS_0xxx/COVIMS_0001/document/dpsis.txt'})), + (r'.*/COVIMS_8xxx_v2.*/voldesc.cat', 0, + translator.TranslatorByDict( + {'PROJREF.CAT' : ''})), + + # EBROCC + (r'.*/EBROCC_0001/INDEX/MCD_INDEX\.LBL', 0, + translator.TranslatorByDict( + {'LIC_INDEX.TAB' : 'MCD_INDEX.TAB'})), + (r'.*/EBROCC_0001/INDEX/PAL_INDEX\.LBL', 0, + translator.TranslatorByDict( + {'LIC_INDEX.TAB' : 'PAL_INDEX.TAB'})), + (r'.*/EBROCC_0001/SORCDATA/ESO1M/ES1_INGRESS_GEOMETRY\.LBL', 0, + translator.TranslatorByDict( + {'ES1_INGRESS_GEOMETRY.LBL': 'ES1_INGRESS_GEOMETRY.DAT'})), + + # GO + (r'.*/GO_0xxx.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'ttds.cat' : '../GO_0020/CATALOG/TTDS.CAT'})), + (r'.*/GO_0xxx_v1/GO_00(0[789]|1[0-6])/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'CATSTATUS.TXT' : 'DOCUMENT/CATSTAT.TXT'})), + (r'.*/GO_0xxx.*/GO_0001/CATALOG/DATASET\.CAT', 0, + translator.TranslatorByRegex( + [(r'(\w\w\w[1-4][sf]_blm02\.img)', 0, r'../BLEMISH/#UPPER#\1'), + (r'(\w\w\w[sf]_cal0[1-5]\.dat)', 0, r'../SLOPE/#UPPER#\1'), + (r'([123][sf]\w+_dc0[1-5]\.dat)', 0, r'../DARK/#UPPER#\1'), + (r'calibration_so02.img', 0, r'../SHUTTER/CALIBRATION_SO02.IMG')])), + (r'.*/GO_0xxx.*/GO_000[2-6]/CATALOG/DATASET\.CAT', 0, + translator.TranslatorByDict( + {'V_E1DS.CAT' : ''})), + (r'.*/GO_0xxx.*/GO_0001/DOCUMENT/PDSLABEL\.TXT', 0, + translator.TranslatorByDict( + {'RLINEPRX.FMT' : '../../GO_0002/LABEL/RLINEPRX.FMT', + 'RTLMTAB.FMT' : '../../GO_0002/LABEL/RTLMTAB.FMT'})), + (r'.*/GO_0xxx_v1/GO_0001/INDEX/CUMINDEX\.LBL', 0, + translator.TranslatorByDict( + {'IMGINDEX.TAB' : 'CUMINDEX.TAB'})), + (r'.*/GO_0xxx_v1/GO_0001/INDEX/P1CUMINDEX\.LBL', 0, + 
translator.TranslatorByDict( + {'IMGINDEX.TAB' : 'P1CUMINDEX.TAB'})), + + # HST + (r'.*/HSTJ.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'NST.CAT' : 'CATALOG/INST.CAT'})), + (r'.*/HSTJ.*/CATINFO\.TXT', 0, + translator.TranslatorByDict( + {'NST.CAT' : 'INST.CAT'})), + (r'.*/HSTJ.*_v.*/HSTJ1_0427/DATA/VISIT_02/.*\.LBL', 0, + translator.TranslatorByDict( + {'J96O02JLQ_FLT_WFC1.JPG': '', + 'J96O02JMQ_FLT_WFC1.JPG': '', + 'J96O02JLQ_FLT_WFC2.JPG': 'J96O02JLQ_FLT.JPG', + 'J96O02JMQ_FLT_WFC2.JPG': 'J96O02JMQ_FLT.JPG', + 'J96O02JOQ_FLT_WFC2.JPG': 'J96O02JOQ_FLT.JPG', + 'J96O02JQQ_FLT_WFC2.JPG': 'J96O02JQQ_FLT.JPG', + 'J96O02JSQ_FLT_WFC2.JPG': 'J96O02JSQ_FLT.JPG'})), + (r'.*/HSTJx_xxxx.*_v.*/HSTJ1_2395/DATA/.*\.LBL', 0, + translator.TranslatorByDict( + {'JBNY02SOQ_FLT_WFC1.JPG': '', + 'JBNY02SOQ_FLT_WFC2.JPG': 'JBNY02SOQ_FLT.JPG', + 'JBNY02SQQ_FLT_WFC2.JPG': 'JBNY02SQQ_FLT.JPG', + 'JBNY02SSQ_FLT_WFC2.JPG': 'JBNY02SSQ_FLT.JPG', + 'JBNYA1T2Q_FLT_WFC2.JPG': 'JBNYA1T2Q_FLT.JPG', + 'JBNYA2SUQ_FLT_WFC2.JPG': 'JBNYA2SUQ_FLT.JPG'})), + + # JNOJIR + (r'.*/JNOJIR.*/AAREADME.TXT', 0, + translator.TranslatorByDict( + {'PERSON.CAT' : 'JNO_JIRAM_PERSON.CAT', + 'DATAINFO.TXT' : ''})), + (r'.*/JNOJIR.*/JIR_IMG_\w+_RESPONSIVITY_V03.LBL', 0, + translator.TranslatorByRegex( + [(r'(JIR_IMG_\w+_RESPONSIVITY)_V02\.DAT', 0, r'\1_V03.DAT')])), + (r'.*/JNOJIR_20(2[789]|3\d)/DATA/JIR_\w+.LBL', 0, + translator.TranslatorByRegex( + [(r'(JIR_IMG_\w+_RESPONSIVITY)_V02\.DAT', 0, r'../CALIB/\1_V03.DAT')])), + # Embedded list comprehension + # Each links a SOURCE_PRODUCT_ID on JNOJIR_2nnn to the associated EDR in + # the parallel directory on JNOJIR_1nnn. Set up through volume _2049. + ] + [ + (fr'.*/JNOJIR_xxxx/JNOJIR_20{nn:02d}/DATA/JIR_\w+.LBL', 0, + translator.TranslatorByRegex( + [(r'(JIR_\w+_EDR_20\w+)\.(DAT|IMG)', 0, + fr'../../JNOJIR_10{nn:02d}/DATA/\1.\2')])) + for nn in range(0,50)] + [ + + # JNOJNC + (r'.*/JNOJNC.*/(AAREADME|CATINFO).TXT', 0, + translator.TranslatorByDict( + {'JUNO_REF.CAT' : 'JUNO_PROJREF.CAT'})), + + # NHSP (and *SP_xxxx) + (r'.*/NHSP_xxxx_v1.*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'personel.cat' : 'CATALOG/PERSONNEL.CAT', + 'spiceds.cat' : 'CATALOG/SPICE_INST.CAT'})), + (r'.*SP_xxxx.*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'dataset.cat' : 'catalog/spiceds.cat', + 'ckinfo.txt' : 'data/ck/ckinfo.txt', + 'ekinfo.txt' : 'data/ek/ekinfo.txt', + 'fkinfo.txt' : 'data/fk/fkinfo.txt', + 'ikinfo.txt' : 'data/ik/ikinfo.txt', + 'lskinfo.txt' : 'data/lsk/lskinfo.txt', + 'pckinfo.txt' : 'data/pck/pckinfo.txt', + 'sclkinfo.txt' : 'data/sclk/sclkinfo.txt', + 'spkinfo.txt' : 'data/spk/spkinfo.txt', + 'ckdoc.txt' : 'document/ck/ckdoc.txt', + 'ekdoc.txt' : 'document/ek/ekdoc.txt', + 'mkinfo.txt' : 'extras/mk/mkinfo.txt', + 'orbinfo.txt' : 'extras/orbnum/orbinfo.txt', + 'spkxinfo.txt' : 'extras/spkxtra/spkxinfo.txt', + 'covinfo.txt' : 'extras/spkxtra/covtab/covinfo.txt', + 'ckxtinfo.txt' : 'extras/ckxtra/ckxtinfo.txt', + 'navinfo.txt' : 'extras/ckxtra/cknav/navinfo.txt', + 'issinfo.txt' : 'extras/ckxtra/ckiss/issinfo.txt'})), + + # NHxxMV/NHxxLO + (r'.*/NHxx.._xxxx_v1/NH(JU|LA).*/aareadme\.txt', 0, + translator.TranslatorByDict( + {'PAYLOAD_SSR.LBL' : 'document/payload_ssr/payload_ssr.lbl', + 'RALPH_SSR.LBL' : 'document/ralph_ssr/ralph_ssr.lbl', + 'SOC_INST_ICD.LBL' : 'document/soc_inst_icd/soc_inst_icd.lbl'})), + (r'.*/NHxx.._xxxx_v1/NH(JU|LA).*/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'PAYLOAD_SSR.LBL' : 'DOCUMENT/PAYLOAD_SSR/PAYLOAD_SSR.LBL', + 'RALPH_SSR.LBL' : 
'DOCUMENT/RALPH_SSR/RALPH_SSR.LBL', + 'SOC_INST_ICD.LBL' : 'DOCUMENT/SOC_INST_ICD/SOC_INST_ICD.LBL'})), + (r'.*/NHxxLO_xxxx.*/NH..LO_2001/data/\w+/.*\.lbl', 0, + translator.TranslatorByRegex( + [(r'cflat_grnd_SFA_(\w+\.fit)', 0, r'../../calib/cflat_grnd_sfa_\1'), + (r'(cflat|dead|delta|dsmear|hot|sap)_(\w+\.fit)', 0, r'../../calib/\1_\2')])), + (r'.*/NHxxMV_xxxx.*/NH..MV_2001/data/\w+/.*\.lbl', 0, + translator.TranslatorByRegex( + [(r'(mc[0-3])_(flat_\w+\.fit)s', 0, r'../../calib/mcl/\1_\2'), + (r'(mp[12])_(flat_\w+\.fit)s', 0, r'../../calib/mp/\1_\2'), + (r'(mfr_flat_\w+\.fit)s', 0, r'../../calib/mfr/\1')])), + + # RPX + (r'.*/RPX_0101.*/R_HARRIS\.LBL', 0, + translator.TranslatorByDict( + {'R_HARRIS.DF' : 'R_HARRIS.PDF'})), + (r'.*/RPX_0101.*/F161225AB\.LBL', 0, + translator.TranslatorByDict( + {'F161225RB.GIF' : 'F161225AB.GIF'})), + (r'.*/RPX_0201.*/T0808_F1498_CAL\.LBL', 0, + translator.TranslatorByDict( + {'T0808_F1497_CAL.IMG' : 'T0808_F1498_CAL.IMG'})), + (r'.*/RPX_0401/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'INSTHOST.CAT' : 'CATALOG/HOST.CAT'})), + + # Any VG + (r'.*/VG.*/CATALOG/CATINFO\.TXT', 0, + translator.TranslatorByDict( + {'VGnNINST.CAT' : 'VG1NINST.CAT', + 'VGnHOST.CAT' : 'VG1HOST.CAT'})), + + # VG_20xx (IRIS) + (r'.*/VG_2001/.*/VG2_SAT\.LBL', 0, + translator.TranslatorByDict( + {'IRIS_ROWFMT.FMT' : '../JUPITER/IRISHEDR.FMT'})), + (r'.*/VG_2001/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'IRISHEDR.FMT' : 'JUPITER/IRISHEDR.FMT', + 'IRISTRGP.FMT' : 'JUPITER/CALIB/IRISTRGP.FMT'})), + + # VG_28xx (ring profiles) + (r'.*/VG_28[0-9]{2}/.*INFO\.TXT', 0, + translator.TranslatorByDict( + {'RS1SINST.CAT' : 'VG1SINST.CAT', + 'RS2UINST.CAT' : 'VG2UINST.CAT'})), + (r'.*/VG_28xx/VG_2801/CALIB/PS2C01\.LBL', 0, + translator.TranslatorByDict( + {'PS1C01.TAB' : 'PS2C01.TAB'})), + (r'.*/VG_28xx/VG_2801/JITTER/PS1J01\.LBL', 0, + translator.TranslatorByDict( + {'PS1J02.TAB' : 'PS1J01.TAB'})), + (r'.*/VG_28xx/VG_2801/JITTER/PU2J02\.LBL', 0, + translator.TranslatorByDict( + {'PU2J01.TAB' : 'PU2J02.TAB'})), + (r'.*/VG_280./.*/L3GUIDE\.TXT', 0, + translator.TranslatorByDict( + {'RTLMTAB.FMT' : ''})), + (r'.*/VG_2802/EDITDATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'INST.CAT' : '../CATALOG/VG1INST.CAT'})), + (r'.*/VG_2802/EDITDATA/US3D01P\.LBL', 0, + translator.TranslatorByDict( + {'US3D01I.DAT' : 'US3D01P.DAT'})), + (r'.*/VG_2802/SORCDATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'BETAPER.VOY' : 'BETPER.VOY', + 'BETAPER.LBL' : 'BETPER.LBL'})), + (r'.*/VG_2803.*/RS.R1BFV\.LBL', 0, + translator.TranslatorByDict( + {'RS_R1BFT.FMT' : 'RS_R1BFV.FMT'})), + + # VGn_9xxx (RSS) + (r'.*/VG[12]_9.*/CHECKSUMS.TXT', 0, # any file referenced in CHECKSUMS.TXT + # already has a full path; don't search + translator.TranslatorByRegex([(r'(.*)', 0, r'\1')])), + (r'.*/VG[12]_9.*/ERRATA.TXT', 0, + translator.TranslatorByDict( + {'_PERSON.CAT' : 'CATALOG/VG_RSS_PERSON.CAT'})), + (r'.*/VG1_9050/CATALOG/CATINFO.TXT', 0, + translator.TranslatorByDict( + {'MISSION.CAT' : 'VG_MISSION.CAT', + 'INST_HOST.CAT' : 'VG1_INST_HOST.CAT', + 'INST.CAT' : 'VG1_RSS_INST.CAT', + 'DS.CAT' : 'VG1_SAT_RSS_DS.CAT', + 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', + 'REF.CAT' : 'VG1_S_RSS_REF.CAT', + 'TARGET.CAT' : 'VG_SAT_TARGET.CAT', + 'VG1_SAT_TARGET.CAT' : 'VG_SAT_TARGET.CAT'})), + (r'.*/VG1_9056/CATALOG/CATINFO.TXT', 0, + translator.TranslatorByDict( + {'MISSION.CAT' : 'VG_MISSION.CAT', + 'INSTHOST.CAT' : 'VG1_INST_HOST.CAT', + 'INST.CAT' : 'VG1_RSS_INST.CAT', + 'DS.CAT' : 'VG1_SSA_RSS_DS.CAT', + 
'PERSON.CAT' : 'VG_RSS_PERSON.CAT', + 'REF.CAT' : 'VG1_SSA_RSS_REF.CAT', + 'TARGET.CAT' : 'VG_TITAN_TARGET.CAT'})), + (r'.*/VG2_9065/CATALOG/CATINFO.TXT', 0, + translator.TranslatorByDict( + {'MISSION.CAT' : 'VG_MISSION.CAT', + 'INSTHOST.CAT' : 'VG2_INST_HOST.CAT', + 'INST.CAT' : 'VG2_RSS_INST.CAT', + 'DS.CAT' : 'VG2_S_RSS_DS.CAT', + 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', + 'REF.CAT' : 'VG2_S_RSS_REF.CAT', + 'TARGET.CAT' : 'VG_SAT_TARGET.CAT'})), + + # VGIRIS + (r'.*/VGIRIS_0001/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : 'DATA/JUPITER_VG1/JUPITER_ASCII.FMT', + 'JUPITER_LSB.FMT' : 'DATA/JUPITER_VG1/JUPITER_LSB.FMT', + 'JUPITER_MSB.FMT' : 'DATA/JUPITER_VG1/JUPITER_MSB.FMT', + 'SATURN_ASCII.FMT' : '', + 'SATURN_LSB.FMT' : '', + 'SATURN_MSB.FMT' : '', + 'VGnINST.CAT' : 'CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : 'CATALOG/VG1HOST.CAT'})), + (r'.*/VGIRIS_0001/DATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : 'JUPITER_VG1/JUPITER_ASCII.FMT', + 'JUPITER_LSB.FMT' : 'JUPITER_VG1/JUPITER_LSB.FMT', + 'JUPITER_MSB.FMT' : 'JUPITER_VG1/JUPITER_MSB.FMT', + 'SATURN_ASCII.FMT' : '', + 'SATURN_LSB.FMT' : '', + 'SATURN_MSB.FMT' : '', + 'VGnINST.CAT' : '../CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : '../CATALOG/VG1HOST.CAT'})), + (r'.*/VGIRIS_0002/AAREADME\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : '', + 'JUPITER_LSB.FMT' : '', + 'JUPITER_MSB.FMT' : '', + 'SATURN_ASCII.FMT' : 'DATA/SATURN_VG1/SATURN_ASCII.FMT', + 'SATURN_LSB.FMT' : 'DATA/SATURN_VG1/SATURN_LSB.FMT', + 'SATURN_MSB.FMT' : 'DATA/SATURN_VG1/SATURN_MSB.FMT', + 'VGnINST.CAT' : 'CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : 'CATALOG/VG1HOST.CAT'})), + (r'.*/VGIRIS_0002/DATA/DATAINFO\.TXT', 0, + translator.TranslatorByDict( + {'JUPITER_ASCII.FMT' : '', + 'JUPITER_LSB.FMT' : '', + 'JUPITER_MSB.FMT' : '', + 'SATURN_ASCII.FMT' : 'SATURN_VG1/SATURN_ASCII.FMT', + 'SATURN_LSB.FMT' : 'SATURN_VG1/SATURN_LSB.FMT', + 'SATURN_MSB.FMT' : 'SATURN_VG1/SATURN_MSB.FMT', + 'VGnINST.CAT' : '../CATALOG/VG1INST.CAT', + 'VGnHOST.CAT' : '../CATALOG/VG1HOST.CAT'})), + + # VGISS + (r'.*/VGISS.*/BROWSE/C34801XX/C3480139_.*\.LBL', 0, + translator.TranslatorByDict( + {'C3480140_CALIB.JPG' : 'C3480139_CALIB.JPG', + 'C3480140_CLEANED.JPG' : 'C3480139_CLEANED.JPG', + 'C3480140_GEOMED.JPG' : 'C3480139_GEOMED.JPG', + 'C3480140_RAW.JPG' : 'C3480139_RAW.JPG'})), + (r'.*/VGISS.*/BROWSE/C43892XX/C4389208_.*\.LBL', 0, + translator.TranslatorByDict( + {'C4389209_CALIB.JPG' : 'C4389208_CALIB.JPG', + 'C4389209_CLEANED.JPG' : 'C4389208_CLEANED.JPG', + 'C4389209_GEOMED.JPG' : 'C4389208_GEOMED.JPG', + 'C4389209_RAW.JPG' : 'C4389208_RAW.JPG'})), +]) + +KNOWN_MISSING_LABELS = translator.TranslatorByRegex([ + (r'.*/document/.*', re.I, 'missing'), + (r'.*/COCIRS_.*\.VAR', 0, 'missing'), + (r'.*/COCIRS_.*VANILLA.*', re.I, 'missing'), + (r'.*/COCIRS_0209/DATA/NAV_DATA/RIN02101300.DAT', 0, 'missing'), + (r'.*/COCIRS_0602/DATA/UNCALIBR/FIFM06021412.DAT', 0, 'missing'), + (r'.*/COISS_00.*/document/report/.*', 0, 'missing'), + (r'.*/COISS_0011/calib.*\.tab', 0, 'missing'), + (r'.*/COISS_0011/calib/calib.tar.gz', 0, 'missing'), + (r'.*/COISS_0011/extras/.*\.pro', 0, 'missing'), + (r'.*/COISS_0011/extras/cisscal.*', 0, 'missing'), + (r'.*/CO(ISS|VIMS)_.*/extras/.*\.(tiff|png|jpg|jpeg|jpeg_small)', + 0, 'missing'), + (r'.*/COSP_xxxx.*\.(pdf|zip|tm|orb)', 0, 'missing'), + (r'.*/COUVIS_.*/SOFTWARE/.*\.(PRO|pro|DAT|IDL|JAR|SAV)',0, 'missing'), + (r'.*/COUVIS_.*/CALIB/.*\.DOC', 0, 'missing'), + 
(r'.*/COUVIS_0xxx.*/SOFTWARE/CALIB/VERSION_4/t.t', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/index/index.csv', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/software/.*', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/calib/example.*', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/calib/.*\.(tab|qub|cub|bin|lbl)', 0, 'missing'), + (r'.*/COVIMS_0xxx.*/browse/.*\.pdf', 0, 'missing'), + (r'.*/COVIMS_0xxx.*\.(lbl|qub)-old_V[0-9]+', 0, 'missing'), + (r'.*/GO_0xxx_v1/GO_0001/CATALOG/REF.CAT.BAK', 0, 'missing'), + (r'.*/GO_0xxx.*/GO_0001/SOFTWARE/GALSOS2.EXE', 0, 'missing'), + (r'.*/GO_0xxx_v1/GO_0016/AAREADME.SL9', 0, 'missing'), + (r'.*/JNOJNC_0xxx.*/EXTRAS/.*\.PNG', 0, 'missing'), + (r'.*/NH.*/browse/.*\.jpg', 0, 'missing'), + (r'.*/NH.*/index/newline', 0, 'missing'), + (r'.*/NHxxMV.*/calib/.*\.png', 0, 'missing'), + (r'.*/NHSP_xxxx.*/DATASET.HTML', 0, 'missing'), + (r'.*/RPX.*/UNZIP532.*', 0, 'missing'), + (r'.*/RPX_xxxx/RPX_0201/CALIB/.*/(-180|128)', 0, 'missing'), + (r'.*/VG.*/VG..NESR\.DAT', 0, 'missing'), + (r'.*/VG_0xxx.*/CUMINDEX.TAB', 0, 'missing'), + (r'.*/VG_0xxx.*/SOFTWARE/.*', 0, 'missing'), + (r'.*/VG._9xxx.*/SOFTWARE/.*', 0, 'missing'), + (r'.*/VG2_9065/BROWSE/C0SR01AA.LOG', 0, 'missing'), + +# These files have internal PDS3 labels, so these are not errors + (r'.*/COISS_3xxx.*\.IMG', 0, 'unneeded'), + (r'.*/COUVIS_.*/SOFTWARE/.*\.txt_.*', 0, 'unneeded'), + (r'.*/VG_.*\.(IMQ|IRQ|IBG)', 0, 'unneeded'), + (r'.*/VG_0xxx.*/(AAREADME.VMS|VTOC.SYS|IMGINDEX.DBF)', 0, 'unneeded'), +]) + +# Match pattern for any file name, but possibly things that are not file names +PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?' + +# Match pattern for the file name in anything of the form "keyword = filename" +TARGET_REGEX1 = re.compile(r'^ *\^?\w+ *= *\(?\{? *' + PATTERN, re.I) + +# Match pattern for a file name on a line by itself +TARGET_REGEX2 = re.compile(r'^ *,? *' + PATTERN, re.I) + +# Match pattern for one or more file names embedded in a row of a text file. +# A file name begins with a letter, followed by any number of letters, digits, +# underscore or dash. Unless the name is "Makefile", it must have one or more +# extensions, each containing one or more characters. It can also have any +# number of directory prefixes separate by slashes. + +LINK_REGEX = re.compile(r'(?:|.*?[^/@\w\.])/?(?:\.\./)*(([A-Z0-9][-\w]+/)*' + + r'(makefile\.?|[A-Z0-9][\w-]*(\.[\w-]+)+))', re.I) + +EXTS_WO_LABELS = set(['.LBL', '.CAT', '.TXT', '.FMT', '.SFD']) + +################################################################################ + +class LinkInfo(object): + """Used internally to describe a link within a specified record of a file. + """ + + def __init__(self, recno, linkname, is_target): + + self.recno = recno # record number + self.linktext = linkname # substring within this record that looks + # like a link. + self.linkname = linkname # link text after possible repair for known + # errors. + self.is_target = is_target # True if, based on the local context, this + # might be a target of a label file + self.target = '' # abspath to target of link, if any. + # If not blank, this file must exist. 
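+
+    # Illustrative example (hypothetical record number): a label line such as
+    #     ^STRUCTURE          = "GEO.FMT"
+    # found at record 12 is stored by read_links() as
+    # LinkInfo(12, 'GEO.FMT', True); .target stays blank until the named file
+    # is actually located on disk.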
+ + def remove_path(self): + """Remove any leading directory path from this LinkInfo object.""" + + if '/' in self.linktext: + self.linktext = self.linktext.rpartition('/')[2] + self.linkname = self.linktext + + def __str__(self): + return ('%d %s %s %s' % (self.recno, self.linktext, str(self.is_target), + self.target or '[' + self.linkname + ']')) + +def generate_links(dirpath, old_links={}, + limits={'info':-1, 'debug':500, 'ds_store':10}, logger=None): + """Generate a dictionary keyed by the absolute file path for files in the + given directory tree, which must correspond to a volume. + + Keys ending in .LBL, .CAT and .TXT return a list of tuples + (recno, link, target) + for each link found. Here, + recno = record number in file; + link = the text of the link; + target = absolute path to the target of the link. + + Other keys return a single string, which indicates the absolute path to the + label file describing this file. + + Unlabeled files not ending in .LBL, .CAT or .TXT return an empty string. + + Also return the latest modification date among all the files checked. + """ + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Finding link shelf files', dirpath, limits) + + try: + + linkinfo_dict = old_links.copy() # abspath: list of LinkInfo objects + label_dict = {k:v for k,v in old_links.items() if isinstance(v,str)} + # abspath: label for this file + abspaths = [] # list of all abspaths + + latest_mtime = 0. + + # Walk the directory tree, one subdirectory "root" at a time... + for (root, dirs, files) in os.walk(dirpath): + + local_basenames = [] # Tracks the basenames in this directory + local_basenames_uc = [] # Same as above, but upper case + for basename in files: + abspath = os.path.join(root, basename) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + + if basename == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store file skipped', abspath) + continue + + if basename.startswith('._'): # skip dot_underscore files + logger.dot_underscore('dot_underscore file skipped', + abspath) + continue + + if basename.startswith('.'): # skip invisible files + logger.invisible('Invisible file skipped', abspath) + continue + + abspaths.append(abspath) + local_basenames.append(basename) + local_basenames_uc.append(basename.upper()) + + # Update linkinfo_dict, searching each relevant file for possible links. + # If the linking file is a label and the target file has a matching + # name, update the label_dict entry for the target. + candidate_labels = {} # {target: list of possible label basenames} + for basename in local_basenames: + + abspath = os.path.join(root, basename) + if abspath in linkinfo_dict: # for update op, skip existing links + continue + + basename_uc = basename.upper() + + # Only check LBL, CAT, TXT, etc. 
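+                # (i.e. extensions listed in EXTS_WO_LABELS; every other file
+                # type is handled later, in the label-identification pass)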
+ ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext not in EXTS_WO_LABELS: + continue + + # Get list of link info for all possible linked filenames + logger.debug('*** REVIEWING', abspath) + linkinfo_list = read_links(abspath, logger=logger) + + # Apply repairs + repairs = REPAIRS.all(abspath) + for info in linkinfo_list: + for repair in repairs: + linkname = repair.first(info.linktext) + if linkname is None: + + # Attempt repair with leading directory path removed + if '/' in info.linktext: + info.remove_path() + linkname = repair.first(info.linktext) + + if linkname is None: + continue # no repair found + + info.linkname = linkname + if linkname == '': + logger.info('Ignoring link "%s"' % + info.linktext, abspath, force=True) + else: + logger.info('Repairing link "%s"->"%s"' % + (info.linktext, linkname), + abspath, force=True) + + # Validate non-local targets of repairs + if '/' in linkname: + target = os.path.join(root, linkname) + if os.path.exists(target): + info.target = os.path.abspath(target) + else: + logger.error('Target of repaired link is missing', + target) + + break # apply only one repair per found link + + # Validate or remove other targets + new_linkinfo_list = [] + baseroot_uc = basename_uc.partition('.')[0] + ltest = len(baseroot_uc) + for info in linkinfo_list: + if info.target: # Non-local, repaired links have targets + new_linkinfo_list.append(info) + continue + + # A blank linkname is from a repair; indicates to ignore + if info.linkname == '': + continue + + # Ignore self-references + linkname_uc = info.linkname.upper() + if linkname_uc == basename_uc: + continue + + # Check for target inside this directory + try: + match_index = local_basenames_uc.index(linkname_uc) + except ValueError: + match_index = None + + # If not found, maybe it is a non-local reference (.FMT perhaps) + if match_index is None: + + # It's easy to pick up floats as link candidates; ignore + try: + _ = float(info.linkname) + continue # Yup, it's just a float + except ValueError: + pass + + if info.linkname[-1] in ('e', 'E'): + try: + _ = float(info.linkname[:-1]) + continue # Float with exponent + except ValueError: + pass + + # Also ignore format specifications (e.g., "F10.3") + if info.linkname[0] in ('F', 'E', 'G'): + try: + _ = float(info.linkname[1:]) + continue # Format + except ValueError: + pass + + # Search non-locally + if '/' in info.linkname: + nonlocal_target = locate_link_with_path(abspath, + info.linkname) + else: + nonlocal_target = locate_nonlocal_link(abspath, + info.linkname) + + # Report the outcome + if nonlocal_target: + logger.debug('Located "%s"' % info.linkname, + nonlocal_target) + info.target = nonlocal_target + new_linkinfo_list.append(info) + continue + + if linkname_uc.endswith('.FMT'): + logger.error('Unable to locate .FMT file "%s"' % + info.linkname, abspath) + elif linkname_uc.endswith('.CAT'): + logger.error('Unable to locate .CAT file "%s"' % + info.linkname, abspath) + else: + logger.debug('Substring "%s" is not a link, ignored' % + info.linkname, abspath) + + continue + + # Save the match + info.linkname = local_basenames[match_index] # update case + info.target = os.path.join(root, info.linkname) + new_linkinfo_list.append(info) + + # Could this be the label? 
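+                    # (only a .LBL file can label another file; the pairing is
+                    # confirmed below when the target's name begins with this
+                    # label's own name root, and is otherwise only recorded as
+                    # a candidate)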
+ if ext != '.LBL': # nope + continue + + # If names match up to '.LBL', then yes + if (len(linkname_uc) > ltest and + linkname_uc[:ltest] == baseroot_uc and + linkname_uc[ltest] == '.'): + label_dict[info.target] = abspath + logger.debug('Label identified for %s' % info.linkname, + abspath) + continue + + # Otherwise, then maybe + if info.is_target: + if info.linkname in candidate_labels: + if basename not in candidate_labels[info.linkname]: + candidate_labels[info.linkname].append(basename) + else: + candidate_labels[info.linkname] = [basename] + + logger.debug('Candidate label found for ' + + info.linkname, abspath) + + linkinfo_dict[abspath] = new_linkinfo_list + + # Identify labels for files + for basename in local_basenames: + + basename_uc = basename.upper() + ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext in (".LBL", ".FMT"): # these can't have labels + continue + + abspath = os.path.join(root, basename) + if abspath in label_dict: + continue # label already found + + # Maybe we already know the label is missing + test = KNOWN_MISSING_LABELS.first(abspath) + if test == 'unneeded': + logger.debug('Label is not neeeded', abspath) + continue + + if test == 'missing': + logger.debug('Label is known to be missing', abspath) + continue + + # Determine if a label is required + label_is_required = (ext not in EXTS_WO_LABELS) + + # Get the list of candidate labels in this directory + candidates = candidate_labels.get(basename, []) + + # Determine if the obvious label file exists + label_guess_uc = basename_uc.partition('.')[0] + '.LBL' + if label_guess_uc in local_basenames_uc: + k = local_basenames_uc.index(label_guess_uc) + obvious_label_basename = local_basenames[k] + else: + obvious_label_basename = '' + + # Simplest case... + if obvious_label_basename in candidates: + if not label_is_required: + logger.debug('Unnecessary label found', abspath, force=True) + + label_dict[abspath] = os.path.join(root, obvious_label_basename) + continue + + # More cases... + if not label_is_required: + continue # leave abspath out of label_dict + + # Report a phantom label + if obvious_label_basename: + logger.error('Label %s does not point to file' % + local_basenames[k], abspath) + + if len(candidates) == 1: + logger.debug('Label found as ' + candidates[0], abspath, + force=True) + label_dict[abspath] = os.path.join(root, candidates[0]) + continue + + # or errors... + label_dict[abspath] = "" + if len(candidates) == 0: + logger.error('Label is missing', abspath) + else: + logger.error('Ambiguous label found as %s' % candidates[0], + abspath, force=True) + for candidate in candidates[1:]: + logger.debug('Alternative label found as %s' % candidate, + abspath, force=True) + + # Merge the dictionaries + # There are cases where a file can have both a list of links and a label. + # This occurs when a .TXT or .CAT file has a label, even though it didn't + # need one. In the returned dictionary, link lists take priority. 
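+        # Illustrative result (hypothetical names, paths shortened):
+        #   link_dict['.../catalog/catinfo.txt'] = [(8, 'inst.cat', '.../catalog/inst.cat')]
+        #   link_dict['.../data/image1.img']     = '.../data/image1.lbl'   (path of its label)
+        #   link_dict['.../data/unlabeled.jpg']  = ''                      (no label found)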
+ link_dict = {} + for key in abspaths: + if key in linkinfo_dict: + # If this is a new entry, it's a list of LinkInfo objects + # If this was copied from old_links, it's already a list of tuples + values = linkinfo_dict[key] + if isinstance(values, list): + new_list = [] + for item in values: + if isinstance(item, LinkInfo): + new_list.append((item.recno, item.linktext, item.target)) + else: + new_list.append(item) + link_dict[key] = new_list + else: + link_dict[key] = values + elif key in label_dict: + link_dict[key] = label_dict[key] + else: + link_dict[key] = '' + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + return (link_dict, latest_mtime) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +def read_links(abspath, logger=None): + """Return a list of LinkInfo objects for anything linked or labeled by this + file. + """ + + with open(abspath, 'r', encoding='latin-1') as f: + recs = f.readlines() + + links = [] + multiple_targets = False + for recno,rec in enumerate(recs): + + while True: + + # Search for the target of a link + is_target = True + matchobj = TARGET_REGEX1.match(rec) + if matchobj: + subrec = rec[:matchobj.end()] + if '(' in subrec or '{' in subrec: + multiple_targets = True + + # ... on the same line or the next line + elif multiple_targets: + matchobj = TARGET_REGEX2.match(rec) + + # If not found, search for any other referenced file name or path + if not matchobj: + if ')' in rec or '}' in rec: + multiple_targets = False + + is_target = False + matchobj = LINK_REGEX.match(rec) + if matchobj: + multiple_targets = False + + # No more matches in this record + if not matchobj: + break + + linktext = matchobj.group(1) + links.append(LinkInfo(recno, linktext, is_target)) + + rec = rec[matchobj.end():] + + return links + +def locate_nonlocal_link(abspath, filename): + """Return the absolute path associated with a link in a PDS file. This is + done by searching up the tree and also by looking inside the LABEL, + CATALOG and INCLUDE directories if they exist.""" + + filename_uc = filename.upper() + + parts = abspath.split('/')[:-1] + + # parts are [..., 'holdings', 'volumes', volset, volname, ...] + # Therefore, if 'holdings' is in parts[:-3], then there's a volname in this + # path. + while 'pds4-holdings' in parts[:-3]: + testpath = '/'.join(parts) + basenames = os.listdir(testpath) + basenames_uc = [b.upper() for b in basenames] + try: + k = basenames_uc.index(filename_uc) + return testpath + '/' + basenames[k] + except ValueError: + pass + + for dirname in ['LABEL', 'CATALOG', 'INCLUDE', 'INDEX', 'DOCUMENT', + 'DATA', 'CALIB', 'EXTRAS', 'SOFTWARE']: + try: + k = basenames_uc.index(dirname) + subnames = os.listdir(testpath + '/' + basenames[k]) + subupper = [s.upper() for s in subnames] + try: + kk = subupper.index(filename_uc) + return testpath + '/' + basenames[k] + '/' + subnames[kk] + except ValueError: + pass + except ValueError: + pass + + parts = parts[:-1] + + return '' + +def locate_link_with_path(abspath, filename): + """Return the absolute path associated with a link that contains a leading + directory path. 
+ """ + + parts = filename.split('/') + link_path = locate_nonlocal_link(abspath, parts[0]) + if not link_path: + return '' + + for part in parts[1:]: + basenames = os.listdir(link_path) + if part in basenames: + link_path += '/' + part + else: + basenames_uc = [b.upper() for b in basenames] + part_uc = part.upper() + if part_uc in basenames_uc: + k = basenames_uc.index(part_uc) + link_path += '/' + basenames[k] + else: + return '' + + return link_path + +################################################################################ + +def load_links(dirpath, limits={}, logger=None): + """Load link dictionary from a shelf file, converting interior paths to + absolute paths.""" + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + dirpath_ = dirpath.rstrip('/') + '/' + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Reading link shelf file for', dirpath, limits) + + try: + (link_path, lskip) = pdsdir.shelf_path_and_lskip('link') + prefix_ = pdsdir.volume_abspath() + '/' + + logger.info('Link shelf file', link_path) + + if not os.path.exists(link_path): + raise IOError('File not found: ' + link_path) + + # Read the shelf file and convert to a dictionary + with open(link_path, 'rb') as f: + interior_dict = pickle.load(f) + + # Convert interior paths to absolute paths + link_dict = {} + for (key, values) in interior_dict.items(): + long_key = dirpath_ + key + + if isinstance(values, list): + new_list = [] + for (recno, basename, interior_path) in values: + abspath = dirpath_ + str(interior_path) + if '../' in abspath: + abspath = os.path.abspath(abspath) + + new_list.append((recno, str(basename), abspath)) + + link_dict[long_key] = new_list + else: + values = str(values) + if values == '': + link_dict[long_key] = '' + else: + link_dict[long_key] = dirpath_ + values + + return link_dict + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def write_linkdict(dirpath, link_dict, limits={}, logger=None): + """Write a new link shelf file for a directory tree.""" + + # Initialize + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing link shelf file for', dirpath, limits) + + try: + (link_path, lskip) = pdsdir.shelf_path_and_lskip('link') + logger.info('Link shelf file', link_path) + + # Create a dictionary using interior paths instead of absolute paths + interior_dict = {} + prefix = (dirpath + '/')[:lskip] + for (key, values) in link_dict.items(): + if isinstance(values, list): + new_list = [] + for (basename, recno, link_abspath) in values: + if link_abspath[:lskip] == prefix: + new_list.append((basename, recno, link_abspath[lskip:])) + else: # link outside this volume + link = pdsfile.Pds4File.from_abspath(link_abspath) + if (link.category_ == pdsdir.category_ and + link.bundleset == pdsdir.bundleset and + link.suffix == pdsdir.suffix): + link_relpath = '../' + link.bundlename_ + link.interior + elif link.category_ == pdsdir.category_: + link_relpath = ('../../' + link.bundleset_ + + link.bundlename_ + link.interior) + else: + link_relpath = ('../../../' + link.category_ + + link.bundleset_ + + link.bundlename_ + link.interior) + new_list.append((basename, recno, link_relpath)) + + 
interior_dict[key[lskip:]] = new_list + else: + interior_dict[key[lskip:]] = values[lskip:] + + # Create parent directory if necessary + parent = os.path.split(link_path)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + # Write the shelf + with open(link_path, 'wb') as f: + pickle.dump(interior_dict, f) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + logger.open('Writing Python dictionary', dirpath) + try: + # Determine the maximum length of the file path and basename + len_key = 0 + len_base = 0 + for (key, value) in interior_dict.items(): + len_key = max(len_key, len(key)) + if isinstance(value, list): + tuples = value + for (recno, basename, interior_path) in tuples: + len_base = max(len_base, len(basename)) + + len_key = min(len_key, 60) + + # Write the python dictionary version + python_path = link_path.rpartition('.')[0] + '.py' + name = os.path.basename(python_path) + parts = name.split('_') + name = '_'.join(parts[:2]) + '_links' + keys = list(interior_dict.keys()) + keys.sort() + + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for valtype in (list, str): + for key in keys: + if not isinstance(interior_dict[key], valtype): continue + + f.write(' "%s"' % key) + if len(key) < len_key: + f.write((len_key - len(key)) * ' ') + f.write(': ') + tuple_indent = max(len(key),len_key) + 7 + + values = interior_dict[key] + if isinstance(values, str): + f.write('"%s",\n' % values) + elif len(values) == 0: + f.write('[],\n') + else: + f.write('[') + for k in range(len(values)): + (recno, basename, interior_path) = values[k] + f.write('(%4d, ' % recno) + f.write('"%s, ' % (basename + '"' + + (len_base-len(basename)) * ' ')) + f.write('"%s")' % interior_path) + + if k < len(values) - 1: + f.write(',\n' + tuple_indent * ' ') + else: + f.write('],\n') + + f.write('}\n\n') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_links(dirpath, dirdict, shelfdict, limits={}, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Validating link shelf file for', dirpath, limits=limits) + + try: + keys = list(dirdict.keys()) + for key in keys: + if key in shelfdict: + dirinfo = dirdict[key] + shelfinfo = shelfdict[key] + + if type(dirinfo) == list: + dirinfo.sort() + + if type(shelfinfo) == list: + shelfinfo.sort() + + if dirinfo != shelfinfo: + logger.error('Link target mismatch', key) + + del shelfdict[key] + del dirdict[key] + + keys = list(dirdict.keys()) + keys.sort() + for key in keys: + logger.error('Missing link shelf file entry for', key) + + keys = list(shelfdict.keys()) + keys.sort() + for key in keys: + logger.error('Link shelf file entry found for missing file', key) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + return logger.close() + +################################################################################ + +def move_old_links(shelf_file, logger=None): + """Move a file to the /logs/ directory tree and append a time tag.""" + + if not os.path.exists(shelf_file): return + + shelf_basename = os.path.basename(shelf_file) + (shelf_prefix, shelf_ext) = 
os.path.splitext(shelf_basename) + + if logger is None: + logger = pdslogger.PdsLogger.get_logger(LOGNAME) + + from_logged = False + for log_dir in LOGDIRS: + dest_template = log_dir + '/' + shelf_prefix + '_v???' + shelf_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(shelf_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(shelf_file, dest) + + if not from_logged: + logger.info('Link shelf file moved from: ' + shelf_file) + from_logged = True + + logger.info('Link shelf file moved to ' + dest) + + python_src = shelf_file.rpartition('.')[0] + '.py' + python_dest = dest.rpartition('.')[0] + '.py' + shutil.copy(python_src, python_dest) + + pickle_src = shelf_file.rpartition('.')[0] + '.pickle' + pickle_dest = dest.rpartition('.')[0] + '.pickle' + shutil.copy(pickle_src, pickle_dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file does not exist + if os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Link shelf file already exists', link_path) + return + + # Generate link info + (link_dict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Move old file if necessary + if os.path.exists(link_path): + move_old_links(link_path, logger=logger) + + # Save link files + write_linkdict(pdsdir.abspath, link_dict, logger=logger) + +def reinitialize(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Warn if shelf file does not exist + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + initialize(pdsdir, logger=logger) + return + + # Generate link info + (link_dict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Move old file if necessary + if os.path.exists(link_path): + move_old_links(link_path, logger=logger) + + # Save link files + write_linkdict(pdsdir.abspath, link_dict, logger=logger) + +def validate(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Link shelf file does not exist', link_path) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Validate + validate_links(pdsdir.abspath, dir_linkdict, shelf_linkdict, logger=logger) + +def repair(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, latest_mtime) = generate_links(pdsdir.abspath, logger=logger) + + # Compare + canceled = (dir_linkdict == shelf_linkdict) + if canceled: + 
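+        # The shelf content already matches the directory tree; at most, the
+        # time tags on the existing shelf files are refreshed below.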
logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + link_pypath = link_path.replace('.pickle', '.py') + link_mtime = min(os.path.getmtime(link_path), + os.path.getmtime(link_pypath)) + if latest_mtime > link_mtime: + logger.info('!!! Link shelf file content is up to date', + link_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + dt = datetime.datetime.fromtimestamp(link_mtime) + logger.info('!!! Link shelf file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - link_mtime + if delta >= 86400/10: + logger.info('!!! Link shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Link shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(link_path) + os.utime(link_pypath) + logger.info('!!! Time tag on link shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + else: + logger.info(f'!!! Link shelf file is up to date; repair canceled', + link_path, force=True) + return + + # Move files and write new links + move_old_links(link_path, logger=logger) + write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger) + +def update(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure link shelf file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + initialize(pdsdir, logger=logger) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, + latest_mtime) = generate_links(pdsdir.abspath, shelf_linkdict, + logger=logger) + + # Compare + canceled = (dir_linkdict == shelf_linkdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Link shelf file content is complete; update canceled', + link_path, force=True) + return + + # Move files and write new links + move_old_links(link_path, logger=logger) + write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger) + +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdslinkshelf: Create, maintain and validate shelves of ' + + 'links between files.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create a link shelf file for a volume. Abort ' + + 'if the checksum file already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create a link shelf file for a volume. Replace ' + + 'the file if it already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every link in a volume directory tree ' + + 'against its link shelf file.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every link in a volume directory tree ' + + 'against its link shelf file. If any ' + + 'disagreement is found, replace the shelf ' + + 'file; otherwise leave it unchanged. 
If any of ' + + 'the files checked are newer than the link shelf '+ + 'file, update shelf file\'s modification date') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their links to the link shelf file. Links of ' + + 'pre-existing files are not checked.') + + parser.add_argument('volume', nargs='+', type=str, + help='The path to the root directory of a volume.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdslinkshelf" subdirectory of each log root ' + + 'directory.' + ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdslinkshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdslinkshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of file paths before logging + paths = [] + for path in args.volume: + + if not os.path.exists(path): + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.checksums_: + print('No link shelf files for checksum files: ' + path) + sys.exit(1) + + if pdsf.archives_: + print('No link shelf files for archive files: ' + path) + sys.exit(1) + + if pdsf.is_volset_dir: + paths += [os.path.join(path, c) for c in pdsf.childnames] + + else: + paths.append(os.path.abspath(path)) + + # Loop through tuples... 
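+    # (each entry in paths is the root directory of one volume; a separate
+    # level of the log is opened for each)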
+ logger.open(' '.join(sys.argv)) + try: + for path in paths: + + pdsdir = pdsfile.Pds4File.from_abspath(path) + if not pdsdir.isdir: # skip volset-level readme files + continue + + # Save logs in up to two places + logfiles = set([pdsdir.log_path_for_volume('_links', + task=args.task, + dir='pdslinkshelf'), + pdsdir.log_path_for_volume('_links', + task=args.task, + dir='pdslinkshelf', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_links() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(paths) > 1: + logger.blankline() + + logger.open('Task "' + args.task + '" for', path, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsdir) + + elif args.task == 'reinitialize': + reinitialize(pdsdir) + + elif args.task == 'validate': + validate(pdsdir) + + elif args.task == 'repair': + repair(pdsdir) + + else: # update + update(pdsdir) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/pdsfile/pds3file/__init__.py b/pdsfile/pds3file/__init__.py index cecc02c..a6101e9 100644 --- a/pdsfile/pds3file/__init__.py +++ b/pdsfile/pds3file/__init__.py @@ -50,6 +50,9 @@ class Pds3File(PdsFile): OPUS_ID_TO_SUBCLASS = rules.OPUS_ID_TO_SUBCLASS FILESPEC_TO_BUNDLESET = rules.FILESPEC_TO_BUNDLESET + IDX_EXT = '.tab' + LBL_EXT = '.lbl' + def __init__(self): super().__init__() diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py index c4c6c66..5919a15 100644 --- a/pdsfile/pds4file/__init__.py +++ b/pdsfile/pds4file/__init__.py @@ -68,6 +68,9 @@ class Pds4File(PdsFile): LOCAL_PRELOADED = [] SUBCLASSES = {} + IDX_EXT = '.csv' + LBL_EXT = '.xml' + def __init__(self): super().__init__() From 5509b5a23105d4ec68033465db8b6ff3e9cb7ff8 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Mon, 23 Sep 2024 15:35:57 -0700 Subject: [PATCH 08/21] Update pds4linkshelf.py to make sure we can create files in _linkshelf-* directory --- holdings_maintenance/pds4/pds4linkshelf.py | 27 ++++++----- pdsfile/pds3file/__init__.py | 14 ++++++ pdsfile/pds4file/__init__.py | 14 ++++++ pdsfile/pdsfile.py | 52 ++++++++++------------ 4 files changed, 67 insertions(+), 40 deletions(-) diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index 1b9b093..96adad2 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -628,7 +628,7 @@ LINK_REGEX = re.compile(r'(?:|.*?[^/@\w\.])/?(?:\.\./)*(([A-Z0-9][-\w]+/)*' + r'(makefile\.?|[A-Z0-9][\w-]*(\.[\w-]+)+))', re.I) -EXTS_WO_LABELS = set(['.LBL', '.CAT', '.TXT', '.FMT', '.SFD']) +EXTS_WO_LABELS = set(['.XML', '.CAT', '.TXT', '.FMT', '.SFD']) ################################################################################ 
@@ -698,6 +698,10 @@ def generate_links(dirpath, old_links={}, # Walk the directory tree, one subdirectory "root" at a time... for (root, dirs, files) in os.walk(dirpath): + # skip ring_models dirctory + if 'ring_models' in root: + continue + local_basenames = [] # Tracks the basenames in this directory local_basenames_uc = [] # Same as above, but upper case for basename in files: @@ -860,7 +864,7 @@ def generate_links(dirpath, old_links={}, new_linkinfo_list.append(info) # Could this be the label? - if ext != '.LBL': # nope + if ext != '.XML': # nope continue # If names match up to '.LBL', then yes @@ -890,7 +894,7 @@ def generate_links(dirpath, old_links={}, basename_uc = basename.upper() ext = basename_uc[-4:] if len(basename) >= 4 else '' - if ext in (".LBL", ".FMT"): # these can't have labels + if ext in (".XML", ".FMT"): # these can't have labels continue abspath = os.path.join(root, basename) @@ -914,7 +918,7 @@ def generate_links(dirpath, old_links={}, candidates = candidate_labels.get(basename, []) # Determine if the obvious label file exists - label_guess_uc = basename_uc.partition('.')[0] + '.LBL' + label_guess_uc = basename_uc.partition('.')[0] + '.XML' if label_guess_uc in local_basenames_uc: k = local_basenames_uc.index(label_guess_uc) obvious_label_basename = local_basenames[k] @@ -1569,8 +1573,8 @@ def main(): 'their links to the link shelf file. Links of ' + 'pre-existing files are not checked.') - parser.add_argument('volume', nargs='+', type=str, - help='The path to the root directory of a volume.') + parser.add_argument('--bundle', nargs='+', type=str, + help='The path to the root directory of a bundle.') parser.add_argument('--log', '-l', type=str, default='', help='Optional root directory for a duplicate of the ' + @@ -1619,7 +1623,7 @@ def main(): # Generate a list of file paths before logging paths = [] - for path in args.volume: + for path in args.bundle: if not os.path.exists(path): print('No such file or directory: ' + path) @@ -1636,7 +1640,7 @@ def main(): print('No link shelf files for archive files: ' + path) sys.exit(1) - if pdsf.is_volset_dir: + if pdsf.is_bundleset_dir: paths += [os.path.join(path, c) for c in pdsf.childnames] else: @@ -1648,14 +1652,15 @@ def main(): for path in paths: pdsdir = pdsfile.Pds4File.from_abspath(path) - if not pdsdir.isdir: # skip volset-level readme files + # skip volset-level readme files and *_support dirctiory + if not pdsdir.isdir or '_support' in pdsdir.abspath: continue # Save logs in up to two places - logfiles = set([pdsdir.log_path_for_volume('_links', + logfiles = set([pdsdir.log_path_for_bundle('_links', task=args.task, dir='pdslinkshelf'), - pdsdir.log_path_for_volume('_links', + pdsdir.log_path_for_bundle('_links', task=args.task, dir='pdslinkshelf', place='parallel')]) diff --git a/pdsfile/pds3file/__init__.py b/pdsfile/pds3file/__init__.py index a6101e9..af05135 100644 --- a/pdsfile/pds3file/__init__.py +++ b/pdsfile/pds3file/__init__.py @@ -9,6 +9,7 @@ from pdsfile.pdsfile import PdsFile from . 
import rules from pdsfile.preload_and_cache import cache_lifetime_for_class +import re class Pds3File(PdsFile): @@ -53,6 +54,19 @@ class Pds3File(PdsFile): IDX_EXT = '.tab' LBL_EXT = '.lbl' + BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') + + BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) + BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') + BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) + BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ + r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ + r'_in_prep|_prelim|_peer_review|'+ + r'_lien_resolution)$') + BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) + def __init__(self): super().__init__() diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py index 5919a15..1ebd593 100644 --- a/pdsfile/pds4file/__init__.py +++ b/pdsfile/pds4file/__init__.py @@ -71,6 +71,20 @@ class Pds4File(PdsFile): IDX_EXT = '.csv' LBL_EXT = '.xml' + # TODO: Generalize PDS4 bundlenames in the future once we have more bundles + BUNDLENAME_REGEX = re.compile(r'^([a-zA-z\_].+)$') + + BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) + BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') + BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) + BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ + r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ + r'_in_prep|_prelim|_peer_review|'+ + r'_lien_resolution)$') + BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) + def __init__(self): super().__init__() diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py index 02cb117..a6a84a4 100644 --- a/pdsfile/pdsfile.py +++ b/pdsfile/pdsfile.py @@ -271,20 +271,6 @@ class PdsFile(object): r'(|_md5\.txt|\.tar\.gz))$') BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - BUNDLENAME_REGEX = re.compile(r'^(([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))|([a-zA-z\_].+))$') - # BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') - - BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) - BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') - BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) - BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution)$') - BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) - CATEGORY_REGEX = re.compile(r'^(|checksums\-)(|archives\-)(\w+)$') CATEGORY_REGEX_I = re.compile(CATEGORY_REGEX.pattern, re.I) @@ -1322,11 +1308,11 @@ def os_path_exists(cls, abspath, force_case_sensitive=False): return os.path.exists(abspath) # Handle index rows - if '.tab/' in abspath: - parts = abspath.partition('.tab/') - if not cls.os_path_exists(parts[0] + '.tab'): + if f'{cls.IDX_EXT}/' in abspath: + parts = abspath.partition(f'{cls.IDX_EXT}/') + if not cls.os_path_exists(parts[0] + cls.IDX_EXT): return False - pdsf = cls.from_abspath(parts[0] + '.tab') + pdsf = cls.from_abspath(parts[0] + cls.IDX_EXT) return (pdsf.exists and pdsf.child_of_index(parts[2], flag='').exists) @@ -1867,14 +1853,14 @@ def indexshelf_abspath(self): cls = type(self) if self._indexshelf_abspath is None: - if self.extension not in ('.tab', 
'.TAB'): + if self.extension not in (cls.IDX_EXT, cls.IDX_EXT.upper()): self._indexshelf_abspath = '' else: abspath = self.abspath abspath = abspath.replace(f'/{cls.PDS_HOLDINGS}/', f'/{cls.PDS_HOLDINGS}/_indexshelf-') - abspath = abspath.replace('.tab', '.pickle') - abspath = abspath.replace('.TAB', '.pickle') + abspath = abspath.replace(cls.IDX_EXT, '.pickle') + abspath = abspath.replace(cls.IDX_EXT.upper(), '.pickle') self._indexshelf_abspath = abspath self._recache() @@ -1887,6 +1873,7 @@ def is_index(self): presence of the corresponding indexshelf file. """ + cls = type(self) if self._is_index is None: abspath = self.indexshelf_abspath if abspath and os.path.exists(abspath): @@ -1897,7 +1884,7 @@ def is_index(self): # file is being created. # XXX This is a real hack and should be looked at again later if ('/metadata/' in self.abspath - and self.abspath.lower().endswith('.tab')): + and self.abspath.lower().endswith(cls.IDX_EXT)): return True # this value is not cached self._is_index = False @@ -1913,9 +1900,11 @@ def index_pdslabel(self): if not self.is_index: return None + cls = type(self) if self._index_pdslabel is None: - label_abspath = self.abspath.replace ('.tab', '.lbl') - label_abspath = label_abspath.replace('.TAB', '.LBL') + label_abspath = self.abspath.replace (cls.IDX_EXT, cls.LBL_EXT) + label_abspath = label_abspath.replace(cls.IDX_EXT.upper(), + cls.LBL_EXT.upper()) try: self._index_pdslabel = pdsparser.PdsLabel.from_file(label_abspath) except: @@ -2628,16 +2617,20 @@ def label_basename(self): # Take a first guess at the label filename; PDS3 only! if self.extension.isupper(): - ext_guesses = ('.LBL', '.lbl') + ext_guesses = (cls.LBL_EXT.upper(), cls.LBL_EXT) else: - ext_guesses = ('.lbl', '.LBL') + ext_guesses = (cls.LBL_EXT, cls.LBL_EXT.upper()) rootname = self.basename[:-len(self.extension)] test_basenames = [rootname + ext for ext in ext_guesses] + print('xxxxxxxx') + print(test_basenames) # If one of the guessed files exist, it's the label for test_basename in test_basenames: test_abspath = self.abspath.rpartition('/')[0] + '/' + test_basename + print('1111111111') + print(test_abspath) if cls.os_path_exists(test_abspath, force_case_sensitive=True): self._label_basename_filled = test_basename self._recache() @@ -5475,7 +5468,8 @@ def basename_is_label(self, basename): basename -- basename of a file """ - return (len(basename) > 4) and (basename[-4:].lower() == '.lbl') + cls = type(self) + return (len(basename) > 4) and (basename[-4:].lower() == cls.LBL_EXT) def basename_is_viewable(self, basename=None): """Return True if this basename is viewable. Override if viewable files can @@ -5968,8 +5962,8 @@ def associated_abspaths(self, category, must_exist=True): for pattern in patterns: # Handle an index row by separating the filepath from the suffix - if '.tab/' in pattern: - parts = pattern.rpartition('.tab') + if f'{cls.IDX_EXT}/' in pattern: + parts = pattern.rpartition(cls.IDX_EXT) pattern = parts[0] + parts[1] suffix = parts[2][1:] else: From b0a7944ede8125c3de6ab0e155cfd54302ea580a Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Wed, 25 Sep 2024 10:00:59 -0700 Subject: [PATCH 09/21] Remove the code that bypass _support & ring_modeles when creating checksums, infoshelf, and linkshelf files. 
--- holdings_maintenance/pds4/pds4checksums.py | 4 - holdings_maintenance/pds4/pds4infoshelf.py | 4 - holdings_maintenance/pds4/pds4linkshelf.py | 592 +-------------------- pdsfile/pdsfile.py | 4 - 4 files changed, 7 insertions(+), 597 deletions(-) diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py index b78e333..3d1f02f 100755 --- a/holdings_maintenance/pds4/pds4checksums.py +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -759,10 +759,6 @@ def main(): for (pdsdir, selection) in info: path = pdsdir.abspath - # skip _support dirctory - if '_support' in path: - continue - if selection: pdsf = pdsdir.child(os.path.basename(selection)) else: diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py index 4814356..a5e09b4 100755 --- a/holdings_maintenance/pds4/pds4infoshelf.py +++ b/holdings_maintenance/pds4/pds4infoshelf.py @@ -801,10 +801,6 @@ def main(): try: for (pdsdir, selection) in info: - # skip _support dirctory - if '_support' in pdsdir.abspath: - continue - info_path = pdsdir.shelf_path_and_lskip('info')[0] if selection: diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index 96adad2..37f06fc 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -27,588 +27,9 @@ # Holds log file directories temporarily, used by move_old_links() LOGDIRS = [] -REPAIRS = translator.TranslatorByRegex([ - - # COCIRS - (r'.*/COCIRS_[01].*/DATAINFO\.TXT', 0, - translator.TranslatorByDict( - {'DIAG.FMT' : 'UNCALIBR/DIAG.FMT', - 'FRV.FMT' : 'UNCALIBR/FRV.FMT', - 'GEO.FMT' : 'NAV_DATA/GEO.FMT', - 'HSK.FMT' : 'HSK_DATA/HSK.FMT', - 'IFGM.FMT' : 'UNCALIBR/IFGM.FMT', - 'IHSK.FMT' : 'UNCALIBR/IHSK.FMT', - 'ISPM.FMT' : 'APODSPEC/ISPM.FMT', - 'OBS.FMT' : 'UNCALIBR/OBS.FMT', - 'POI.FMT' : 'NAV_DATA/POI.FMT', - 'RIN.FMT' : 'NAV_DATA/RIN.FMT', - 'TAR.FMT' : 'NAV_DATA/TAR.FMT'})), - (r'.*/COCIRS_[01].*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'DATASIS.TXT' : 'DOCUMENT/DATASIS.PDF', - 'VOLSYS.TXT' : 'DOCUMENT/VOLSYS.PDF'})), - (r'.*/COCIRS_[01].*/DATASET\.CAT', 0, - translator.TranslatorByDict( - {'DATASIS.TXT' : 'DATASIS.PDF'})), - (r'.*/COCIRS_[01].*/SOFTWARE/DOC/SDOCINFO\.TXT', 0, - translator.TranslatorByDict( - {'vanilla_guide.htm' : 'vanilla-guide.html', - 'vanilla_guide.pdf' : 'vanilla-guide.pdf'})), - (r'.*/COCIRS_[01].*/DOCUMENT/DOCINFO\.TXT', 0, - translator.TranslatorByDict( - {'cirs_fov_overview.fig1.tiff' : 'cirs_fov_overview_fig1.tiff', - 'cirs_fov_overview.fig2.tiff' : 'cirs_fov_overview_fig2.tiff', - 'cirs_fov_overview.fig3.tiff' : 'cirs_fov_overview_fig3.tiff'})), - (r'.*/COCIRS_[01].*/CUBE/.*\.(LBL|lbl)', 0, - translator.TranslatorByRegex([ - (r'([0-9A-Z_]+)\.DAT', 0, r'\1.tar.gz')])), - (r'.*/COCIRS_[56].*/TUTORIAL\.TXT', 0, - translator.TranslatorByDict( - {'GEODATA.FMT' : '../DATA/GEODATA/GEODATA.FMT', - 'ISPMDATA.FMT' : '../DATA/ISPMDATA/ISPMDATA.FMT', - 'POIDATA.FMT' : '../DATA/POIDATA/POIDATA.FMT', - 'RINDATA.FMT' : '../DATA/RINDATA/RINDATA.FMT', - 'TARDATA.FMT' : '../DATA/TARDATA/TARDATA.FMT', - 'filename.FMT' : ''})), - (r'.*/COCIRS_[56].*/BROWSE/.*\.LBL', 0, - translator.TranslatorByRegex([ - (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../../DATA/APODSPEC/\1'), - (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/ISPMDATA/\1'), - (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/RINDATA/\1'), - (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../../DATA/POIDATA/\1'), - (r'(TAR[0-9]{10}_FP[134]\.TAB)', 
0, r'../../DATA/TARDATA/\1'), - (r'(GEO[0-9]{10}_[0-9]{3}\.TAB)', 0, r'../../DATA/GEODATA/\1')])), - (r'.*/COCIRS_[56].*/DATA/APODSPEC/.*\.LBL', 0, - translator.TranslatorByRegex([ - (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), - (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), - (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), - (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), - (r'.*/COCIRS_[56].*/DATA/ISPMDATA/.*\.LBL', 0, - translator.TranslatorByRegex([ - (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), - (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), - (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), - (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), - (r'.*/COCIRS_[56].*/DATA/RINDATA/.*\.LBL', 0, - translator.TranslatorByRegex([ - (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), - (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), - (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1'), - (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), - (r'.*/COCIRS_[56].*/DATA/POIDATA/.*\.LBL', 0, - translator.TranslatorByRegex([ - (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), - (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), - (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), - (r'(TAR[0-9]{10}_FP[134]\.TAB)', 0, r'../TARDATA/\1')])), - (r'.*/COCIRS_[56].*/DATA/TARDATA/.*\.LBL', 0, - translator.TranslatorByRegex([ - (r'(SPEC[0-9]{10}_FP[134]\.DAT)', 0, r'../APODSPEC/\1'), - (r'(ISPM[0-9]{10}_FP[134]\.TAB)', 0, r'../ISPMDATA/\1'), - (r'(RIN[0-9]{10}_FP[134]\.TAB)', 0, r'../RINDATA/\1'), - (r'(POI[0-9]{10}_FP[134]\.TAB)', 0, r'../POIDATA/\1')])), - (r'.*/COCIRS_[56].*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'REF.CAT' : 'CATALOG/CIRSREF.CAT'})), - - # COISS - (r'.*/COISS_0.*\.lbl', 0, - translator.TranslatorByDict( - {'PREFIX8.FMT' : 'prefix.fmt'})), - (r'.*/COISS_00.*/aareadme\.txt', 0, - translator.TranslatorByDict( - {'calinfo.txt' : '../COISS_0011/calib/calinfo.txt', - 'extrinfo.txt' : '../COISS_0011/extras/extrinfo.txt'})), - (r'.*/COISS_0.*/index\.lbl', 0, - translator.TranslatorByDict( - {'CUMINDEX.TAB' : 'index.tab'})), - (r'.*/COISS_0011/calib/darkcurrent/wac_\w+_dark_parameters04222\.lbl', 0, - translator.TranslatorByRegex([ - (r'wac_(\w+)_dark_parameters04228\.xdr', 0, r'wac_\1_dark_parameters04222.xdr')])), - (r'.*/COISS_[012].*/aareadme\.txt', 0, - translator.TranslatorByDict( - {'Calds.CAT' : '../../COISS_0xxx/COISS_0001/catalog/calds.cat', - 'calds.cat' : '../../COISS_0xxx/COISS_0001/catalog/calds.cat', - 'Jupiterds.CAT' : '../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', - 'jupiterds.cat' : '../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', - 'Saturnds.CAT' : '../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', - 'saturnds.cat' : '../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', - 'calinfo.txt' : '../../COISS_0xxx/COISS_0011/calib/calinfo.txt', - 'calib.tar.gz' : '../../COISS_0xxx/COISS_0011/calib/calib.tar.gz', - 'in_flight_cal.tex' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.tex', - 'in_flight_cal.pdf' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.pdf', - 'in_flight_cal.lbl' : '../../COISS_0xxx/COISS_0011/document/in_flight_cal.lbl', - 'theoretical_basis.tex': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.tex', - 'theoretical_basis.pdf': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.pdf', - 'theoretical_basis.lbl': '../../COISS_0xxx/COISS_0011/document/theoretical_basis.lbl', - 'theoretical_basis.ps' : 
'../../COISS_0xxx/COISS_0011/document/theoretical_basis.pdf', - 'cisscal.tar.gz' : '../../COISS_0xxx/COISS_0011/extras/cisscal.tar.gz'})), - (r'.*/COISS_[012].*/archsis\.txt', 0, - translator.TranslatorByDict( - {'Calds.CAT' : '../../../COISS_0xxx/COISS_0001/catalog/calds.cat', - 'calds.cat' : '../../../COISS_0xxx/COISS_0001/catalog/calds.cat', - 'Jupiterds.CAT' : '../../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', - 'jupiterds.cat' : '../../../COISS_1xxx/COISS_1001/catalog/jupiterds.cat', - 'Saturnds.CAT' : '../../../COISS_2xxx/COISS_2001/catalog/saturnds.cat', - 'saturnds.cat' : '../../../COISS_2xxx/COISS_2001/catalog/saturnds.cat'})), - - # COUVIS - (r'.*/COUVIS_0.*/INDEX\.LBL', 0, - translator.TranslatorByDict( - {'CUBEDS.CAT' : '../CATALOG/SCUBEDS.CAT'})), - (r'.*/COUVIS_0.*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'INST.CAT' : 'CATALOG/UVISINST.CAT', - 'XCALDS.CAT' : 'CATALOG/SCALDS.CAT', - 'XCUBEDS.CAT' : 'CATALOG/SCUBEDS.CAT', - 'XSPECDS.CAT' : 'CATALOG/SSPECDS.CAT', - 'XSSBDS.CAT' : 'CATALOG/SSSBDS.CAT', - 'XWAVDS.CAT' : 'CATALOG/SWAVDS.CAT'})), - (r'.*/COUVIS_0.*/CATALOG/.*\.CAT', 0, - translator.TranslatorByDict( - {'SPECDS.CAT' : 'SSPECDS.CAT', - 'CUBEDS.CAT' : 'SCUBEDS.CAT'})), - (r'.*/COUVIS_0.*/SOFTWARE/READERS/READERS_README.TXT', 0, - translator.TranslatorByDict( - {'CATALOG/CUBEDS.CAT' : '../../CATALOG/SCUBEDS.CAT'})), - (r'.*/COUVIS_0.*/SOFTWARE/READERS/OLD.*/READERS_README.TXT', 0, - translator.TranslatorByDict( - {'CATALOG/CUBEDS.CAT' : '../../../CATALOG/SCUBEDS.CAT'})), - (r'.*/COUVIS_8xxx/.*/aareadme\.txt', 0, - translator.TranslatorByDict( - {'inst.cat' : 'catalog/uvisinst.cat'})), - (r'.*/COUVIS_8xxx_v1.*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'INST.CAT' : 'CATALOG/UVISINST.CAT'})), - (r'.*/COUVIS_8xxx_v2.*/voldesc\.cat', 0, - translator.TranslatorByDict( - {'UVISINST.CAT' : 'catalog/inst.cat', - 'PROJREF.CAT' : ''})), - (r'.*/COUVIS_8xxx_v1/.*/CATINFO\.TXT', re.I, - translator.TranslatorByDict( - {'INST.CAT' : 'UVISINST.CAT'})), - (r'.*/COUVIS_8xxx(|_v2\.0)/.*/voldesc\.cat', re.I, - translator.TranslatorByDict( - {'PROJREF.CAT' : ''})), - (r'.*/metadata/.*/COUVIS_0.*_index\.lbl', 0, - translator.TranslatorByDict( - {'CUBEDS.CAT' : ''})), - - # COVIMS - (r'.*/COVIMS_0001/aareadme\.txt', 0, - translator.TranslatorByDict( - {'band_bin_center.fmt' : '../COVIMS_0002/label/band_bin_center.fmt', - 'core_description.fmt' : '../COVIMS_0002/label/core_description.fmt', - 'suffix_description.fmt': '../COVIMS_0002/label/suffix_description.fmt', - 'labinfo.txt' : '../COVIMS_0002/label/labinfo.txt'})), - (r'.*/COVIMS_0.../aareadme\.txt', 0, - translator.TranslatorByDict( - {'caldoc.txt' : 'software/doc/caldoc.txt', - 'make_dark.sav' : 'software/bin/make_dark.sav', - 'ppvl_10_1.zip' : 'software/lib/ppvl_1_10.zip', - 'ppvl_1_10.zip' : 'software/lib/ppvl_1_10.zip', - 'libPPVL.a' : 'software/lib/ppvl_1_10/libPPVL.a', - 'Makefile' : 'software/lib/ppvl_1_10/Makefile', - 'Makefile.sun' : 'software/lib/ppvl_1_10/Makefile.sun', - 'PIRL_strings.c' : 'software/lib/ppvl_1_10/PIRL_strings.c', - 'PIRL_strings.h' : 'software/lib/ppvl_1_10/PIRL_strings.h', - 'PPVL.c' : 'software/lib/ppvl_1_10/PPVL.c', - 'PPVL.h' : 'software/lib/ppvl_1_10/PPVL.h', - 'PPVL-README' : 'software/lib/ppvl_1_10/PPVL-README', - 'PPVL_report.c' : 'software/lib/ppvl_1_10/PPVL_report.c', - 'PPVL_selections.c' : 'software/lib/ppvl_1_10/PPVL_selections.c', - 'PPVL_selections.h' : 'software/lib/ppvl_1_10/PPVL_selections.h', - 'RANLIB.csh' : 'software/lib/ppvl_1_10/RANLIB.csh', - 'README' : 
'software/lib/ppvl_1_10/README', - 'PPVL.3' : 'software/lib/ppvl_1_10/doc/PPVL.3', - 'PPVL_selections.3' : 'software/lib/ppvl_1_10/doc/PPVL_selections.3', - 'PPVL_report.1' : 'software/lib/ppvl_1_10/doc/PPVL_report.1', - 'PPVL_get_PDS_EOL.3' : 'software/lib/ppvl_1_10/doc/PPVL_get_PDS_EOL.3', - 'bp_trans.c' : 'software/src/c/cube_prep/bp_trans.c', - 'cube_prep.c' : 'software/src/c/cube_prep/cube_prep.c', - 'error.h' : 'software/src/c/ir_bg/error.h', - 'fit.c' : 'software/src/c/ir_bg/fit.c', - 'ir_bg.c' : 'software/src/c/ir_bg/ir_bg.c', - 'ir_bg_sub.c' : 'software/src/c/ir_bg_sub/ir_bg_sub.c', - 'mark_saturated.c' : 'software/src/c/mark_saturated/mark_saturated.c', - 'make_dark.pro' : 'software/src/idl/make_dark.pro', - 'vims_cal_pipe.pl' : 'software/src/perl/vims_cal_pipe.pl', - 'cal_pipe2.pm' : 'software/src/perl/cal_pipe2/cal_pipe2.pm', - 'cal_occultation.pm' : 'software/src/perl/cal_pipe2/cal_occultation.pm', - 'cal_point.pm' : 'software/src/perl/cal_pipe2/cal_point.pm', - 'dark_vis.pm' : 'software/src/perl/cal_pipe2/dark_vis.pm', - 'flat_ir2.pm' : 'software/src/perl/cal_pipe2/flat_ir2.pm', - 'flat_vis2.pm' : 'software/src/perl/cal_pipe2/flat_vis2.pm', - 'isis_geo.pm' : 'software/src/perl/cal_pipe2/isis_geo.pm', - 'solar_remove.pm' : 'software/src/perl/cal_pipe2/solar_remove.pm', - 'specific_energy.pm' : 'software/src/perl/cal_pipe2/specific_energy.pm'})), - (r'.*/COVIMS_0001/data/.*\.lbl', 0, - translator.TranslatorByDict( - {'band_bin_center.fmt' : '../../../COVIMS_0002/label/band_bin_center.fmt', - 'core_description.fmt' : '../../../COVIMS_0002/label/core_description.fmt', - 'suffix_description.fmt': '../../../COVIMS_0002/label/suffix_description.fmt', - 'BAND_BIN_CENTER.FMT' : '../../../COVIMS_0002/label/band_bin_center.fmt', - 'CORE_DESCRIPTION.FMT' : '../../../COVIMS_0002/label/core_description.fmt', - 'SUFFIX_DESCRIPTION.FMT': '../../../COVIMS_0002/label/suffix_description.fmt'})), - (r'.*/COVIMS_0001/document/archsis\.txt', 0, - translator.TranslatorByDict( - {'band_bin_center.fmt' : '../../COVIMS_0002/label/band_bin_center.fmt', - 'core_description.fmt' : '../../COVIMS_0002/label/core_description.fmt', - 'suffix_description.fmt': '../../COVIMS_0002/label/suffix_description.fmt', - 'BAND_BIN_CENTER.FMT' : '../../COVIMS_0002/label/band_bin_center.fmt', - 'CORE_DESCRIPTION.FMT' : '../../COVIMS_0002/label/core_description.fmt', - 'SUFFIX_DESCRIPTION.FMT': '../../COVIMS_0002/label/suffix_description.fmt'})), - (r'.*/COVIMS_0.*/document/archsis\.txt', 0, - translator.TranslatorByDict( - {'suffix.cat' : ''})), - (r'.*/COVIMS_0.*/errata\.txt', 0, - translator.TranslatorByDict( - {'center.fmt' : 'label/band_bin_center.fmt'})), - (r'.*/COVIMS_0024/data/2008017T190718_2008017T201544/v1579292302_1\.lbl', 0, - translator.TranslatorByDict( - {"v1579292302.qub" : "v1579292302_1.qub"})), - (r'.*/metadata/COVIMS.*/.*supplemental_index.lbl', 0, - translator.TranslatorByDict( - {'dpsis.txt': '../../../volumes/COVIMS_0xxx/COVIMS_0001/document/dpsis.txt'})), - (r'.*/COVIMS_8xxx_v2.*/voldesc.cat', 0, - translator.TranslatorByDict( - {'PROJREF.CAT' : ''})), - - # EBROCC - (r'.*/EBROCC_0001/INDEX/MCD_INDEX\.LBL', 0, - translator.TranslatorByDict( - {'LIC_INDEX.TAB' : 'MCD_INDEX.TAB'})), - (r'.*/EBROCC_0001/INDEX/PAL_INDEX\.LBL', 0, - translator.TranslatorByDict( - {'LIC_INDEX.TAB' : 'PAL_INDEX.TAB'})), - (r'.*/EBROCC_0001/SORCDATA/ESO1M/ES1_INGRESS_GEOMETRY\.LBL', 0, - translator.TranslatorByDict( - {'ES1_INGRESS_GEOMETRY.LBL': 'ES1_INGRESS_GEOMETRY.DAT'})), - - # GO - (r'.*/GO_0xxx.*/AAREADME\.TXT', 
0, - translator.TranslatorByDict( - {'ttds.cat' : '../GO_0020/CATALOG/TTDS.CAT'})), - (r'.*/GO_0xxx_v1/GO_00(0[789]|1[0-6])/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'CATSTATUS.TXT' : 'DOCUMENT/CATSTAT.TXT'})), - (r'.*/GO_0xxx.*/GO_0001/CATALOG/DATASET\.CAT', 0, - translator.TranslatorByRegex( - [(r'(\w\w\w[1-4][sf]_blm02\.img)', 0, r'../BLEMISH/#UPPER#\1'), - (r'(\w\w\w[sf]_cal0[1-5]\.dat)', 0, r'../SLOPE/#UPPER#\1'), - (r'([123][sf]\w+_dc0[1-5]\.dat)', 0, r'../DARK/#UPPER#\1'), - (r'calibration_so02.img', 0, r'../SHUTTER/CALIBRATION_SO02.IMG')])), - (r'.*/GO_0xxx.*/GO_000[2-6]/CATALOG/DATASET\.CAT', 0, - translator.TranslatorByDict( - {'V_E1DS.CAT' : ''})), - (r'.*/GO_0xxx.*/GO_0001/DOCUMENT/PDSLABEL\.TXT', 0, - translator.TranslatorByDict( - {'RLINEPRX.FMT' : '../../GO_0002/LABEL/RLINEPRX.FMT', - 'RTLMTAB.FMT' : '../../GO_0002/LABEL/RTLMTAB.FMT'})), - (r'.*/GO_0xxx_v1/GO_0001/INDEX/CUMINDEX\.LBL', 0, - translator.TranslatorByDict( - {'IMGINDEX.TAB' : 'CUMINDEX.TAB'})), - (r'.*/GO_0xxx_v1/GO_0001/INDEX/P1CUMINDEX\.LBL', 0, - translator.TranslatorByDict( - {'IMGINDEX.TAB' : 'P1CUMINDEX.TAB'})), - - # HST - (r'.*/HSTJ.*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'NST.CAT' : 'CATALOG/INST.CAT'})), - (r'.*/HSTJ.*/CATINFO\.TXT', 0, - translator.TranslatorByDict( - {'NST.CAT' : 'INST.CAT'})), - (r'.*/HSTJ.*_v.*/HSTJ1_0427/DATA/VISIT_02/.*\.LBL', 0, - translator.TranslatorByDict( - {'J96O02JLQ_FLT_WFC1.JPG': '', - 'J96O02JMQ_FLT_WFC1.JPG': '', - 'J96O02JLQ_FLT_WFC2.JPG': 'J96O02JLQ_FLT.JPG', - 'J96O02JMQ_FLT_WFC2.JPG': 'J96O02JMQ_FLT.JPG', - 'J96O02JOQ_FLT_WFC2.JPG': 'J96O02JOQ_FLT.JPG', - 'J96O02JQQ_FLT_WFC2.JPG': 'J96O02JQQ_FLT.JPG', - 'J96O02JSQ_FLT_WFC2.JPG': 'J96O02JSQ_FLT.JPG'})), - (r'.*/HSTJx_xxxx.*_v.*/HSTJ1_2395/DATA/.*\.LBL', 0, - translator.TranslatorByDict( - {'JBNY02SOQ_FLT_WFC1.JPG': '', - 'JBNY02SOQ_FLT_WFC2.JPG': 'JBNY02SOQ_FLT.JPG', - 'JBNY02SQQ_FLT_WFC2.JPG': 'JBNY02SQQ_FLT.JPG', - 'JBNY02SSQ_FLT_WFC2.JPG': 'JBNY02SSQ_FLT.JPG', - 'JBNYA1T2Q_FLT_WFC2.JPG': 'JBNYA1T2Q_FLT.JPG', - 'JBNYA2SUQ_FLT_WFC2.JPG': 'JBNYA2SUQ_FLT.JPG'})), - - # JNOJIR - (r'.*/JNOJIR.*/AAREADME.TXT', 0, - translator.TranslatorByDict( - {'PERSON.CAT' : 'JNO_JIRAM_PERSON.CAT', - 'DATAINFO.TXT' : ''})), - (r'.*/JNOJIR.*/JIR_IMG_\w+_RESPONSIVITY_V03.LBL', 0, - translator.TranslatorByRegex( - [(r'(JIR_IMG_\w+_RESPONSIVITY)_V02\.DAT', 0, r'\1_V03.DAT')])), - (r'.*/JNOJIR_20(2[789]|3\d)/DATA/JIR_\w+.LBL', 0, - translator.TranslatorByRegex( - [(r'(JIR_IMG_\w+_RESPONSIVITY)_V02\.DAT', 0, r'../CALIB/\1_V03.DAT')])), - # Embedded list comprehension - # Each links a SOURCE_PRODUCT_ID on JNOJIR_2nnn to the associated EDR in - # the parallel directory on JNOJIR_1nnn. Set up through volume _2049. 
- ] + [ - (fr'.*/JNOJIR_xxxx/JNOJIR_20{nn:02d}/DATA/JIR_\w+.LBL', 0, - translator.TranslatorByRegex( - [(r'(JIR_\w+_EDR_20\w+)\.(DAT|IMG)', 0, - fr'../../JNOJIR_10{nn:02d}/DATA/\1.\2')])) - for nn in range(0,50)] + [ - - # JNOJNC - (r'.*/JNOJNC.*/(AAREADME|CATINFO).TXT', 0, - translator.TranslatorByDict( - {'JUNO_REF.CAT' : 'JUNO_PROJREF.CAT'})), - - # NHSP (and *SP_xxxx) - (r'.*/NHSP_xxxx_v1.*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'personel.cat' : 'CATALOG/PERSONNEL.CAT', - 'spiceds.cat' : 'CATALOG/SPICE_INST.CAT'})), - (r'.*SP_xxxx.*/aareadme\.txt', 0, - translator.TranslatorByDict( - {'dataset.cat' : 'catalog/spiceds.cat', - 'ckinfo.txt' : 'data/ck/ckinfo.txt', - 'ekinfo.txt' : 'data/ek/ekinfo.txt', - 'fkinfo.txt' : 'data/fk/fkinfo.txt', - 'ikinfo.txt' : 'data/ik/ikinfo.txt', - 'lskinfo.txt' : 'data/lsk/lskinfo.txt', - 'pckinfo.txt' : 'data/pck/pckinfo.txt', - 'sclkinfo.txt' : 'data/sclk/sclkinfo.txt', - 'spkinfo.txt' : 'data/spk/spkinfo.txt', - 'ckdoc.txt' : 'document/ck/ckdoc.txt', - 'ekdoc.txt' : 'document/ek/ekdoc.txt', - 'mkinfo.txt' : 'extras/mk/mkinfo.txt', - 'orbinfo.txt' : 'extras/orbnum/orbinfo.txt', - 'spkxinfo.txt' : 'extras/spkxtra/spkxinfo.txt', - 'covinfo.txt' : 'extras/spkxtra/covtab/covinfo.txt', - 'ckxtinfo.txt' : 'extras/ckxtra/ckxtinfo.txt', - 'navinfo.txt' : 'extras/ckxtra/cknav/navinfo.txt', - 'issinfo.txt' : 'extras/ckxtra/ckiss/issinfo.txt'})), - - # NHxxMV/NHxxLO - (r'.*/NHxx.._xxxx_v1/NH(JU|LA).*/aareadme\.txt', 0, - translator.TranslatorByDict( - {'PAYLOAD_SSR.LBL' : 'document/payload_ssr/payload_ssr.lbl', - 'RALPH_SSR.LBL' : 'document/ralph_ssr/ralph_ssr.lbl', - 'SOC_INST_ICD.LBL' : 'document/soc_inst_icd/soc_inst_icd.lbl'})), - (r'.*/NHxx.._xxxx_v1/NH(JU|LA).*/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'PAYLOAD_SSR.LBL' : 'DOCUMENT/PAYLOAD_SSR/PAYLOAD_SSR.LBL', - 'RALPH_SSR.LBL' : 'DOCUMENT/RALPH_SSR/RALPH_SSR.LBL', - 'SOC_INST_ICD.LBL' : 'DOCUMENT/SOC_INST_ICD/SOC_INST_ICD.LBL'})), - (r'.*/NHxxLO_xxxx.*/NH..LO_2001/data/\w+/.*\.lbl', 0, - translator.TranslatorByRegex( - [(r'cflat_grnd_SFA_(\w+\.fit)', 0, r'../../calib/cflat_grnd_sfa_\1'), - (r'(cflat|dead|delta|dsmear|hot|sap)_(\w+\.fit)', 0, r'../../calib/\1_\2')])), - (r'.*/NHxxMV_xxxx.*/NH..MV_2001/data/\w+/.*\.lbl', 0, - translator.TranslatorByRegex( - [(r'(mc[0-3])_(flat_\w+\.fit)s', 0, r'../../calib/mcl/\1_\2'), - (r'(mp[12])_(flat_\w+\.fit)s', 0, r'../../calib/mp/\1_\2'), - (r'(mfr_flat_\w+\.fit)s', 0, r'../../calib/mfr/\1')])), - - # RPX - (r'.*/RPX_0101.*/R_HARRIS\.LBL', 0, - translator.TranslatorByDict( - {'R_HARRIS.DF' : 'R_HARRIS.PDF'})), - (r'.*/RPX_0101.*/F161225AB\.LBL', 0, - translator.TranslatorByDict( - {'F161225RB.GIF' : 'F161225AB.GIF'})), - (r'.*/RPX_0201.*/T0808_F1498_CAL\.LBL', 0, - translator.TranslatorByDict( - {'T0808_F1497_CAL.IMG' : 'T0808_F1498_CAL.IMG'})), - (r'.*/RPX_0401/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'INSTHOST.CAT' : 'CATALOG/HOST.CAT'})), - - # Any VG - (r'.*/VG.*/CATALOG/CATINFO\.TXT', 0, - translator.TranslatorByDict( - {'VGnNINST.CAT' : 'VG1NINST.CAT', - 'VGnHOST.CAT' : 'VG1HOST.CAT'})), - - # VG_20xx (IRIS) - (r'.*/VG_2001/.*/VG2_SAT\.LBL', 0, - translator.TranslatorByDict( - {'IRIS_ROWFMT.FMT' : '../JUPITER/IRISHEDR.FMT'})), - (r'.*/VG_2001/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'IRISHEDR.FMT' : 'JUPITER/IRISHEDR.FMT', - 'IRISTRGP.FMT' : 'JUPITER/CALIB/IRISTRGP.FMT'})), - - # VG_28xx (ring profiles) - (r'.*/VG_28[0-9]{2}/.*INFO\.TXT', 0, - translator.TranslatorByDict( - {'RS1SINST.CAT' : 'VG1SINST.CAT', - 
'RS2UINST.CAT' : 'VG2UINST.CAT'})), - (r'.*/VG_28xx/VG_2801/CALIB/PS2C01\.LBL', 0, - translator.TranslatorByDict( - {'PS1C01.TAB' : 'PS2C01.TAB'})), - (r'.*/VG_28xx/VG_2801/JITTER/PS1J01\.LBL', 0, - translator.TranslatorByDict( - {'PS1J02.TAB' : 'PS1J01.TAB'})), - (r'.*/VG_28xx/VG_2801/JITTER/PU2J02\.LBL', 0, - translator.TranslatorByDict( - {'PU2J01.TAB' : 'PU2J02.TAB'})), - (r'.*/VG_280./.*/L3GUIDE\.TXT', 0, - translator.TranslatorByDict( - {'RTLMTAB.FMT' : ''})), - (r'.*/VG_2802/EDITDATA/DATAINFO\.TXT', 0, - translator.TranslatorByDict( - {'INST.CAT' : '../CATALOG/VG1INST.CAT'})), - (r'.*/VG_2802/EDITDATA/US3D01P\.LBL', 0, - translator.TranslatorByDict( - {'US3D01I.DAT' : 'US3D01P.DAT'})), - (r'.*/VG_2802/SORCDATA/DATAINFO\.TXT', 0, - translator.TranslatorByDict( - {'BETAPER.VOY' : 'BETPER.VOY', - 'BETAPER.LBL' : 'BETPER.LBL'})), - (r'.*/VG_2803.*/RS.R1BFV\.LBL', 0, - translator.TranslatorByDict( - {'RS_R1BFT.FMT' : 'RS_R1BFV.FMT'})), - - # VGn_9xxx (RSS) - (r'.*/VG[12]_9.*/CHECKSUMS.TXT', 0, # any file referenced in CHECKSUMS.TXT - # already has a full path; don't search - translator.TranslatorByRegex([(r'(.*)', 0, r'\1')])), - (r'.*/VG[12]_9.*/ERRATA.TXT', 0, - translator.TranslatorByDict( - {'_PERSON.CAT' : 'CATALOG/VG_RSS_PERSON.CAT'})), - (r'.*/VG1_9050/CATALOG/CATINFO.TXT', 0, - translator.TranslatorByDict( - {'MISSION.CAT' : 'VG_MISSION.CAT', - 'INST_HOST.CAT' : 'VG1_INST_HOST.CAT', - 'INST.CAT' : 'VG1_RSS_INST.CAT', - 'DS.CAT' : 'VG1_SAT_RSS_DS.CAT', - 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', - 'REF.CAT' : 'VG1_S_RSS_REF.CAT', - 'TARGET.CAT' : 'VG_SAT_TARGET.CAT', - 'VG1_SAT_TARGET.CAT' : 'VG_SAT_TARGET.CAT'})), - (r'.*/VG1_9056/CATALOG/CATINFO.TXT', 0, - translator.TranslatorByDict( - {'MISSION.CAT' : 'VG_MISSION.CAT', - 'INSTHOST.CAT' : 'VG1_INST_HOST.CAT', - 'INST.CAT' : 'VG1_RSS_INST.CAT', - 'DS.CAT' : 'VG1_SSA_RSS_DS.CAT', - 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', - 'REF.CAT' : 'VG1_SSA_RSS_REF.CAT', - 'TARGET.CAT' : 'VG_TITAN_TARGET.CAT'})), - (r'.*/VG2_9065/CATALOG/CATINFO.TXT', 0, - translator.TranslatorByDict( - {'MISSION.CAT' : 'VG_MISSION.CAT', - 'INSTHOST.CAT' : 'VG2_INST_HOST.CAT', - 'INST.CAT' : 'VG2_RSS_INST.CAT', - 'DS.CAT' : 'VG2_S_RSS_DS.CAT', - 'PERSON.CAT' : 'VG_RSS_PERSON.CAT', - 'REF.CAT' : 'VG2_S_RSS_REF.CAT', - 'TARGET.CAT' : 'VG_SAT_TARGET.CAT'})), - - # VGIRIS - (r'.*/VGIRIS_0001/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'JUPITER_ASCII.FMT' : 'DATA/JUPITER_VG1/JUPITER_ASCII.FMT', - 'JUPITER_LSB.FMT' : 'DATA/JUPITER_VG1/JUPITER_LSB.FMT', - 'JUPITER_MSB.FMT' : 'DATA/JUPITER_VG1/JUPITER_MSB.FMT', - 'SATURN_ASCII.FMT' : '', - 'SATURN_LSB.FMT' : '', - 'SATURN_MSB.FMT' : '', - 'VGnINST.CAT' : 'CATALOG/VG1INST.CAT', - 'VGnHOST.CAT' : 'CATALOG/VG1HOST.CAT'})), - (r'.*/VGIRIS_0001/DATA/DATAINFO\.TXT', 0, - translator.TranslatorByDict( - {'JUPITER_ASCII.FMT' : 'JUPITER_VG1/JUPITER_ASCII.FMT', - 'JUPITER_LSB.FMT' : 'JUPITER_VG1/JUPITER_LSB.FMT', - 'JUPITER_MSB.FMT' : 'JUPITER_VG1/JUPITER_MSB.FMT', - 'SATURN_ASCII.FMT' : '', - 'SATURN_LSB.FMT' : '', - 'SATURN_MSB.FMT' : '', - 'VGnINST.CAT' : '../CATALOG/VG1INST.CAT', - 'VGnHOST.CAT' : '../CATALOG/VG1HOST.CAT'})), - (r'.*/VGIRIS_0002/AAREADME\.TXT', 0, - translator.TranslatorByDict( - {'JUPITER_ASCII.FMT' : '', - 'JUPITER_LSB.FMT' : '', - 'JUPITER_MSB.FMT' : '', - 'SATURN_ASCII.FMT' : 'DATA/SATURN_VG1/SATURN_ASCII.FMT', - 'SATURN_LSB.FMT' : 'DATA/SATURN_VG1/SATURN_LSB.FMT', - 'SATURN_MSB.FMT' : 'DATA/SATURN_VG1/SATURN_MSB.FMT', - 'VGnINST.CAT' : 'CATALOG/VG1INST.CAT', - 'VGnHOST.CAT' : 'CATALOG/VG1HOST.CAT'})), - 
(r'.*/VGIRIS_0002/DATA/DATAINFO\.TXT', 0, - translator.TranslatorByDict( - {'JUPITER_ASCII.FMT' : '', - 'JUPITER_LSB.FMT' : '', - 'JUPITER_MSB.FMT' : '', - 'SATURN_ASCII.FMT' : 'SATURN_VG1/SATURN_ASCII.FMT', - 'SATURN_LSB.FMT' : 'SATURN_VG1/SATURN_LSB.FMT', - 'SATURN_MSB.FMT' : 'SATURN_VG1/SATURN_MSB.FMT', - 'VGnINST.CAT' : '../CATALOG/VG1INST.CAT', - 'VGnHOST.CAT' : '../CATALOG/VG1HOST.CAT'})), - - # VGISS - (r'.*/VGISS.*/BROWSE/C34801XX/C3480139_.*\.LBL', 0, - translator.TranslatorByDict( - {'C3480140_CALIB.JPG' : 'C3480139_CALIB.JPG', - 'C3480140_CLEANED.JPG' : 'C3480139_CLEANED.JPG', - 'C3480140_GEOMED.JPG' : 'C3480139_GEOMED.JPG', - 'C3480140_RAW.JPG' : 'C3480139_RAW.JPG'})), - (r'.*/VGISS.*/BROWSE/C43892XX/C4389208_.*\.LBL', 0, - translator.TranslatorByDict( - {'C4389209_CALIB.JPG' : 'C4389208_CALIB.JPG', - 'C4389209_CLEANED.JPG' : 'C4389208_CLEANED.JPG', - 'C4389209_GEOMED.JPG' : 'C4389208_GEOMED.JPG', - 'C4389209_RAW.JPG' : 'C4389208_RAW.JPG'})), -]) - -KNOWN_MISSING_LABELS = translator.TranslatorByRegex([ - (r'.*/document/.*', re.I, 'missing'), - (r'.*/COCIRS_.*\.VAR', 0, 'missing'), - (r'.*/COCIRS_.*VANILLA.*', re.I, 'missing'), - (r'.*/COCIRS_0209/DATA/NAV_DATA/RIN02101300.DAT', 0, 'missing'), - (r'.*/COCIRS_0602/DATA/UNCALIBR/FIFM06021412.DAT', 0, 'missing'), - (r'.*/COISS_00.*/document/report/.*', 0, 'missing'), - (r'.*/COISS_0011/calib.*\.tab', 0, 'missing'), - (r'.*/COISS_0011/calib/calib.tar.gz', 0, 'missing'), - (r'.*/COISS_0011/extras/.*\.pro', 0, 'missing'), - (r'.*/COISS_0011/extras/cisscal.*', 0, 'missing'), - (r'.*/CO(ISS|VIMS)_.*/extras/.*\.(tiff|png|jpg|jpeg|jpeg_small)', - 0, 'missing'), - (r'.*/COSP_xxxx.*\.(pdf|zip|tm|orb)', 0, 'missing'), - (r'.*/COUVIS_.*/SOFTWARE/.*\.(PRO|pro|DAT|IDL|JAR|SAV)',0, 'missing'), - (r'.*/COUVIS_.*/CALIB/.*\.DOC', 0, 'missing'), - (r'.*/COUVIS_0xxx.*/SOFTWARE/CALIB/VERSION_4/t.t', 0, 'missing'), - (r'.*/COVIMS_0xxx.*/index/index.csv', 0, 'missing'), - (r'.*/COVIMS_0xxx.*/software/.*', 0, 'missing'), - (r'.*/COVIMS_0xxx.*/calib/example.*', 0, 'missing'), - (r'.*/COVIMS_0xxx.*/calib/.*\.(tab|qub|cub|bin|lbl)', 0, 'missing'), - (r'.*/COVIMS_0xxx.*/browse/.*\.pdf', 0, 'missing'), - (r'.*/COVIMS_0xxx.*\.(lbl|qub)-old_V[0-9]+', 0, 'missing'), - (r'.*/GO_0xxx_v1/GO_0001/CATALOG/REF.CAT.BAK', 0, 'missing'), - (r'.*/GO_0xxx.*/GO_0001/SOFTWARE/GALSOS2.EXE', 0, 'missing'), - (r'.*/GO_0xxx_v1/GO_0016/AAREADME.SL9', 0, 'missing'), - (r'.*/JNOJNC_0xxx.*/EXTRAS/.*\.PNG', 0, 'missing'), - (r'.*/NH.*/browse/.*\.jpg', 0, 'missing'), - (r'.*/NH.*/index/newline', 0, 'missing'), - (r'.*/NHxxMV.*/calib/.*\.png', 0, 'missing'), - (r'.*/NHSP_xxxx.*/DATASET.HTML', 0, 'missing'), - (r'.*/RPX.*/UNZIP532.*', 0, 'missing'), - (r'.*/RPX_xxxx/RPX_0201/CALIB/.*/(-180|128)', 0, 'missing'), - (r'.*/VG.*/VG..NESR\.DAT', 0, 'missing'), - (r'.*/VG_0xxx.*/CUMINDEX.TAB', 0, 'missing'), - (r'.*/VG_0xxx.*/SOFTWARE/.*', 0, 'missing'), - (r'.*/VG._9xxx.*/SOFTWARE/.*', 0, 'missing'), - (r'.*/VG2_9065/BROWSE/C0SR01AA.LOG', 0, 'missing'), - -# These files have internal PDS3 labels, so these are not errors - (r'.*/COISS_3xxx.*\.IMG', 0, 'unneeded'), - (r'.*/COUVIS_.*/SOFTWARE/.*\.txt_.*', 0, 'unneeded'), - (r'.*/VG_.*\.(IMQ|IRQ|IBG)', 0, 'unneeded'), - (r'.*/VG_0xxx.*/(AAREADME.VMS|VTOC.SYS|IMGINDEX.DBF)', 0, 'unneeded'), -]) +REPAIRS = translator.TranslatorByRegex([]) + +KNOWN_MISSING_LABELS = translator.TranslatorByRegex([]) # Match pattern for any file name, but possibly things that are not file names PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?' 
@@ -699,8 +120,8 @@ def generate_links(dirpath, old_links={}, for (root, dirs, files) in os.walk(dirpath): # skip ring_models dirctory - if 'ring_models' in root: - continue + # if 'ring_models' in root: + # continue local_basenames = [] # Tracks the basenames in this directory local_basenames_uc = [] # Same as above, but upper case @@ -1653,7 +1074,8 @@ def main(): pdsdir = pdsfile.Pds4File.from_abspath(path) # skip volset-level readme files and *_support dirctiory - if not pdsdir.isdir or '_support' in pdsdir.abspath: + # if not pdsdir.isdir or '_support' in pdsdir.abspath: + if not pdsdir.isdir: continue # Save logs in up to two places diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py index a6a84a4..039f163 100644 --- a/pdsfile/pdsfile.py +++ b/pdsfile/pdsfile.py @@ -2624,13 +2624,9 @@ def label_basename(self): rootname = self.basename[:-len(self.extension)] test_basenames = [rootname + ext for ext in ext_guesses] - print('xxxxxxxx') - print(test_basenames) # If one of the guessed files exist, it's the label for test_basename in test_basenames: test_abspath = self.abspath.rpartition('/')[0] + '/' + test_basename - print('1111111111') - print(test_abspath) if cls.os_path_exists(test_abspath, force_case_sensitive=True): self._label_basename_filled = test_basename self._recache() From 10b67435a95df2d0806bc65afb131eb24036d2e5 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Fri, 11 Oct 2024 03:19:16 +0800 Subject: [PATCH 10/21] Add an intelligence to link a file to its correspsonding label if the file is in that label's file_name tags. (line 323-336, pds4linkshelf.py) --- holdings_maintenance/pds4/pds4linkshelf.py | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index 37f06fc..12a614f 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -49,7 +49,7 @@ LINK_REGEX = re.compile(r'(?:|.*?[^/@\w\.])/?(?:\.\./)*(([A-Z0-9][-\w]+/)*' + r'(makefile\.?|[A-Z0-9][\w-]*(\.[\w-]+)+))', re.I) -EXTS_WO_LABELS = set(['.XML', '.CAT', '.TXT', '.FMT', '.SFD']) +EXTS_WO_LABELS = set(['.XML', '.CAT', '.FMT', '.SFD']) ################################################################################ @@ -85,7 +85,7 @@ def generate_links(dirpath, old_links={}, """Generate a dictionary keyed by the absolute file path for files in the given directory tree, which must correspond to a volume. - Keys ending in .LBL, .CAT and .TXT return a list of tuples + Keys ending in .XML, .CAT and .TXT return a list of tuples (recno, link, target) for each link found. Here, recno = record number in file; @@ -95,7 +95,7 @@ def generate_links(dirpath, old_links={}, Other keys return a single string, which indicates the absolute path to the label file describing this file. - Unlabeled files not ending in .LBL, .CAT or .TXT return an empty string. + Unlabeled files not ending in .XML, .CAT or .TXT return an empty string. Also return the latest modification date among all the files checked. """ @@ -158,7 +158,7 @@ def generate_links(dirpath, old_links={}, basename_uc = basename.upper() - # Only check LBL, CAT, TXT, etc. + # Only check XML, CAT, TXT, etc. 
ext = basename_uc[-4:] if len(basename) >= 4 else '' if ext not in EXTS_WO_LABELS: continue @@ -288,7 +288,7 @@ def generate_links(dirpath, old_links={}, if ext != '.XML': # nope continue - # If names match up to '.LBL', then yes + # If names match up to '.XML', then yes if (len(linkname_uc) > ltest and linkname_uc[:ltest] == baseroot_uc and linkname_uc[ltest] == '.'): @@ -319,6 +319,22 @@ def generate_links(dirpath, old_links={}, continue abspath = os.path.join(root, basename) + + # linkinfo_dict: a dictionary with the abspath of a label file as the key and + # a list of its corresponding files (LinkInfo objects) under file_name tags as + # the value. + # label_dict: a dictionary with the abspath of a file as the key and the + # abspath of its corresponding label as the value. + # At the current directory, if a file basename is in the list of a label's + # (in same directory) file_name tags in linkinfo_dict, create an entry of + # that file basename in label_dict. This will make sure the file is pointing + # to it's correct corresponding label. + for label_abspath, link_info_list in linkinfo_dict.items(): + for info in link_info_list: + if info.linktext == basename and abspath not in label_dict: + label_dict[abspath] = label_abspath + break + if abspath in label_dict: continue # label already found @@ -385,6 +401,7 @@ def generate_links(dirpath, old_links={}, # This occurs when a .TXT or .CAT file has a label, even though it didn't # need one. In the returned dictionary, link lists take priority. link_dict = {} + for key in abspaths: if key in linkinfo_dict: # If this is a new entry, it's a list of LinkInfo objects @@ -458,6 +475,12 @@ def read_links(abspath, logger=None): if not matchobj: break + # if 'u0_kao_91cm_734nm_ring_beta_ingress_sqw' in abspath: + # print('readdddd') + # print(rec) + # print(matchobj.group(1)) + + linktext = matchobj.group(1) links.append(LinkInfo(recno, linktext, is_target)) @@ -994,7 +1017,7 @@ def main(): 'their links to the link shelf file. Links of ' + 'pre-existing files are not checked.') - parser.add_argument('--bundle', nargs='+', type=str, + parser.add_argument('bundle', nargs='+', type=str, help='The path to the root directory of a bundle.') parser.add_argument('--log', '-l', type=str, default='', From c3f1572bfbb3100f83620393c3ec5cc5206949d4 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Fri, 11 Oct 2024 03:29:10 +0800 Subject: [PATCH 11/21] Fixed the code indentation from line 112-428 in pds4linkshelf.py --- holdings_maintenance/pds4/pds4linkshelf.py | 537 ++++++++++----------- 1 file changed, 268 insertions(+), 269 deletions(-) diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index 12a614f..901ce3c 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -109,324 +109,323 @@ def generate_links(dirpath, old_links={}, try: - linkinfo_dict = old_links.copy() # abspath: list of LinkInfo objects - label_dict = {k:v for k,v in old_links.items() if isinstance(v,str)} - # abspath: label for this file - abspaths = [] # list of all abspaths + linkinfo_dict = old_links.copy() # abspath: list of LinkInfo objects + label_dict = {k:v for k,v in old_links.items() if isinstance(v,str)} + # abspath: label for this file + abspaths = [] # list of all abspaths - latest_mtime = 0. + latest_mtime = 0. - # Walk the directory tree, one subdirectory "root" at a time... 
- for (root, dirs, files) in os.walk(dirpath): + # Walk the directory tree, one subdirectory "root" at a time... + for (root, dirs, files) in os.walk(dirpath): - # skip ring_models dirctory - # if 'ring_models' in root: - # continue + # skip ring_models dirctory + # if 'ring_models' in root: + # continue - local_basenames = [] # Tracks the basenames in this directory - local_basenames_uc = [] # Same as above, but upper case - for basename in files: - abspath = os.path.join(root, basename) - latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + local_basenames = [] # Tracks the basenames in this directory + local_basenames_uc = [] # Same as above, but upper case + for basename in files: + abspath = os.path.join(root, basename) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) - if basename == '.DS_Store': # skip .DS_Store files - logger.ds_store('.DS_Store file skipped', abspath) - continue - - if basename.startswith('._'): # skip dot_underscore files - logger.dot_underscore('dot_underscore file skipped', - abspath) - continue + if basename == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store file skipped', abspath) + continue - if basename.startswith('.'): # skip invisible files - logger.invisible('Invisible file skipped', abspath) - continue + if basename.startswith('._'): # skip dot_underscore files + logger.dot_underscore('dot_underscore file skipped', + abspath) + continue - abspaths.append(abspath) - local_basenames.append(basename) - local_basenames_uc.append(basename.upper()) + if basename.startswith('.'): # skip invisible files + logger.invisible('Invisible file skipped', abspath) + continue - # Update linkinfo_dict, searching each relevant file for possible links. - # If the linking file is a label and the target file has a matching - # name, update the label_dict entry for the target. - candidate_labels = {} # {target: list of possible label basenames} - for basename in local_basenames: + abspaths.append(abspath) + local_basenames.append(basename) + local_basenames_uc.append(basename.upper()) - abspath = os.path.join(root, basename) - if abspath in linkinfo_dict: # for update op, skip existing links - continue + # Update linkinfo_dict, searching each relevant file for possible links. + # If the linking file is a label and the target file has a matching + # name, update the label_dict entry for the target. + candidate_labels = {} # {target: list of possible label basenames} + for basename in local_basenames: - basename_uc = basename.upper() - - # Only check XML, CAT, TXT, etc. - ext = basename_uc[-4:] if len(basename) >= 4 else '' - if ext not in EXTS_WO_LABELS: - continue + abspath = os.path.join(root, basename) + if abspath in linkinfo_dict: # for update op, skip existing links + continue - # Get list of link info for all possible linked filenames - logger.debug('*** REVIEWING', abspath) - linkinfo_list = read_links(abspath, logger=logger) + basename_uc = basename.upper() - # Apply repairs - repairs = REPAIRS.all(abspath) - for info in linkinfo_list: - for repair in repairs: - linkname = repair.first(info.linktext) - if linkname is None: + # Only check XML, CAT, TXT, etc. 
+ ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext not in EXTS_WO_LABELS: + continue - # Attempt repair with leading directory path removed - if '/' in info.linktext: - info.remove_path() - linkname = repair.first(info.linktext) + # Get list of link info for all possible linked filenames + logger.debug('*** REVIEWING', abspath) + linkinfo_list = read_links(abspath, logger=logger) + # Apply repairs + repairs = REPAIRS.all(abspath) + for info in linkinfo_list: + for repair in repairs: + linkname = repair.first(info.linktext) if linkname is None: - continue # no repair found - - info.linkname = linkname - if linkname == '': - logger.info('Ignoring link "%s"' % - info.linktext, abspath, force=True) - else: - logger.info('Repairing link "%s"->"%s"' % - (info.linktext, linkname), - abspath, force=True) - # Validate non-local targets of repairs - if '/' in linkname: - target = os.path.join(root, linkname) - if os.path.exists(target): - info.target = os.path.abspath(target) - else: - logger.error('Target of repaired link is missing', - target) - - break # apply only one repair per found link - - # Validate or remove other targets - new_linkinfo_list = [] - baseroot_uc = basename_uc.partition('.')[0] - ltest = len(baseroot_uc) - for info in linkinfo_list: - if info.target: # Non-local, repaired links have targets - new_linkinfo_list.append(info) - continue + # Attempt repair with leading directory path removed + if '/' in info.linktext: + info.remove_path() + linkname = repair.first(info.linktext) - # A blank linkname is from a repair; indicates to ignore - if info.linkname == '': - continue + if linkname is None: + continue # no repair found - # Ignore self-references - linkname_uc = info.linkname.upper() - if linkname_uc == basename_uc: - continue + info.linkname = linkname + if linkname == '': + logger.info('Ignoring link "%s"' % + info.linktext, abspath, force=True) + else: + logger.info('Repairing link "%s"->"%s"' % + (info.linktext, linkname), + abspath, force=True) + + # Validate non-local targets of repairs + if '/' in linkname: + target = os.path.join(root, linkname) + if os.path.exists(target): + info.target = os.path.abspath(target) + else: + logger.error('Target of repaired link is missing', target) + + break # apply only one repair per found link + + # Validate or remove other targets + new_linkinfo_list = [] + baseroot_uc = basename_uc.partition('.')[0] + ltest = len(baseroot_uc) + for info in linkinfo_list: + if info.target: # Non-local, repaired links have targets + new_linkinfo_list.append(info) + continue - # Check for target inside this directory - try: - match_index = local_basenames_uc.index(linkname_uc) - except ValueError: - match_index = None + # A blank linkname is from a repair; indicates to ignore + if info.linkname == '': + continue - # If not found, maybe it is a non-local reference (.FMT perhaps) - if match_index is None: + # Ignore self-references + linkname_uc = info.linkname.upper() + if linkname_uc == basename_uc: + continue - # It's easy to pick up floats as link candidates; ignore + # Check for target inside this directory try: - _ = float(info.linkname) - continue # Yup, it's just a float + match_index = local_basenames_uc.index(linkname_uc) except ValueError: - pass - - if info.linkname[-1] in ('e', 'E'): - try: - _ = float(info.linkname[:-1]) - continue # Float with exponent - except ValueError: - pass - - # Also ignore format specifications (e.g., "F10.3") - if info.linkname[0] in ('F', 'E', 'G'): - try: - _ = float(info.linkname[1:]) - continue 
# Format - except ValueError: - pass - - # Search non-locally - if '/' in info.linkname: - nonlocal_target = locate_link_with_path(abspath, + match_index = None + + # If not found, maybe it is a non-local reference (.FMT perhaps) + if match_index is None: + + # It's easy to pick up floats as link candidates; ignore + try: + _ = float(info.linkname) + continue # Yup, it's just a float + except ValueError: + pass + + if info.linkname[-1] in ('e', 'E'): + try: + _ = float(info.linkname[:-1]) + continue # Float with exponent + except ValueError: + pass + + # Also ignore format specifications (e.g., "F10.3") + if info.linkname[0] in ('F', 'E', 'G'): + try: + _ = float(info.linkname[1:]) + continue # Format + except ValueError: + pass + + # Search non-locally + if '/' in info.linkname: + nonlocal_target = locate_link_with_path(abspath, + info.linkname) + else: + nonlocal_target = locate_nonlocal_link(abspath, info.linkname) - else: - nonlocal_target = locate_nonlocal_link(abspath, - info.linkname) - - # Report the outcome - if nonlocal_target: - logger.debug('Located "%s"' % info.linkname, - nonlocal_target) - info.target = nonlocal_target - new_linkinfo_list.append(info) - continue - - if linkname_uc.endswith('.FMT'): - logger.error('Unable to locate .FMT file "%s"' % - info.linkname, abspath) - elif linkname_uc.endswith('.CAT'): - logger.error('Unable to locate .CAT file "%s"' % - info.linkname, abspath) - else: - logger.debug('Substring "%s" is not a link, ignored' % - info.linkname, abspath) - continue + # Report the outcome + if nonlocal_target: + logger.debug('Located "%s"' % info.linkname, + nonlocal_target) + info.target = nonlocal_target + new_linkinfo_list.append(info) + continue + + if linkname_uc.endswith('.FMT'): + logger.error('Unable to locate .FMT file "%s"' % + info.linkname, abspath) + elif linkname_uc.endswith('.CAT'): + logger.error('Unable to locate .CAT file "%s"' % + info.linkname, abspath) + else: + logger.debug('Substring "%s" is not a link, ignored' % + info.linkname, abspath) - # Save the match - info.linkname = local_basenames[match_index] # update case - info.target = os.path.join(root, info.linkname) - new_linkinfo_list.append(info) + continue - # Could this be the label? - if ext != '.XML': # nope - continue + # Save the match + info.linkname = local_basenames[match_index] # update case + info.target = os.path.join(root, info.linkname) + new_linkinfo_list.append(info) - # If names match up to '.XML', then yes - if (len(linkname_uc) > ltest and - linkname_uc[:ltest] == baseroot_uc and - linkname_uc[ltest] == '.'): - label_dict[info.target] = abspath - logger.debug('Label identified for %s' % info.linkname, - abspath) + # Could this be the label? 
+ if ext != '.XML': # nope continue - # Otherwise, then maybe - if info.is_target: - if info.linkname in candidate_labels: - if basename not in candidate_labels[info.linkname]: - candidate_labels[info.linkname].append(basename) - else: - candidate_labels[info.linkname] = [basename] + # If names match up to '.XML', then yes + if (len(linkname_uc) > ltest and + linkname_uc[:ltest] == baseroot_uc and + linkname_uc[ltest] == '.'): + label_dict[info.target] = abspath + logger.debug('Label identified for %s' % info.linkname, + abspath) + continue + + # Otherwise, then maybe + if info.is_target: + if info.linkname in candidate_labels: + if basename not in candidate_labels[info.linkname]: + candidate_labels[info.linkname].append(basename) + else: + candidate_labels[info.linkname] = [basename] - logger.debug('Candidate label found for ' + - info.linkname, abspath) + logger.debug('Candidate label found for ' + + info.linkname, abspath) - linkinfo_dict[abspath] = new_linkinfo_list + linkinfo_dict[abspath] = new_linkinfo_list - # Identify labels for files - for basename in local_basenames: + # Identify labels for files + for basename in local_basenames: - basename_uc = basename.upper() - ext = basename_uc[-4:] if len(basename) >= 4 else '' - if ext in (".XML", ".FMT"): # these can't have labels - continue + basename_uc = basename.upper() + ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext in (".XML", ".FMT"): # these can't have labels + continue - abspath = os.path.join(root, basename) - - # linkinfo_dict: a dictionary with the abspath of a label file as the key and - # a list of its corresponding files (LinkInfo objects) under file_name tags as - # the value. - # label_dict: a dictionary with the abspath of a file as the key and the - # abspath of its corresponding label as the value. - # At the current directory, if a file basename is in the list of a label's - # (in same directory) file_name tags in linkinfo_dict, create an entry of - # that file basename in label_dict. This will make sure the file is pointing - # to it's correct corresponding label. - for label_abspath, link_info_list in linkinfo_dict.items(): - for info in link_info_list: - if info.linktext == basename and abspath not in label_dict: - label_dict[abspath] = label_abspath - break - - if abspath in label_dict: - continue # label already found - - # Maybe we already know the label is missing - test = KNOWN_MISSING_LABELS.first(abspath) - if test == 'unneeded': - logger.debug('Label is not neeeded', abspath) - continue + abspath = os.path.join(root, basename) + + # linkinfo_dict: a dictionary with the abspath of a label file as the key and + # a list of its corresponding files (LinkInfo objects) under file_name tags as + # the value. + # label_dict: a dictionary with the abspath of a file as the key and the + # abspath of its corresponding label as the value. + # At the current directory, if a file basename is in the list of a label's + # (in same directory) file_name tags in linkinfo_dict, create an entry of + # that file basename in label_dict. This will make sure the file is pointing + # to it's correct corresponding label. 
+ for label_abspath, link_info_list in linkinfo_dict.items(): + for info in link_info_list: + if info.linktext == basename and abspath not in label_dict: + label_dict[abspath] = label_abspath + break + + if abspath in label_dict: + continue # label already found + + # Maybe we already know the label is missing + test = KNOWN_MISSING_LABELS.first(abspath) + if test == 'unneeded': + logger.debug('Label is not neeeded', abspath) + continue - if test == 'missing': - logger.debug('Label is known to be missing', abspath) - continue + if test == 'missing': + logger.debug('Label is known to be missing', abspath) + continue - # Determine if a label is required - label_is_required = (ext not in EXTS_WO_LABELS) + # Determine if a label is required + label_is_required = (ext not in EXTS_WO_LABELS) - # Get the list of candidate labels in this directory - candidates = candidate_labels.get(basename, []) + # Get the list of candidate labels in this directory + candidates = candidate_labels.get(basename, []) - # Determine if the obvious label file exists - label_guess_uc = basename_uc.partition('.')[0] + '.XML' - if label_guess_uc in local_basenames_uc: - k = local_basenames_uc.index(label_guess_uc) - obvious_label_basename = local_basenames[k] - else: - obvious_label_basename = '' + # Determine if the obvious label file exists + label_guess_uc = basename_uc.partition('.')[0] + '.XML' + if label_guess_uc in local_basenames_uc: + k = local_basenames_uc.index(label_guess_uc) + obvious_label_basename = local_basenames[k] + else: + obvious_label_basename = '' + + # Simplest case... + if obvious_label_basename in candidates: + if not label_is_required: + logger.debug('Unnecessary label found', abspath, force=True) - # Simplest case... - if obvious_label_basename in candidates: + label_dict[abspath] = os.path.join(root, obvious_label_basename) + continue + + # More cases... if not label_is_required: - logger.debug('Unnecessary label found', abspath, force=True) + continue # leave abspath out of label_dict - label_dict[abspath] = os.path.join(root, obvious_label_basename) - continue + # Report a phantom label + if obvious_label_basename: + logger.error('Label %s does not point to file' % + local_basenames[k], abspath) - # More cases... - if not label_is_required: - continue # leave abspath out of label_dict + if len(candidates) == 1: + logger.debug('Label found as ' + candidates[0], abspath, + force=True) + label_dict[abspath] = os.path.join(root, candidates[0]) + continue - # Report a phantom label - if obvious_label_basename: - logger.error('Label %s does not point to file' % - local_basenames[k], abspath) + # or errors... + label_dict[abspath] = "" + if len(candidates) == 0: + logger.error('Label is missing', abspath) + else: + logger.error('Ambiguous label found as %s' % candidates[0], + abspath, force=True) + for candidate in candidates[1:]: + logger.debug('Alternative label found as %s' % candidate, + abspath, force=True) - if len(candidates) == 1: - logger.debug('Label found as ' + candidates[0], abspath, - force=True) - label_dict[abspath] = os.path.join(root, candidates[0]) - continue + # Merge the dictionaries + # There are cases where a file can have both a list of links and a label. + # This occurs when a .TXT or .CAT file has a label, even though it didn't + # need one. In the returned dictionary, link lists take priority. + link_dict = {} - # or errors... 
- label_dict[abspath] = "" - if len(candidates) == 0: - logger.error('Label is missing', abspath) - else: - logger.error('Ambiguous label found as %s' % candidates[0], - abspath, force=True) - for candidate in candidates[1:]: - logger.debug('Alternative label found as %s' % candidate, - abspath, force=True) - - # Merge the dictionaries - # There are cases where a file can have both a list of links and a label. - # This occurs when a .TXT or .CAT file has a label, even though it didn't - # need one. In the returned dictionary, link lists take priority. - link_dict = {} - - for key in abspaths: - if key in linkinfo_dict: - # If this is a new entry, it's a list of LinkInfo objects - # If this was copied from old_links, it's already a list of tuples - values = linkinfo_dict[key] - if isinstance(values, list): - new_list = [] - for item in values: - if isinstance(item, LinkInfo): - new_list.append((item.recno, item.linktext, item.target)) - else: - new_list.append(item) - link_dict[key] = new_list + for key in abspaths: + if key in linkinfo_dict: + # If this is a new entry, it's a list of LinkInfo objects + # If this was copied from old_links, it's already a list of tuples + values = linkinfo_dict[key] + if isinstance(values, list): + new_list = [] + for item in values: + if isinstance(item, LinkInfo): + new_list.append((item.recno, item.linktext, item.target)) + else: + new_list.append(item) + link_dict[key] = new_list + else: + link_dict[key] = values + elif key in label_dict: + link_dict[key] = label_dict[key] else: - link_dict[key] = values - elif key in label_dict: - link_dict[key] = label_dict[key] - else: - link_dict[key] = '' + link_dict[key] = '' - dt = datetime.datetime.fromtimestamp(latest_mtime) - logger.info('Lastest holdings file modification date', - dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) - return (link_dict, latest_mtime) + return (link_dict, latest_mtime) except (Exception, KeyboardInterrupt) as e: logger.exception(e) From 0ef4578871fc86a01a1a9596aae889ef12eca301 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Fri, 11 Oct 2024 04:24:30 +0800 Subject: [PATCH 12/21] Add the intelligence to ignore files when there is no label at the same directory (line 149-154, pds4linkshelf.py) --- holdings_maintenance/pds4/pds4linkshelf.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index 901ce3c..2feb9d0 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -146,6 +146,13 @@ def generate_links(dirpath, old_links={}, local_basenames.append(basename) local_basenames_uc.append(basename.upper()) + local_labels = [f for f in local_basenames if '.xml' in f] + local_labels_abspath = [os.path.join(root, f) for f in local_labels] + + # If the current directory doesn't have any label, we skip this directory + if len(local_labels) == 0: + continue + # Update linkinfo_dict, searching each relevant file for possible links. # If the linking file is a label and the target file has a matching # name, update the label_dict entry for the target. @@ -158,7 +165,7 @@ def generate_links(dirpath, old_links={}, basename_uc = basename.upper() - # Only check XML, CAT, TXT, etc. + # Only check XML, CAT etc. 
ext = basename_uc[-4:] if len(basename) >= 4 else '' if ext not in EXTS_WO_LABELS: continue @@ -329,6 +336,11 @@ def generate_links(dirpath, old_links={}, # that file basename in label_dict. This will make sure the file is pointing # to it's correct corresponding label. for label_abspath, link_info_list in linkinfo_dict.items(): + + # if the label is at the same directory, skip it. + if label_abspath not in local_labels_abspath: + continue + for info in link_info_list: if info.linktext == basename and abspath not in label_dict: label_dict[abspath] = label_abspath From 3f3d2da16be1a91dab0d2fe1f00387bde09587b1 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Sat, 12 Oct 2024 04:26:38 +0800 Subject: [PATCH 13/21] Add the intelligence to identify files like errata.txt, or checksum files that don't exist in the label nor exist in the csv. They are not part of the archive, so they don't have labels. (line 369-378 in pds4linkshelf.py) --- holdings_maintenance/pds4/pds4checksums.py | 2 +- holdings_maintenance/pds4/pds4infoshelf.py | 2 +- holdings_maintenance/pds4/pds4linkshelf.py | 61 ++++++++++++++++------ 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py index 3d1f02f..992aaf5 100755 --- a/holdings_maintenance/pds4/pds4checksums.py +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -614,7 +614,7 @@ def main(): 'their MD5 checksums to the checksum file. ' + 'Checksums of pre-existing files are not checked.') - parser.add_argument('--bundle', nargs='+', type=str, + parser.add_argument('bundle', nargs='+', type=str, help='The path to the root directory of a volume or ' + 'volume set. For a volume set, all the volume ' + 'directories inside it are handled in sequence. ' + diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py index a5e09b4..7238ba6 100755 --- a/holdings_maintenance/pds4/pds4infoshelf.py +++ b/holdings_maintenance/pds4/pds4infoshelf.py @@ -662,7 +662,7 @@ def main(): 'than the shelf file, update the shelf file\'s ' + 'modification date.') - parser.add_argument('--bundle', nargs='+', type=str, + parser.add_argument('bundle', nargs='+', type=str, help='The path to the root of the bundle or bundle ' + 'set. For a bundle set, all the bundle ' + 'directories inside it are handled in sequence.') diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index 2feb9d0..1c34975 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -8,7 +8,10 @@ # Enter the --help option to see more information. ################################################################################ +from collections import defaultdict + import argparse +import csv import datetime import glob import os @@ -115,14 +118,10 @@ def generate_links(dirpath, old_links={}, abspaths = [] # list of all abspaths latest_mtime = 0. - + collection_basename_dict = defaultdict(list) # Walk the directory tree, one subdirectory "root" at a time... 
for (root, dirs, files) in os.walk(dirpath): - # skip ring_models dirctory - # if 'ring_models' in root: - # continue - local_basenames = [] # Tracks the basenames in this directory local_basenames_uc = [] # Same as above, but upper case for basename in files: @@ -142,6 +141,24 @@ def generate_links(dirpath, old_links={}, logger.invisible('Invisible file skipped', abspath) continue + # collection_basename_dict: a dictonary with the abspath of a collection + # csv file as the key and the list of basenames of its corresponding + # entries as the value. + # Create collection_basename_dict and use it to check whether a file + # is listed in the csv later. + if basename.startswith('collection') and basename.endswith('.csv'): + with open(abspath, 'r')as file: + csv_lines = csv.reader(file) + for line in csv_lines: + if '::' in line[-1]: + lid = line[-1].rpartition('::')[0] + else: + lid = line[-1] + csv_basename = lid.rpartition(':')[-1] + + if csv_basename not in collection_basename_dict[abspath]: + collection_basename_dict[abspath].append(csv_basename) + abspaths.append(abspath) local_basenames.append(basename) local_basenames_uc.append(basename.upper()) @@ -149,10 +166,6 @@ def generate_links(dirpath, old_links={}, local_labels = [f for f in local_basenames if '.xml' in f] local_labels_abspath = [os.path.join(root, f) for f in local_labels] - # If the current directory doesn't have any label, we skip this directory - if len(local_labels) == 0: - continue - # Update linkinfo_dict, searching each relevant file for possible links. # If the linking file is a label and the target file has a matching # name, update the label_dict entry for the target. @@ -316,6 +329,10 @@ def generate_links(dirpath, old_links={}, linkinfo_dict[abspath] = new_linkinfo_list + parent_root = root.rpartition('/')[0] + local_collection_csv_prefix = f'{root}/collection' + parent_collection_csv_prefix = f'{parent_root}/collection' + # Identify labels for files for basename in local_basenames: @@ -326,18 +343,18 @@ def generate_links(dirpath, old_links={}, abspath = os.path.join(root, basename) - # linkinfo_dict: a dictionary with the abspath of a label file as the key and - # a list of its corresponding files (LinkInfo objects) under file_name tags as - # the value. + # linkinfo_dict: a dictionary with the abspath of a label file as the key + # and a list of its corresponding files (LinkInfo objects) under file_name + # tags as the value. # label_dict: a dictionary with the abspath of a file as the key and the # abspath of its corresponding label as the value. # At the current directory, if a file basename is in the list of a label's # (in same directory) file_name tags in linkinfo_dict, create an entry of - # that file basename in label_dict. This will make sure the file is pointing - # to it's correct corresponding label. + # that file basename in label_dict. This will make sure the file is + # pointing to it's correct corresponding label. for label_abspath, link_info_list in linkinfo_dict.items(): - # if the label is at the same directory, skip it. + # if the label is not at the same directory, skip it. 
if label_abspath not in local_labels_abspath: continue @@ -349,6 +366,20 @@ def generate_links(dirpath, old_links={}, if abspath in label_dict: continue # label already found + # For files like errata.txt, or checksum files that don't exist in the + # label nor exist in the csv, they are not part of the archive, so they + # don't have labels + is_basename_in_csv = False + for col_abspath, csv_basenames in collection_basename_dict.items(): + if (col_abspath.startswith(parent_collection_csv_prefix) or + col_abspath.startswith(local_collection_csv_prefix)): + if basename.rpartition('.')[0] in csv_basenames: + is_basename_in_csv = True + break + + if not is_basename_in_csv: + continue + # Maybe we already know the label is missing test = KNOWN_MISSING_LABELS.first(abspath) if test == 'unneeded': From a7c725d5a22932fced1b3c4be048ac15624a752e Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Wed, 16 Oct 2024 05:00:58 +0800 Subject: [PATCH 14/21] Separate BUNDLENAME* and BUNDLESET* regex, put them under PDS3 & PDS4 subclasses. --- pdsfile/pds3file/__init__.py | 11 +++++++++++ pdsfile/pds4file/__init__.py | 26 ++++++++++---------------- pdsfile/pdsfile.py | 12 ------------ 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/pdsfile/pds3file/__init__.py b/pdsfile/pds3file/__init__.py index af05135..fc90c8a 100644 --- a/pdsfile/pds3file/__init__.py +++ b/pdsfile/pds3file/__init__.py @@ -54,6 +54,17 @@ class Pds3File(PdsFile): IDX_EXT = '.tab' LBL_EXT = '.lbl' + BUNDLESET_REGEX = re.compile(r'^([A-Z][A-Z0-9x]{1,5}_[0-9x]{3}x)$') + BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) + BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ + r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ + r'_in_prep|_prelim|_peer_review|'+ + r'_lien_resolution|)' + + r'((|_calibrated|_diagrams|_metadata|_previews)' + + r'(|_md5\.txt|\.tar\.gz))$') + + BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py index 1ebd593..8fd33b8 100644 --- a/pdsfile/pds4file/__init__.py +++ b/pdsfile/pds4file/__init__.py @@ -13,22 +13,6 @@ class Pds4File(PdsFile): - BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$') - BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|' + - r'_v[0-9]+\.[0-9]+|_v[0-9]+|' + - r'_in_prep|_prelim|_peer_review|' + - r'_lien_resolution|)' + - r'((|_calibrated|_diagrams|_metadata|_previews)' + - r'(|_md5\.txt|\.tar\.gz))$') - BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - - BUNDLENAME_REGEX = re.compile(r'((^uranus_occ_u\d{0,4}._[a-z]*_(fos|\d{2,3}cm))'+ - r'|(^cassini\_[a-z]{3,4}\_cruise))$') - BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') - BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) - PDS_HOLDINGS = 'pds4-holdings' BUNDLE_DIR_NAME = 'bundles' @@ -72,6 +56,16 @@ class Pds4File(PdsFile): LBL_EXT = '.xml' # TODO: Generalize PDS4 bundlenames in the future once we have more bundles + BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$') + BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) + BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|' + + 
r'_v[0-9]+\.[0-9]+|_v[0-9]+|' + + r'_in_prep|_prelim|_peer_review|' + + r'_lien_resolution|)' + + r'((|_calibrated|_diagrams|_metadata|_previews)' + + r'(|_md5\.txt|\.tar\.gz))$') + BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) BUNDLENAME_REGEX = re.compile(r'^([a-zA-z\_].+)$') BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py index 039f163..0a3cebd 100644 --- a/pdsfile/pdsfile.py +++ b/pdsfile/pdsfile.py @@ -259,18 +259,6 @@ class PdsFile(object): VIEWABLE_EXTS = set(['jpg', 'png', 'gif', 'tif', 'tiff', 'jpeg', 'jpeg_small']) DATAFILE_EXTS = set(['dat', 'img', 'cub', 'qub', 'fit', 'fits']) - # REGEX - BUNDLESET_REGEX = re.compile(r'^([A-Z][A-Z0-9x]{1,5}_[0-9x]{3}x)$') - BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) - BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution|)' + - r'((|_calibrated|_diagrams|_metadata|_previews)' + - r'(|_md5\.txt|\.tar\.gz))$') - BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - CATEGORY_REGEX = re.compile(r'^(|checksums\-)(|archives\-)(\w+)$') CATEGORY_REGEX_I = re.compile(CATEGORY_REGEX.pattern, re.I) From e059710836ad2ea02e166094a0c1a3a6e3146228 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Wed, 16 Oct 2024 05:08:46 +0800 Subject: [PATCH 15/21] Removed redundant BUNDLESET* and BUNDLENAME* regex for PDS3. --- pdsfile/pds3file/__init__.py | 24 ------------------ pdsfile/pds4file/__init__.py | 49 ++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/pdsfile/pds3file/__init__.py b/pdsfile/pds3file/__init__.py index 812e64a..e8591b1 100644 --- a/pdsfile/pds3file/__init__.py +++ b/pdsfile/pds3file/__init__.py @@ -91,30 +91,6 @@ class Pds3File(PdsFile): IDX_EXT = '.tab' LBL_EXT = '.lbl' - BUNDLESET_REGEX = re.compile(r'^([A-Z][A-Z0-9x]{1,5}_[0-9x]{3}x)$') - BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) - BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution|)' + - r'((|_calibrated|_diagrams|_metadata|_previews)' + - r'(|_md5\.txt|\.tar\.gz))$') - - BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') - - BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) - BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') - BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) - BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution)$') - BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) - def __init__(self): super().__init__() diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py index 8fd33b8..76e5257 100644 --- a/pdsfile/pds4file/__init__.py +++ b/pdsfile/pds4file/__init__.py @@ -16,6 +16,31 @@ class Pds4File(PdsFile): PDS_HOLDINGS = 'pds4-holdings' BUNDLE_DIR_NAME = 'bundles' + # TODO: Generalize PDS4 bundlenames in the future once we have more bundles + # REGEX + BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$') + BUNDLESET_REGEX_I = 
re.compile(BUNDLESET_REGEX.pattern, re.I) + BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|' + + r'_v[0-9]+\.[0-9]+|_v[0-9]+|' + + r'_in_prep|_prelim|_peer_review|' + + r'_lien_resolution|)' + + r'((|_calibrated|_diagrams|_metadata|_previews)' + + r'(|_md5\.txt|\.tar\.gz))$') + BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) + BUNDLENAME_REGEX = re.compile(r'^([a-zA-z\_].+)$') + + BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) + BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') + BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) + BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ + r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ + r'_in_prep|_prelim|_peer_review|'+ + r'_lien_resolution)$') + BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) + # Logger LOGGER = pdslogger.NullLogger() @@ -55,30 +80,6 @@ class Pds4File(PdsFile): IDX_EXT = '.csv' LBL_EXT = '.xml' - # TODO: Generalize PDS4 bundlenames in the future once we have more bundles - BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$') - BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) - BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|' + - r'_v[0-9]+\.[0-9]+|_v[0-9]+|' + - r'_in_prep|_prelim|_peer_review|' + - r'_lien_resolution|)' + - r'((|_calibrated|_diagrams|_metadata|_previews)' + - r'(|_md5\.txt|\.tar\.gz))$') - BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - BUNDLENAME_REGEX = re.compile(r'^([a-zA-z\_].+)$') - - BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) - BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') - BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) - BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution)$') - BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) - def __init__(self): super().__init__() From d83b14637088e25807db5078e99a08804160605a Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Fri, 18 Oct 2024 01:12:20 +0800 Subject: [PATCH 16/21] Change is_volume_dir to is_bundle_dir in pds4checksums.py --- holdings_maintenance/pds4/pds4checksums.py | 4 ++-- pdsfile/pds4file/__init__.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py index 992aaf5..9f68127 100755 --- a/holdings_maintenance/pds4/pds4checksums.py +++ b/holdings_maintenance/pds4/pds4checksums.py @@ -733,7 +733,7 @@ def main(): info += [(c, None) for c in children if c.isdir] # "if c.isdir" is False for volset level readme files - elif pdsf.is_volume_dir: + elif pdsf.is_bundle_dir: # Checksum one volume info.append((pdsf, None)) @@ -746,7 +746,7 @@ def main(): if pdsf.is_volume_file: # Checksum one archive file info.append((pdsdir, pdsf.basename)) - elif pdsdir.is_volume_dir: + elif pdsdir.is_bundle_dir: # Checksum one top-level file in volume info.append((pdsdir, pdsf.basename)) else: diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py index 76e5257..01f326f 100644 --- a/pdsfile/pds4file/__init__.py +++ b/pdsfile/pds4file/__init__.py @@ -18,7 
+18,10 @@ class Pds4File(PdsFile):
 
     # TODO: Generalize PDS4 bundlenames in the future once we have more bundles
     # REGEX
-    BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$')
+    BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|' +
+                                 r'^cassini_iss.*|' +
+                                 r'^cassini_vims.*|' +
+                                 r'^cassini_uvis.*)$')
     BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I)
     BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] +
                                       r'(_v[0-9]+\.[0-9]+\.[0-9]+|' +

From 1c06fe943aba8074f4798f553b81d88516145576 Mon Sep 17 00:00:00 2001
From: Dave Chang
Date: Fri, 18 Oct 2024 10:53:52 +0800
Subject: [PATCH 17/21] Update read_links in pds4linkshelf.py to capture file
 names from the file_name tags and avoid capturing the file name in the
 label's title tag. This prevents duplicate LinkInfo file names when the file
 name also appears in the title tag.

---
 holdings_maintenance/pds4/pds4linkshelf.py | 80 +++++++++++++++-------
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py
index 1c34975..e870e4b 100755
--- a/holdings_maintenance/pds4/pds4linkshelf.py
+++ b/holdings_maintenance/pds4/pds4linkshelf.py
@@ -37,8 +37,9 @@
 # Match pattern for any file name, but possibly things that are not file names
 PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?'
 
-# Match pattern for the file name in anything of the form "keyword = filename"
-TARGET_REGEX1 = re.compile(r'^ *\^?\w+ *= *\(?\{? *' + PATTERN, re.I)
+# Match pattern for the file name in anything of the form
+# "<file_name>file name</file_name>" in the PDS4 label
+TARGET_REGEX1 = re.compile(r'^ *\<file_name\>' + PATTERN + r'\<\/file_name\>', re.I)
 
 # Match pattern for a file name on a line by itself
 TARGET_REGEX2 = re.compile(r'^ *,? *' + PATTERN, re.I)
@@ -147,7 +148,8 @@ def generate_links(dirpath, old_links={},
             # Create collection_basename_dict and use it to check whether a file
             # is listed in the csv later.
if basename.startswith('collection') and basename.endswith('.csv'): - with open(abspath, 'r')as file: + logger.debug('Construct collection basename dictionary from', abspath) + with open(abspath, 'r') as file: csv_lines = csv.reader(file) for line in csv_lines: if '::' in line[-1]: @@ -184,7 +186,8 @@ def generate_links(dirpath, old_links={}, continue # Get list of link info for all possible linked filenames - logger.debug('*** REVIEWING', abspath) + # logger.debug('*** REVIEWING', abspath) + logger.info('*** Get link info and review', abspath) linkinfo_list = read_links(abspath, logger=logger) # Apply repairs @@ -281,20 +284,20 @@ def generate_links(dirpath, old_links={}, # Report the outcome if nonlocal_target: logger.debug('Located "%s"' % info.linkname, - nonlocal_target) + nonlocal_target) info.target = nonlocal_target new_linkinfo_list.append(info) continue if linkname_uc.endswith('.FMT'): logger.error('Unable to locate .FMT file "%s"' % - info.linkname, abspath) + info.linkname, abspath) elif linkname_uc.endswith('.CAT'): logger.error('Unable to locate .CAT file "%s"' % - info.linkname, abspath) + info.linkname, abspath) else: logger.debug('Substring "%s" is not a link, ignored' % - info.linkname, abspath) + info.linkname, abspath) continue @@ -312,8 +315,10 @@ def generate_links(dirpath, old_links={}, linkname_uc[:ltest] == baseroot_uc and linkname_uc[ltest] == '.'): label_dict[info.target] = abspath - logger.debug('Label identified for %s' % info.linkname, - abspath) + # logger.debug('Label identified for %s' % info.linkname, + # abspath) + logger.info('Label identified (by name) for %s' % + info.linkname, abspath) continue # Otherwise, then maybe @@ -325,7 +330,7 @@ def generate_links(dirpath, old_links={}, candidate_labels[info.linkname] = [basename] logger.debug('Candidate label found for ' + - info.linkname, abspath) + info.linkname, abspath) linkinfo_dict[abspath] = new_linkinfo_list @@ -352,6 +357,7 @@ def generate_links(dirpath, old_links={}, # (in same directory) file_name tags in linkinfo_dict, create an entry of # that file basename in label_dict. This will make sure the file is # pointing to it's correct corresponding label. + is_label_found = False for label_abspath, link_info_list in linkinfo_dict.items(): # if the label is not at the same directory, skip it. 
@@ -361,7 +367,12 @@ def generate_links(dirpath, old_links={}, for info in link_info_list: if info.linktext == basename and abspath not in label_dict: label_dict[abspath] = label_abspath + logger.info('Label identified (by file_name tag) for %s' % + info.linktext, label_abspath) + is_label_found = True break + if is_label_found: + break if abspath in label_dict: continue # label already found @@ -419,11 +430,11 @@ def generate_links(dirpath, old_links={}, # Report a phantom label if obvious_label_basename: logger.error('Label %s does not point to file' % - local_basenames[k], abspath) + local_basenames[k], abspath) if len(candidates) == 1: logger.debug('Label found as ' + candidates[0], abspath, - force=True) + force=True) label_dict[abspath] = os.path.join(root, candidates[0]) continue @@ -433,10 +444,10 @@ def generate_links(dirpath, old_links={}, logger.error('Label is missing', abspath) else: logger.error('Ambiguous label found as %s' % candidates[0], - abspath, force=True) + abspath, force=True) for candidate in candidates[1:]: logger.debug('Alternative label found as %s' % candidate, - abspath, force=True) + abspath, force=True) # Merge the dictionaries # There are cases where a file can have both a list of links and a label. @@ -494,6 +505,18 @@ def read_links(abspath, logger=None): # Search for the target of a link is_target = True matchobj = TARGET_REGEX1.match(rec) + + # if matchobj: + # print('----------------') + # print(rec) + # print(recno) + # print(matchobj.group(1)) + # obj2 = TARGET_REGEX2.match(rec) + # if obj2: + # print('obj2') + # print(obj2.group(1)) + + if matchobj: subrec = rec[:matchobj.end()] if '(' in subrec or '{' in subrec: @@ -504,22 +527,29 @@ def read_links(abspath, logger=None): matchobj = TARGET_REGEX2.match(rec) # If not found, search for any other referenced file name or path - if not matchobj: - if ')' in rec or '}' in rec: - multiple_targets = False + # if not matchobj: + # if ')' in rec or '}' in rec: + # multiple_targets = False - is_target = False - matchobj = LINK_REGEX.match(rec) - if matchobj: - multiple_targets = False + # is_target = False + # matchobj = LINK_REGEX.match(rec) + # if matchobj: + # multiple_targets = False # No more matches in this record if not matchobj: break - # if 'u0_kao_91cm_734nm_ring_beta_ingress_sqw' in abspath: + # if 'data_raw/129xxxxxxx/12945xxxxx/1294561143w.xml' in abspath and matchobj: # print('readdddd') # print(rec) + # print(recno) + # print('match TARGET_REGEX1') + # print(TARGET_REGEX1.match(rec)) + # print('match TARGET_REGEX2') + # print(TARGET_REGEX2.match(rec)) + # print('match LINK_REGEX') + # print(LINK_REGEX.match(rec)) # print(matchobj.group(1)) @@ -528,6 +558,10 @@ def read_links(abspath, logger=None): rec = rec[matchobj.end():] + # if 'data_raw/129xxxxxxx/12945xxxxx/1294561143w.xml' in abspath: + # for link in links: + # print(link.linktext) + return links def locate_nonlocal_link(abspath, filename): From 0eea1fe6becd3a65e77c9af25d80461f459f6e88 Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Fri, 18 Oct 2024 10:58:27 +0800 Subject: [PATCH 18/21] Remove the commented out code in the read_links function of pds4linkshelf.py. 
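
For reference, a minimal standalone sketch (not part of this patch series) of
how the file_name-based matching in read_links is intended to behave, using
the PATTERN and file_name regex from this series; the label lines and file
name below are made up for illustration:

    import re

    # Building blocks as used by read_links in pds4linkshelf.py
    PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?'
    TARGET_REGEX1 = re.compile(r'^ *\<file_name\>' + PATTERN + r'\<\/file_name\>', re.I)

    # A <file_name> tag yields the linked basename; a <title> tag is ignored,
    # so a name appearing in both no longer produces duplicate LinkInfo entries.
    label_lines = [
        '        <file_name>1294561143w.img</file_name>',
        '        <title>1294561143w</title>',
    ]
    for rec in label_lines:
        match = TARGET_REGEX1.match(rec)
        print(rec.strip(), '->', match.group(1) if match else None)
    # Expected output:
    #     <file_name>1294561143w.img</file_name> -> 1294561143w.img
    #     <title>1294561143w</title> -> None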
--- holdings_maintenance/pds4/pds4linkshelf.py | 38 ---------------------- 1 file changed, 38 deletions(-) diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index e870e4b..a6c40cc 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -506,17 +506,6 @@ def read_links(abspath, logger=None): is_target = True matchobj = TARGET_REGEX1.match(rec) - # if matchobj: - # print('----------------') - # print(rec) - # print(recno) - # print(matchobj.group(1)) - # obj2 = TARGET_REGEX2.match(rec) - # if obj2: - # print('obj2') - # print(obj2.group(1)) - - if matchobj: subrec = rec[:matchobj.end()] if '(' in subrec or '{' in subrec: @@ -526,42 +515,15 @@ def read_links(abspath, logger=None): elif multiple_targets: matchobj = TARGET_REGEX2.match(rec) - # If not found, search for any other referenced file name or path - # if not matchobj: - # if ')' in rec or '}' in rec: - # multiple_targets = False - - # is_target = False - # matchobj = LINK_REGEX.match(rec) - # if matchobj: - # multiple_targets = False - # No more matches in this record if not matchobj: break - # if 'data_raw/129xxxxxxx/12945xxxxx/1294561143w.xml' in abspath and matchobj: - # print('readdddd') - # print(rec) - # print(recno) - # print('match TARGET_REGEX1') - # print(TARGET_REGEX1.match(rec)) - # print('match TARGET_REGEX2') - # print(TARGET_REGEX2.match(rec)) - # print('match LINK_REGEX') - # print(LINK_REGEX.match(rec)) - # print(matchobj.group(1)) - - linktext = matchobj.group(1) links.append(LinkInfo(recno, linktext, is_target)) rec = rec[matchobj.end():] - # if 'data_raw/129xxxxxxx/12945xxxxx/1294561143w.xml' in abspath: - # for link in links: - # print(link.linktext) - return links def locate_nonlocal_link(abspath, filename): From b2fca02befe1a6ad8e7df5f8e89fe07029ac26bc Mon Sep 17 00:00:00 2001 From: Dave Chang Date: Tue, 22 Oct 2024 02:33:25 +0800 Subject: [PATCH 19/21] Update generate_links and improve its running time by: - moving the intelligence to check if a file is in the file_name tag of a label. Now this step is done after checking whether the file is in the label_dict already. - moving the intelligence to check if a file is in the collection csv files. Now this step is done right before raising an error when we can't find its corresponding label. These two modifications can avoid unnecessary looping of linkinfo_dict and collection_basename_dict. --- holdings_maintenance/pds4/pds4linkshelf.py | 60 +++++++++++++--------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py index a6c40cc..714dcb8 100755 --- a/holdings_maintenance/pds4/pds4linkshelf.py +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -8,8 +8,6 @@ # Enter the --help option to see more information. ################################################################################ -from collections import defaultdict - import argparse import csv import datetime @@ -119,7 +117,7 @@ def generate_links(dirpath, old_links={}, abspaths = [] # list of all abspaths latest_mtime = 0. - collection_basename_dict = defaultdict(list) + collection_basename_dict = {} # Walk the directory tree, one subdirectory "root" at a time... 
for (root, dirs, files) in os.walk(dirpath): @@ -143,12 +141,15 @@ def generate_links(dirpath, old_links={}, continue # collection_basename_dict: a dictonary with the abspath of a collection - # csv file as the key and the list of basenames of its corresponding + # csv file as the key and the set of basenames of its corresponding # entries as the value. # Create collection_basename_dict and use it to check whether a file # is listed in the csv later. - if basename.startswith('collection') and basename.endswith('.csv'): + if (basename.startswith('collection') and + basename.endswith('.csv') and + not abspath in collection_basename_dict): logger.debug('Construct collection basename dictionary from', abspath) + csv_basenames = set() with open(abspath, 'r') as file: csv_lines = csv.reader(file) for line in csv_lines: @@ -157,9 +158,9 @@ def generate_links(dirpath, old_links={}, else: lid = line[-1] csv_basename = lid.rpartition(':')[-1] + csv_basenames.add(csv_basename) - if csv_basename not in collection_basename_dict[abspath]: - collection_basename_dict[abspath].append(csv_basename) + collection_basename_dict[abspath] = csv_basenames abspaths.append(abspath) local_basenames.append(basename) @@ -279,7 +280,7 @@ def generate_links(dirpath, old_links={}, info.linkname) else: nonlocal_target = locate_nonlocal_link(abspath, - info.linkname) + info.linkname) # Report the outcome if nonlocal_target: @@ -348,6 +349,10 @@ def generate_links(dirpath, old_links={}, abspath = os.path.join(root, basename) + if abspath in label_dict: + logger.info('Label already found for %s' % abspath) + continue # label already found + # linkinfo_dict: a dictionary with the abspath of a label file as the key # and a list of its corresponding files (LinkInfo objects) under file_name # tags as the value. @@ -356,7 +361,7 @@ def generate_links(dirpath, old_links={}, # At the current directory, if a file basename is in the list of a label's # (in same directory) file_name tags in linkinfo_dict, create an entry of # that file basename in label_dict. This will make sure the file is - # pointing to it's correct corresponding label. + # pointing to its correct corresponding label. is_label_found = False for label_abspath, link_info_list in linkinfo_dict.items(): @@ -374,21 +379,8 @@ def generate_links(dirpath, old_links={}, if is_label_found: break - if abspath in label_dict: - continue # label already found - - # For files like errata.txt, or checksum files that don't exist in the - # label nor exist in the csv, they are not part of the archive, so they - # don't have labels - is_basename_in_csv = False - for col_abspath, csv_basenames in collection_basename_dict.items(): - if (col_abspath.startswith(parent_collection_csv_prefix) or - col_abspath.startswith(local_collection_csv_prefix)): - if basename.rpartition('.')[0] in csv_basenames: - is_basename_in_csv = True - break - - if not is_basename_in_csv: + # label found by searching linkinfo_dict + if is_label_found: continue # Maybe we already know the label is missing @@ -420,7 +412,9 @@ def generate_links(dirpath, old_links={}, if not label_is_required: logger.debug('Unnecessary label found', abspath, force=True) - label_dict[abspath] = os.path.join(root, obvious_label_basename) + label_abspath = os.path.join(root, obvious_label_basename) + label_dict[abspath] = label_abspath + logger.info('Label found for %s' % abspath, label_abspath) continue # More cases... 
@@ -438,6 +432,22 @@ def generate_links(dirpath, old_links={},
                 label_dict[abspath] = os.path.join(root, candidates[0])
                 continue
 
+            # Before raising an error, check this:
+            # For files like errata.txt, or checksum files that don't exist in the
+            # label nor exist in the csv, they are not part of the archive, so they
+            # don't have labels
+            is_basename_in_csv = False
+            logger.info('Check if %s is in the collection csv' % basename)
+            for col_abspath, csv_basenames in collection_basename_dict.items():
+                if (col_abspath.startswith(parent_collection_csv_prefix) or
+                    col_abspath.startswith(local_collection_csv_prefix)):
+                    if basename.rpartition('.')[0] in csv_basenames:
+                        is_basename_in_csv = True
+                        break
+
+            if not is_basename_in_csv:
+                continue
+
             # or errors...
             label_dict[abspath] = ""
             if len(candidates) == 0:

From 8fa331deb7dbff0a4b6c826e0b99b542f7ab56c5 Mon Sep 17 00:00:00 2001
From: Dave Chang
Date: Tue, 22 Oct 2024 02:44:33 +0800
Subject: [PATCH 20/21] Remove commented-out logger.debug calls in
 generate_links

---
 holdings_maintenance/pds4/pds4linkshelf.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py
index 714dcb8..6f6bbee 100755
--- a/holdings_maintenance/pds4/pds4linkshelf.py
+++ b/holdings_maintenance/pds4/pds4linkshelf.py
@@ -187,7 +187,6 @@ def generate_links(dirpath, old_links={},
             continue
 
         # Get list of link info for all possible linked filenames
-        # logger.debug('*** REVIEWING', abspath)
         logger.info('*** Get link info and review', abspath)
         linkinfo_list = read_links(abspath, logger=logger)
 
@@ -316,8 +315,6 @@ def generate_links(dirpath, old_links={},
             linkname_uc[:ltest] == baseroot_uc and
             linkname_uc[ltest] == '.'):
                 label_dict[info.target] = abspath
-                # logger.debug('Label identified for %s' % info.linkname,
-                #              abspath)
                 logger.info('Label identified (by name) for %s' %
                             info.linkname, abspath)
                 continue

From 39639c671ac433f6efadef47efb5337b70bddc9e Mon Sep 17 00:00:00 2001
From: Dave Chang
Date: Tue, 22 Oct 2024 03:02:17 +0800
Subject: [PATCH 21/21] At lines 156-158 of pds4linkshelf.py, skip empty
 entries of a csv file when parsing each entry to get the basename of a file
 in the archive.

---
 holdings_maintenance/pds4/pds4linkshelf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py
index 6f6bbee..775c99a 100755
--- a/holdings_maintenance/pds4/pds4linkshelf.py
+++ b/holdings_maintenance/pds4/pds4linkshelf.py
@@ -153,6 +153,9 @@ def generate_links(dirpath, old_links={},
                 with open(abspath, 'r') as file:
                     csv_lines = csv.reader(file)
                     for line in csv_lines:
+                        # skip the empty line
+                        if not line:
+                            continue
                         if '::' in line[-1]:
                             lid = line[-1].rpartition('::')[0]
                         else: