diff --git a/holdings_maintenance/pds3/pdschecksums.py b/holdings_maintenance/pds3/pdschecksums.py
index cd99d31..2a16513 100755
--- a/holdings_maintenance/pds3/pdschecksums.py
+++ b/holdings_maintenance/pds3/pdschecksums.py
@@ -614,7 +614,7 @@ def main():
                              'their MD5 checksums to the checksum file. ' +
                              'Checksums of pre-existing files are not checked.')
 
-    parser.add_argument('volume', nargs='+', type=str,
+    parser.add_argument('--volume', nargs='+', type=str,
                         help='The path to the root directory of a volume or ' +
                              'volume set. For a volume set, all the volume ' +
                              'directories inside it are handled in sequence. ' +
@@ -768,21 +768,21 @@ def main():
 
             # Save logs in up to two places
             if pdsf.volname:
-                logfiles = set([pdsf.log_path_for_volume('_md5',
+                logfiles = set([pdsf.log_path_for_bundle('_md5',
                                                          task=args.task,
                                                          dir='pdschecksums'),
-                                pdsf.log_path_for_volume('_md5',
+                                pdsf.log_path_for_bundle('_md5',
                                                          task=args.task,
                                                          dir='pdschecksums',
                                                          place='parallel')])
             else:
-                logfiles = set([pdsf.log_path_for_volset('_md5',
-                                                         task=args.task,
-                                                         dir='pdschecksums'),
-                                pdsf.log_path_for_volset('_md5',
-                                                         task=args.task,
-                                                         dir='pdschecksums',
-                                                         place='parallel')])
+                logfiles = set([pdsf.log_path_for_bundleset('_md5',
+                                                            task=args.task,
+                                                            dir='pdschecksums'),
+                                pdsf.log_path_for_bundleset('_md5',
+                                                            task=args.task,
+                                                            dir='pdschecksums',
+                                                            place='parallel')])
 
             # Create all the handlers for this level in the logger
             local_handlers = []
diff --git a/holdings_maintenance/pds3/pdsinfoshelf.py b/holdings_maintenance/pds3/pdsinfoshelf.py
index 7396946..cbf4a12 100755
--- a/holdings_maintenance/pds3/pdsinfoshelf.py
+++ b/holdings_maintenance/pds3/pdsinfoshelf.py
@@ -662,7 +662,7 @@ def main():
                              'than the shelf file, update the shelf file\'s ' +
                              'modification date.')
 
-    parser.add_argument('volume', nargs='+', type=str,
+    parser.add_argument('--volume', nargs='+', type=str,
                         help='The path to the root of the volume or volume ' +
                              'set. For a volume set, all the volume ' +
                              'directories inside it are handled in sequence.')
diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py
new file mode 100755
index 0000000..9f68127
--- /dev/null
+++ b/holdings_maintenance/pds4/pds4checksums.py
@@ -0,0 +1,859 @@
+#!/usr/bin/env python3
+################################################################################
+# pds4checksums.py library and main program
+#
+# Syntax:
+#   pds4checksums.py --task path [path ...]
+#
+# Enter the --help option to see more information.
+################################################################################ + +import argparse +import datetime +import glob +import hashlib +import os +import shutil +import sys + +import pdslogger +import pdsfile + +# Holds log file directories temporarily, used by move_old_checksums() +LOGDIRS = [] + +LOGNAME = 'pds.validation.checksums' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ + +# From http://stackoverflow.com/questions/3431825/- +# generating-an-md5-checksum-of-a-file + +def hashfile(fname, blocksize=65536): + f = open(fname, 'rb') + hasher = hashlib.md5() + buf = f.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = f.read(blocksize) + return hasher.hexdigest() + +################################################################################ + +def generate_checksums(pdsdir, selection=None, oldpairs=[], regardless=True, + limits={'normal':-1}, logger=None): + """Generate a list of tuples (abspath, checksum) recursively from the given + directory tree. + + If a selection is specified, it is interpreted as the basename of a file, + and only that file is processed. + + The optional oldpairs is a list of (abspath, checksum) pairs. For any file + that already has a checksum in the shortcut list, the checksum is copied + from this list rather than re-calculated. This list is merged with the + selection if a selection is identified. + + If regardless is True, then the checksum of a selection is calculated + regardless of whether it is already in abspairs. + + Also return the latest modification date among all the files checked. + """ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Generating MD5 checksums', dirpath, limits=limits) + + latest_mtime = 0. + try: + md5_dict = {} + for (abspath, hex) in oldpairs: + md5_dict[abspath] = hex + + newtuples = [] + for (path, dirs, files) in os.walk(dirpath): + for file in files: + abspath = os.path.join(path, file) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + + if selection and file != selection: + continue + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' 
in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + if regardless and selection: + md5 = hashfile(abspath) + newtuples.append((abspath, md5, file)) + logger.normal('Selected MD5=%s' % md5, abspath) + + elif abspath in md5_dict: + newtuples.append((abspath, md5_dict[abspath], file)) + logger.debug('MD5 copied', abspath) + + else: + md5 = hashfile(abspath) + newtuples.append((abspath, md5, file)) + logger.normal('MD5=%s' % md5, abspath) + + if selection: + if len(newtuples) == 0: + logger.error('File selection not found', selection) + return ({}, latest_mtime) + + if len(newtuples) > 1: + logger.error('Multiple copies of file selection found', + selection) + return ({}, latest_mtime) + + # Add new values to dictionary + for (abspath, md5, _) in newtuples: + md5_dict[abspath] = md5 + + # Restore original order, old keys then new + old_keys = [p[0] for p in oldpairs] + + newpairs = [] + for key in old_keys: + newpairs.append((key, md5_dict[key])) + del md5_dict[key] + + for (key, new_md5, new_file) in newtuples: + if key in md5_dict: # if not already copied to list of pairs + newpairs.append((key, md5_dict[key])) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return (newpairs, latest_mtime) + +################################################################################ + +def read_checksums(check_path, selection=None, limits={}, logger=None): + + """Return a list of tuples (abspath, checksum) from a checksum file. + + If a selection is specified, then only the checksum with this file name + is returned.""" + + check_path = os.path.abspath(check_path) + pdscheck = pdsfile.Pds4File.from_abspath(check_path) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdscheck.root_) + logger.open('Reading MD5 checksums', check_path, limits=limits) + + try: + logger.info('MD5 checksum file', check_path) + + if not os.path.exists(check_path): + logger.error('MD5 checksum file not found', check_path) + return [] + + prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1] + + # Read the pairs + abspairs = [] + with open(check_path, 'r') as f: + for rec in f: + hexval = rec[:32] + filepath = rec[34:].rstrip() + + if selection and os.path.basename(filepath) != selection: + continue + + basename = os.path.basename(filepath) + if basename == '.DS_Store': + logger.error('.DS_Store found in checksum file', filepath) + continue + + if basename.startswith('._'): + logger.error('._* file found in checksum file', filepath) + continue + + if basename[0] == '.': + logger.invisible('Checksum for invisible file', filepath) + + abspairs.append((prefix_ + filepath, hexval)) + logger.debug('Read', filepath) + + if selection and len(abspairs) == 0: + logger.error('File selection not found', selection) + return [] + + except Exception as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return abspairs + +################################################################################ + +def checksum_dict(dirpath, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.info('Loading checksums for', dirpath, force=True) + + check_path = 
pdsdir.checksum_path_and_lskip()[0] + abspairs = read_checksums(check_path, logger=logger) + + pair_dict = {} + for (abspath, checksum) in abspairs: + pair_dict[abspath] = checksum + + logger.info('Checksum load completed', dirpath, force=True) + return pair_dict + +################################################################################ + +def write_checksums(check_path, abspairs, + limits={'dot_':-1, 'ds_store':-1, 'invisible':100}, + logger=None): + """Write a checksum table containing the given pairs (abspath, checksum).""" + + check_path = os.path.abspath(check_path) + pdscheck = pdsfile.Pds4File.from_abspath(check_path) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdscheck.root_) + logger.open('Writing MD5 checksums', check_path, limits=limits) + + try: + # Create parent directory if necessary + parent = os.path.split(check_path)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1] + lskip = len(prefix_) + + # Write file + f = open(check_path, 'w') + for pair in abspairs: + (abspath, hex) = pair + + if abspath.endswith('/.DS_Store'): # skip .DS_Store files + logger.ds_store('.DS_Store skipped', abspath) + continue + + if '/._' in abspath: # skip dot-underscore files + logger.dot_underscore('._* file skipped', abspath) + continue + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file', abspath) + + f.write('%s %s\n' % (hex, abspath[lskip:])) + logger.debug('Written', abspath) + + f.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_pairs(pairs1, pairs2, selection=None, limits={}, logger=None): + """Validate the first checksum list against the second. + + If a selection is specified, only a file with that basename is checked.""" + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.open('Validating checksums', limits=limits) + + success = True + try: + md5_dict = {} + for (abspath, hex) in pairs2: + md5_dict[abspath] = hex + + for (abspath, hex) in pairs1: + if selection and selection != os.path.basename(abspath): + continue + + if abspath not in md5_dict: + logger.error('Missing checksum', abspath) + success = False + + elif hex != md5_dict[abspath]: + del md5_dict[abspath] + logger.error('Checksum mismatch', abspath) + success = False + + else: + del md5_dict[abspath] + logger.normal('Validated', abspath) + + if not selection: + abspaths = list(md5_dict.keys()) + abspaths.sort() + for abspath in abspaths: + logger.error('Extra file', abspath) + success = False + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + logger.close() + return success + +################################################################################ + +def move_old_checksums(check_path, logger=None): + """Appends a version number to an existing checksum file and moves it to + the associated log directory.""" + + if not os.path.exists(check_path): return + + check_basename = os.path.basename(check_path) + (check_prefix, check_ext) = os.path.splitext(check_basename) + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + from_logged = False + for log_dir in LOGDIRS: + dest_template = log_dir + '/' + check_prefix + '_v???' 
+ check_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(check_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(check_path, dest) + + if not from_logged: + logger.info('Checksum file moved from: ' + check_path) + from_logged = True + + logger.info('Checksum file moved to', dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file does not exist + if os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Checksum file already exists', check_path) + return False + + # Check selection + if selection: + raise ValueError('File selection is disallowed for task ' + + '"initialize": ' + selection) + + # Generate checksums + (pairs, _) = generate_checksums(pdsdir, logger=logger) + if not pairs: + return False + + # Write new checksum file + write_checksums(check_path, pairs, logger=logger) + return True + +def reinitialize(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Warn if checksum file does not exist + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Re-initialize just the selection; preserve others + if selection: + oldpairs = read_checksums(check_path, logger=logger) + if not oldpairs: + return False + else: + oldpairs = [] + + # Generate new checksums + (pairs, _) = generate_checksums(pdsdir, selection, oldpairs, + regardless=True, logger=logger) + if not pairs: + return False + + # Write new checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, pairs, logger=logger) + return True + +def validate(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Checksum file does not exist', check_path) + return False + + # Read checksum file + md5pairs = read_checksums(check_path, selection, logger=logger) + if not md5pairs: + return False + + # Generate checksums + (dirpairs, _) = generate_checksums(pdsdir, selection, logger=logger) + if not dirpairs: + return False + + # Validate + return validate_pairs(dirpairs, md5pairs, selection, logger=logger) + +def repair(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure checksum file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Read checksums file + md5pairs = read_checksums(check_path, logger=logger) 
+ if not md5pairs: + return False + + # Generate new checksums + if selection: + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, selection, md5pairs, + regardless=True, logger=logger) + else: + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, logger=logger) + + if not dirpairs: + return False + + # Compare checksums + md5pairs.sort() + dirpairs.sort() + canceled = (dirpairs == md5pairs) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + check_mtime = os.path.getmtime(check_path) + if latest_mtime > check_mtime: + logger.info('!!! Checksum file content is up to date', + check_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + check_mtime = os.path.getmtime(check_path) + dt = datetime.datetime.fromtimestamp(check_mtime) + logger.info('!!! Checksum file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - check_mtime + if delta >= 86400/10: + logger.info('!!! Checksum file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Checksum file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(check_path) + logger.info('!!! Time tag on checksum file set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + else: + logger.info('!!! Checksum file is up to date; repair canceled', + check_path, force=True) + return True + + # Write checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, dirpairs, logger=logger) + return True + +def update(pdsdir, selection=None, logger=None): + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Make sure file exists + if not os.path.exists(check_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Checksum file does not exist', check_path) + return False + else: + logger.warn('Checksum file does not exist; initializing', check_path) + return initialize(pdsdir, selection=selection, logger=logger) + + # Read checksums file + md5pairs = read_checksums(check_path, logger=logger) + if not md5pairs: + return False + + # Generate new checksums if necessary + (dirpairs, + latest_mtime) = generate_checksums(pdsdir, selection, md5pairs, + regardless=False, logger=logger) + if not dirpairs: + return False + + # Compare checksums + md5pairs.sort() + dirpairs.sort() + canceled = (dirpairs == md5pairs) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Checksum file content is complete; update canceled', + check_path) + return True + + # Write checksum file + move_old_checksums(check_path, logger=logger) + write_checksums(check_path, dirpairs, logger=logger) + return True + +################################################################################ +# Executable program +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdschecksums: Create, maintain and validate MD5 ' + + 'checksum files for PDS volumes and volume sets.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an MD5 checksum file for a volume or ' + + 'volume set. 
Abort if the checksum file ' +
+                             'already exists.')
+
+    parser.add_argument('--reinitialize', '--reinit', const='reinitialize',
+                        default='', action='store_const', dest='task',
+                        help='Create an MD5 checksum file for a volume or ' +
+                             'volume set. Replace the checksum file if it ' +
+                             'already exists. If a single file is specified, ' +
+                             'such as one archive file in a volume set, only ' +
+                             'that single checksum is re-initialized.')
+
+    parser.add_argument('--validate', const='validate',
+                        default='', action='store_const', dest='task',
+                        help='Validate every file in a volume directory tree ' +
+                             'against its MD5 checksum. If a single file ' +
+                             'is specified, such as one archive file in a ' +
+                             'volume set, only that single checksum is ' +
+                             'validated.')
+
+    parser.add_argument('--repair', const='repair',
+                        default='', action='store_const', dest='task',
+                        help='Validate every file in a volume directory tree ' +
+                             'against its MD5 checksum. If any disagreement ' +
+                             'is found, the checksum file is replaced; ' +
+                             'otherwise it is unchanged. If a single file is ' +
+                             'specified, such as one archive file of a ' +
+                             'volume set, then only that single checksum is ' +
+                             'repaired. If any of the files checked are newer ' +
+                             'than the checksum file, update the checksum ' +
+                             'file\'s modification date.')
+
+    parser.add_argument('--update', const='update',
+                        default='', action='store_const', dest='task',
+                        help='Search a directory for any new files and add ' +
+                             'their MD5 checksums to the checksum file. ' +
+                             'Checksums of pre-existing files are not checked.')
+
+    parser.add_argument('bundle', nargs='+', type=str,
+                        help='The path to the root directory of a volume or ' +
+                             'volume set. For a volume set, all the volume ' +
+                             'directories inside it are handled in sequence. ' +
+                             'Note that, for archive directories, checksums ' +
+                             'are grouped into one file for the entire ' +
+                             'volume set.')
+
+    parser.add_argument('--log', '-l', type=str, default='',
+                        help='Optional root directory for a duplicate of the ' +
+                             'log files. If not specified, the value of ' +
+                             'environment variable "%s" ' % LOGROOT_ENV +
+                             'is used. In addition, individual logs are ' +
+                             'written into the "logs" directory parallel to ' +
+                             '"holdings". 
Logs are created inside the ' + + '"pdschecksums" subdirectory of each log root ' + + 'directory.') + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + parser.add_argument('--archives', '-a', default=False, action='store_true', + help='Instead of referring to a volume, refer to the ' + + 'the archive file for that volume.') + + parser.add_argument('--infoshelf', '-i', dest='infoshelf', + default=False, action='store_true', + help='After a successful run, also execute the ' + + 'equivalent pdsinfoshelf command.') + + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdschecksums error: Missing task') + sys.exit(1) + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdschecksums') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Prepare the list of paths + abspaths = [] + for path in args.bundle: + + # Make sure path makes sense + path = os.path.abspath(path) + parts = path.partition('/pds4-holdings/') + if not parts[1]: + print('Not a holdings subdirectory: ' + path) + sys.exit(1) + + if parts[2].startswith('checksums-'): + print('No checksums for checksum files: ' + path) + sys.exit(1) + + # Convert to an archives path if necessary + if args.archives and not parts[2].startswith('archives-'): + path = parts[0] + '/pds4-holdings/archives-' + parts[2] + + # Convert to a list of absolute paths that exist (volsets or volumes) + try: + pdsf = pdsfile.Pds4File.from_abspath(path, must_exist=True) + abspaths.append(pdsf.abspath) + + except (ValueError, IOError): + # Allow a volume name to stand in for a .tar.gz archive + (dir, basename) = os.path.split(path) + pdsdir = pdsfile.Pds4File.from_abspath(dir) + if pdsdir.archives_ and '.' not in basename: + if pdsdir.voltype_ == 'volumes/': + basename += '.tar.gz' + else: + basename += '_%s.tar.gz' % pdsdir.voltype_[:-1] + + newpaths = glob.glob(os.path.join(dir, basename)) + if len(newpaths) == 0: + raise + + abspaths += newpaths + continue + else: + raise + + # Generate a list of tuples (pdsfile, selection) + info = [] + for path in abspaths: + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.is_bundleset_dir: + # Archive directories are checksumed by volset + if pdsf.archives_: + info.append((pdsf, None)) + + # Others are checksumed by volume + else: + children = [pdsf.child(c) for c in pdsf.childnames] + info += [(c, None) for c in children if c.isdir] + # "if c.isdir" is False for volset level readme files + + elif pdsf.is_bundle_dir: + # Checksum one volume + info.append((pdsf, None)) + + elif pdsf.isdir: + print('Invalid directory for checksumming: ' + pdsf.logical_path) + sys.exit(1) + + else: + pdsdir = pdsf.parent() + if pdsf.is_volume_file: + # Checksum one archive file + info.append((pdsdir, pdsf.basename)) + elif pdsdir.is_bundle_dir: + # Checksum one top-level file in volume + info.append((pdsdir, pdsf.basename)) + else: + print('Invalid file for checksumming: ' + pdsf.logical_path) + sys.exit(1) + + # Begin logging and loop through tuples... 
+ logger.open(' '.join(sys.argv)) + try: + for (pdsdir, selection) in info: + path = pdsdir.abspath + + if selection: + pdsf = pdsdir.child(os.path.basename(selection)) + else: + pdsf = pdsdir + + check_path = pdsdir.checksum_path_and_lskip()[0] + + # Save logs in up to two places + if pdsf.bundlename: + logfiles = set([pdsf.log_path_for_bundle('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_bundle('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) + else: + logfiles = set([pdsf.log_path_for_bundleset('_md5', + task=args.task, + dir='pdschecksums'), + pdsf.log_path_for_bundleset('_md5', + task=args.task, + dir='pdschecksums', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_checksums() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(info) > 1: + logger.blankline() + + if selection: + logger.open('Task "' + args.task + '" for selection ' + + selection, path, handler=local_handlers) + else: + logger.open('Task "' + args.task + '" for', path, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + proceed = initialize(pdsdir, selection) + + elif args.task == 'reinitialize': + if selection: # don't erase everything else! + proceed = update(pdsdir, selection) + else: + proceed = reinitialize(pdsdir, selection) + + elif args.task == 'validate': + proceed = validate(pdsdir, selection) + + elif args.task == 'repair': + proceed = repair(pdsdir, selection) + + else: # update + proceed = update(pdsdir, selection) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + proceed = False + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + proceed = False + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + proceed = False + + # If everything went well, execute pdsinfoshelf too + if proceed and args.infoshelf: + new_list = [a.replace('pdschecksums', 'pdsinfoshelf') for a in sys.argv] + new_list = [a for a in new_list if a not in ('--infoshelf', '-i')] + status = os.system(' '.join(new_list)) + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/holdings_maintenance/pds4/pds4indexshelf.py b/holdings_maintenance/pds4/pds4indexshelf.py new file mode 100755 index 0000000..177e419 --- /dev/null +++ b/holdings_maintenance/pds4/pds4indexshelf.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsindexshelf.py library and main program +# +# Syntax: +# pdsindexshelf.py --task index_path.csv [index_path.csv ...] +# +# Enter the --help option to see more information. 
+################################################################################ + +import argparse +import datetime +import glob +import os +import pickle +import sys + +import pdslogger +import pdsfile +import pdstable + +LOGNAME = 'pds.validation.indexshelf' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +################################################################################ + +def generate_indexdict(pdsf, logger=None): + """Generate a dictionary keyed by row key for each row in the given table. + The value returned is a list containing all the associated row indices. + """ + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Tabulating index rows for', pdsf.abspath) + + try: + table = pdstable.PdsTable(pdsf.label_abspath, + filename_keylen=pdsf.filename_keylen) + + table.index_rows_by_filename_key() # fills in table.filename_keys + childnames = table.filename_keys + index_dict = {c:table.row_indices_by_filename_key(c) + for c in childnames} + + logger.info('Rows tabulated', str(len(index_dict)), force=True) + + latest_mtime = max(os.path.getmtime(pdsf.abspath), + os.path.getmtime(pdsf.label_abspath)) + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Latest index file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + except (OSError, ValueError) as e: + logger.error(str(e)) + raise e + + finally: + _ = logger.close() + + return (index_dict, latest_mtime) + +################################################################################ + +def write_indexdict(pdsf, index_dict, logger=None): + """Write a new shelf file for the rows of this index.""" + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Writing index shelf file info for', pdsf.abspath) + + try: + pdsfile.Pds4File.close_all_shelves() # prevents using a cached shelf file + + shelf_path = pdsf.indexshelf_abspath + logger.info('Index shelf file', shelf_path) + + # Create parent directory if necessary + parent = os.path.split(shelf_path)[0] + if not os.path.exists(parent): + logger.info('Creating parent directory', parent) + os.makedirs(parent) + + # Write the pickle file + with open(shelf_path, 'wb') as f: + pickle.dump(index_dict, f) + + # Write the Python file + python_path = shelf_path.rpartition('.')[0] + '.py' + logger.info('Writing Python file', python_path) + + # Determine the maximum length of the keys + len_path = 0 + for key in index_dict: + len_path = max(len_path, len(key)) + + name = os.path.basename(shelf_path).rpartition('.')[0] + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for key in index_dict: + f.write(' "%s: ' % (key + '"' + (len_path-len(key)) * ' ')) + + rows = index_dict[key] + if len(rows) == 1: + f.write('%d,\n' % rows[0]) + else: + f.write('(') + for row in rows[:-1]: + f.write('%d, ' % row) + f.write('%d),\n' % rows[-1]) + + f.write('}\n\n') + + logger.info('Two files written') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def load_indexdict(pdsf, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.open('Reading index shelf file for', pdsf.abspath) + + try: + shelf_path = pdsf.indexshelf_abspath + logger.info('Index shelf file', shelf_path) + + if not os.path.exists(shelf_path): + 
logger.error('Index shelf file not found', shelf_path) + return {} + + with open(shelf_path, 'rb') as f: + index_dict = pickle.load(f) + + logger.info('Shelf records loaded', str(len(index_dict))) + + except pickle.PickleError as e: + logger.exception(e) + raise + + finally: + logger.close() + + return index_dict + +################################################################################ + +def validate_infodict(pdsf, tabdict, shelfdict, logger=None): + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsf.root_) + logger.info('Validating index file for', pdsf.abspath) + + if tabdict == shelfdict: + logger.info('Validation complete') + else: + logger.error('Validation failed for', pdsf.abspath) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsf, logger=None): + + shelf_path = pdsf.indexshelf_abspath + + # Make sure file does not exist + if os.path.exists(pdsf.indexshelf_abspath): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Index shelf file already exists', shelf_path) + return + + # Generate info + (index_dict, _) = generate_indexdict(pdsf, logger=logger) + if index_dict is None: + return + + # Save info file + write_indexdict(pdsf, index_dict, logger=logger) + +def reinitialize(pdsf, logger=None): + + shelf_path = pdsf.indexshelf_abspath + + # Warn if shelf file does not exist + if not os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Index shelf file does not exist; initializing', shelf_path) + initialize(pdsf, logger=logger) + return + + # Generate info + (index_dict, _) = generate_indexdict(pdsf, logger=logger) + if not index_dict: + return + + # Save info file + write_indexdict(pdsf, index_dict, logger=logger) + +def validate(pdsf, logger=None): + + shelf_path = pdsf.indexshelf_abspath + + # Make sure file exists + if not os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Index shelf file does not exist', shelf_path) + return + + (table_indexdict, _) = generate_indexdict(pdsf, logger=logger) + if table_indexdict is None: + return + + shelf_indexdict = load_indexdict(pdsf, logger=logger) + if not shelf_indexdict: + return + + # Validate + validate_infodict(pdsf, table_indexdict, shelf_indexdict, + logger=logger) + +def repair(pdsf, logger=None, op='repair'): + + shelf_path = pdsf.indexshelf_abspath + + # Make sure file exists + if not os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Index shelf file does not exist; initializing', + shelf_path) + initialize(pdsf, logger=logger) + return + + (table_indexdict, latest_mtime) = generate_indexdict(pdsf, logger=logger) + if not table_indexdict: + return + + shelf_indexdict = load_indexdict(pdsf, logger=logger) + if not shelf_indexdict: + return + + # Compare + canceled = (table_indexdict == shelf_indexdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + shelf_pypath = shelf_path.replace('.pickle', '.py') + shelf_mtime = min(os.path.getmtime(shelf_path), + os.path.getmtime(shelf_pypath)) + if latest_mtime > shelf_mtime: + logger.info('!!! Index shelf file content is up to date', + shelf_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! 
Index file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + dt = datetime.datetime.fromtimestamp(shelf_mtime) + logger.info('!!! Index shelf file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - shelf_mtime + if delta >= 86400/10: + logger.info('!!! Index shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Index shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(shelf_path) + os.utime(shelf_pypath) + logger.info('!!! Time tag on index shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + else: + logger.info('!!! Index shelf file is up to date; repair canceled', + shelf_path, force=True) + + return + + # Write new info + write_indexdict(pdsf, table_indexdict, logger=logger) + +def update(pdsf, selection=None, logger=None): + + shelf_path = pdsf.indexshelf_abspath + if os.path.exists(shelf_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Index shelf file exists; not updated', pdsf.abspath) + + else: + initialize(pdsf, logger) + +################################################################################ +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsindexshelf: Create, maintain and validate shelf files ' + + 'containing row lookup information for index files.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an indexshelf file for an index or for ' + + 'an entire metadata directory. Abort if the file '+ + 'already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an indexshelf file for an index or for ' + + 'an entire metadata directory. Replace any files '+ + 'that already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate an indexshelf file or metadata ' + + 'directory.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate an index shelf file; replace only if ' + + 'necessary. If the shelf file content is correct '+ + 'but it is older than either the file or the ' + + 'label, update the shelf file\'s modification ' + + 'date.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a metadata directory for any new index ' + + 'files and add create an index shelf file for ' + + 'each one. Existing index shelf files are not ' + + 'checked.') + + parser.add_argument('--table', nargs='+', type=str, + help='Path to an index file or metadata directory.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". 
Logs are created inside the "index" '+ + 'subdirectory of each log root directory.') + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsindexshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsindexshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of Pds4File objects before logging + pdsfiles = [] + + for path in args.table: + + if not os.path.exists(path): + + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.isdir: + if not '/metadata/' in path: + print('Not a metadata directory: ' + path) + sys.exit(1) + + tables = glob.glob(os.path.join(path, '*.csv')) + if not tables: + tables = glob.glob(os.path.join(path, '*/*.csv')) + + if not tables: + print('No .csv files in directory: ' + path) + sys.exit(1) + + pdsfiles += pdsfile.Pds4File.pdsfiles_for_abspaths(tables) + + else: + if not '/metadata/' in path: + print('Not a metadata file: ' + path) + sys.exit(1) + if not path.endswith('.csv'): + print('Not a table file: ' + path) + sys.exit(1) + + pdsfiles.append(pdsf) + + # Open logger and loop through tables... 
+ logger.open(' '.join(sys.argv)) + try: + for pdsf in pdsfiles: + + # Save logs in up to two places + logfiles = [pdsf.log_path_for_index(task=args.task, + dir='pdsindexshelf'), + pdsf.log_path_for_index(task=args.task, + dir='pdsindexshelf', + place='parallel')] + if logfiles[0] == logfiles[1]: + logfiles = logfiles[:-1] + + # Create all the handlers for this level in the logger + local_handlers = [] + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = (logfile.rpartition('/pdsindexshelf/')[0] + + '/pdsindexshelf') + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(pdsfiles) > 1: + logger.blankline() + + logger.open('Task "' + args.task + '" for', pdsf.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsf) + + elif args.task == 'reinitialize': + reinitialize(pdsf) + + elif args.task == 'validate': + validate(pdsf) + + elif args.task == 'repair': + repair(pdsf) + + else: # update + update(pdsf) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + print(sys.exc_info()[2]) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py new file mode 100755 index 0000000..7238ba6 --- /dev/null +++ b/holdings_maintenance/pds4/pds4infoshelf.py @@ -0,0 +1,896 @@ +#!/usr/bin/env python3 +################################################################################ +# pdsinfoshelf.py library and main program +# +# Syntax: +# pdsinfoshelf.py --task path [path ...] +# +# Enter the --help option to see more information. +################################################################################ + +import argparse +import datetime +import glob +import os +from pathlib import Path +import pickle +import shutil +import sys +from PIL import Image + +import pdslogger +import pdsfile + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(REPO_ROOT)) + +from holdings_maintenance.pds4 import pds4checksums + +# Holds log file directories temporarily, used by move_old_info() +LOGDIRS = [] + +LOGNAME = 'pds.validation.fileinfo' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +PREVIEW_EXTS = set(['.jpg', '.png', '.gif', '.tif', '.tiff', + '.jpeg', '.jpeg_small']) + + +################################################################################ + +def generate_infodict(pdsdir, selection, old_infodict={}, + limits={'normal':-1}, logger=None): + """Generate a dictionary keyed by absolute file path for each file in the + directory tree. Value returned is a tuple (nbytes, child_count, modtime, + checksum, preview size). + + If a selection is specified, it is interpreted as the basename of a file, + and only that file is processed. + + The optional old_infodict overrides information found in the directory. + This dictionary is merged with the new information assembled. However, if + a selection is specified, information about the selection is always updated. 
+ + Also return the latest modification date among all the files checked. + """ + + ### Internal function + + def get_info_for_file(abspath): + + nbytes = os.path.getsize(abspath) + children = 0 + mtime = os.path.getmtime(abspath) + dt = datetime.datetime.fromtimestamp(mtime) + modtime = dt.strftime('%Y-%m-%d %H:%M:%S.%f') + try: + checksum = checkdict[abspath] + except KeyError: + logger.error('Missing entry in checksum file', abspath) + checksum = '' + + size = (0,0) + ext = os.path.splitext(abspath)[1] + if ext.lower() in PREVIEW_EXTS: + try: + im = Image.open(abspath) + size = im.size + im.close() + except Exception: + logger.error('Preview size not found', abspath) + + return (nbytes, children, modtime, checksum, size) + + def get_info(abspath, infodict, old_infodict, checkdict): + """Info about the given abspath.""" + + if os.path.isdir(abspath): + nbytes = 0 + children = 0 + modtime = '' + + files = os.listdir(abspath) + for file in files: + absfile = os.path.join(abspath, file) + + if file == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store skipped', absfile) + continue + + if file.startswith('._'): # skip dot-underscore files + logger.dot_underscore('._* file skipped', absfile) + continue + + if '/.' in abspath: # flag invisible files + logger.invisible('Invisible file', absfile) + + info = get_info(absfile, infodict, old_infodict, checkdict) + nbytes += info[0] + children += 1 + modtime = max(modtime, info[2]) + + info = (nbytes, children, modtime, '', (0,0)) + + elif abspath in old_infodict: + info = old_infodict[abspath] + + else: + info = get_info_for_file(abspath) + logger.normal('File info generated', abspath) + + infodict[abspath] = info + return info + + ################################ + # Begin executable code + ################################ + + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + + if selection: + logger.open('Generating file info for selection "%s"' % selection, + dirpath, limits) + else: + logger.open('Generating file info', dirpath, limits) + + try: + # Load checksum dictionary + checkdict = pds4checksums.checksum_dict(dirpath, logger=logger) +# Removed... because we can't ignore empty directories +# if not checkdict: +# return ({}, 0.) + + # Generate info recursively + infodict = {} + if selection: + root = os.path.join(dirpath, selection) + else: + root = pdsdir.abspath + + info = get_info(root, infodict, old_infodict, checkdict) + latest_modtime = info[2] + + # Merge dictionaries + merged = old_infodict.copy() + + if selection: + merged[root] = infodict[root] + + else: + for (key, value) in infodict.items(): + if key not in merged: + info = infodict[key] + merged[key] = info + latest_modtime = max(latest_modtime, info[2]) + + if not merged: + logger.info('No files found') + latest_modtime = '' + else: + logger.info('Latest holdings file modification date = ' + + latest_modtime[:19], force=True) + + # We also have to check the modtime of the checksum file! 
+ check_path = pdsdir.checksum_path_and_lskip()[0] + timestamp = os.path.getmtime(check_path) + check_datetime = datetime.datetime.fromtimestamp(timestamp) + check_modtime = check_datetime.strftime('%Y-%m-%d %H:%M:%S.%f') + logger.info('Checksum file modification date = ' + check_modtime[:19], + check_path, force=True) + if check_modtime > latest_modtime: + latest_modtime = check_modtime + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + return (merged, latest_modtime) + +################################################################################ + +def load_infodict(pdsdir, logger=None): + + dirpath = pdsdir.abspath + dirpath_ = dirpath.rstrip('/') + '/' + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Reading info shelf file for', dirpath_[:-1]) + + try: + (info_path, lskip) = pdsdir.shelf_path_and_lskip('info') + logger.info('Info shelf file', info_path) + + if not os.path.exists(info_path): + logger.error('Info shelf file not found', info_path) + return {} + + # Read the shelf file and convert to a dictionary + with open(info_path, 'rb') as f: + shelf = pickle.load(f) + + infodict = {} + for (key,info) in shelf.items(): + # Remove a 'null' checksum indicated by a string of dashes + # (Directories do not have checksums.) + if info[3] and info[3][0] == '-': + info = info[:3] + ('',) + info[4:] + + if key == '': + infodict[dirpath_[:-1]] = info + else: + infodict[dirpath_[:lskip] + key] = info + + return infodict + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def write_infodict(pdsdir, infodict, limits={}, logger=None): + """Write a new info shelf file for a directory tree.""" + + # Initialize + dirpath = pdsdir.abspath + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing info file info for', dirpath, limits=limits) + + try: + (info_path, lskip) = pdsdir.shelf_path_and_lskip('info') + logger.info('Info shelf file', info_path) + + # Create parent directory if necessary + parent = os.path.split(info_path)[0] + if not os.path.exists(parent): + logger.info('Creating parent directory', parent) + os.makedirs(parent) + + # Write the pickle file + pickle_dict = {} + for (key, values) in infodict.items(): + short_key = key[lskip:] + pickle_dict[short_key] = values + + with open(info_path, 'wb') as f: + pickle.dump(pickle_dict, f) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + logger.open('Writing Python dictionary', dirpath, limits=limits) + try: + # Determine the maximum length of the file path + len_path = 0 + for (abspath, values) in infodict.items(): + len_path = max(len_path, len(abspath)) + + len_path -= lskip + + # Write the python dictionary version + python_path = info_path.rpartition('.')[0] + '.py' + name = os.path.basename(python_path) + parts = name.split('_') + name = '_'.join(parts[:2]) + '_info' + abspaths = list(infodict.keys()) + abspaths.sort() + + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for abspath in abspaths: + path = abspath[lskip:] + (nbytes, children, modtime, checksum, size) = infodict[abspath] + f.write(' "%s: ' % (path + '"' + (len_path-len(path)) * ' ')) + f.write('(%11d, %3d, ' % (nbytes, children)) + 
f.write('"%s", ' % modtime)
+                f.write('"%-33s, ' % (checksum + '"'))
+                f.write('(%4d,%4d)),\n' % size)
+
+            f.write('}\n\n')
+
+    except (Exception, KeyboardInterrupt) as e:
+        logger.exception(e)
+        raise
+
+    finally:
+        _ = logger.close()
+
+################################################################################
+
+def validate_infodict(pdsdir, dirdict, shelfdict, selection,
+                      limits={'normal': 0}, logger=None):
+
+    logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+    logger.replace_root(pdsdir.root_)
+
+    if selection:
+        logger.open('Validating file info for selection %s' % selection,
+                    pdsdir.abspath, limits=limits)
+    else:
+        logger.open('Validating file info for', pdsdir.abspath, limits=limits)
+
+    # Prune the shelf dictionary if necessary
+    if selection:
+        keys = list(shelfdict.keys())
+        full_path = os.path.join(pdsdir.abspath, selection)
+        for key in keys:
+            if key != full_path:
+                del shelfdict[key]
+
+    try:
+        keys = list(dirdict.keys())
+        for key in keys:
+            if key in shelfdict:
+                dirinfo = dirdict[key]
+                shelfinfo = shelfdict[key]
+
+                (bytes1, count1, modtime1, checksum1, size1) = dirinfo
+                (bytes2, count2, modtime2, checksum2, size2) = shelfinfo
+
+                # Truncate modtimes to seconds
+                modtime1 = modtime1.rpartition('.')[0]
+                modtime2 = modtime2.rpartition('.')[0]
+
+                agreement = True
+                if bytes1 != bytes2:
+                    logger.error('File size mismatch %d %d' %
+                                 (bytes1, bytes2), key)
+                    agreement = False
+
+                if count1 != count2:
+                    logger.error('Child count mismatch %d %d' %
+                                 (count1, count2), key)
+                    agreement = False
+
+                if modtime1 != modtime2:
+                    logger.error('Modification time mismatch "%s" "%s"' %
+                                 (modtime1, modtime2), key)
+                    agreement = False
+
+                if checksum1 != checksum2:
+                    logger.error('Checksum mismatch', key)
+                    agreement = False
+
+                if size1 != size2:
+                    logger.error('Display size mismatch', key)
+                    agreement = False
+
+                if agreement:
+                    logger.normal('File info matches', key)
+
+                del shelfdict[key]
+                del dirdict[key]
+
+        keys = list(dirdict.keys())
+        keys.sort()
+        for key in keys:
+            logger.error('Missing shelf info for', key)
+
+        keys = list(shelfdict.keys())
+        keys.sort()
+        for key in keys:
+            logger.error('Shelf info for missing file', key)
+
+    except (Exception, KeyboardInterrupt) as e:
+        logger.exception(e)
+        raise
+
+    finally:
+        return logger.close()
+
+################################################################################
+
+def move_old_info(shelf_file, logger=None):
+    """Move a file to the /logs/ directory tree and append a version number."""
+
+    if not os.path.exists(shelf_file): return
+
+    shelf_basename = os.path.basename(shelf_file)
+    (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename)
+
+    logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+    from_logged = False
+    for log_dir in LOGDIRS:
+        dest_template = log_dir + '/' + shelf_prefix + '_v???'
+ shelf_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(shelf_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(shelf_file, dest) + + if not from_logged: + logger.info('Info shelf file moved from: ' + shelf_file) + from_logged = True + + logger.info('Info shelf file moved to', dest) + + python_file = shelf_file.rpartition('.')[0] + '.py' + dest = dest.rpartition('.')[0] + '.py' + shutil.copy(python_file, dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file does not exist + if os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Info shelf file already exists', info_path) + return + + # Check selection + if selection: + logger.error('File selection is disallowed for task "initialize"', + selection) + return + + # Generate info + (infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + + # Save info file + write_infodict(pdsdir, infodict, logger=logger) + +def reinitialize(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Warn if shelf file does not exist + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Generate info + (infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + if not infodict: + return + + # Move old file if necessary + if os.path.exists(info_path): + move_old_info(info_path, logger=logger) + + # Save info file + write_infodict(pdsdir, infodict, logger=logger) + +def validate(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Info shelf file does not exist', info_path) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, _) = generate_infodict(pdsdir, selection, logger=logger) + + # Validate + validate_infodict(pdsdir, dir_infodict, shelf_infodict, selection=selection, + logger=logger) + +def repair(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, latest_modtime) = generate_infodict(pdsdir, selection, + logger=logger) + latest_iso = latest_modtime.replace(' ', 'T') + latest_datetime = 
datetime.datetime.fromisoformat(latest_iso) + + # For a single selection, use the old information + if selection: + key = list(dir_infodict.keys())[0] + value = dir_infodict[key] + dir_infodict = shelf_infodict.copy() + dir_infodict[key] = value + + # Compare + canceled = (dir_infodict == shelf_infodict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + info_pypath = info_path.replace('.pickle', '.py') + timestamp = min(os.path.getmtime(info_path), + os.path.getmtime(info_pypath)) + info_datetime = datetime.datetime.fromtimestamp(timestamp) + info_iso = info_datetime.isoformat(timespec='microseconds') + + if latest_iso > info_iso: + logger.info('!!! Info shelf file content is up to date', + info_path, force=True) + logger.info('!!! Latest holdings file modification date', + latest_iso, force=True) + logger.info('!!! Info shelf file modification date', + info_iso, force=True) + + delta = (latest_datetime - info_datetime).total_seconds() + if delta >= 86400/10: + logger.info('!!! Info shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Info shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(info_path) + os.utime(info_pypath) + logger.info('!!! Time tag on info shelf files set to', + dt.strftime('%Y-%m-%dT%H:%M:%S'), force=True) + else: + logger.info('!!! Info shelf file is up to date; repair canceled', + info_path, force=True) + return + + # Move files and write new info + move_old_info(info_path, logger=logger) + write_infodict(pdsdir, dir_infodict, logger=logger) + +def update(pdsdir, selection=None, logger=None): + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + # Make sure info shelf file exists + if not os.path.exists(info_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + if selection: + logger.error('Info shelf file does not exist', info_path) + else: + logger.warn('Info shelf file does not exist; initializing', + info_path) + initialize(pdsdir, selection=selection, logger=logger) + return + + # Read info shelf file + shelf_infodict = load_infodict(pdsdir, logger=logger) + + # Generate info + (dir_infodict, _) = generate_infodict(pdsdir, selection, shelf_infodict, + logger=logger) + + # Compare + canceled = (dir_infodict == shelf_infodict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! Info shelf file content is complete; update canceled', + info_path, force=True) + return + + # Write checksum file + move_old_info(info_path, logger=logger) + write_infodict(pdsdir, dir_infodict, logger=logger) + +################################################################################ +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdsinfoshelf: Create, maintain and validate shelf files ' + + 'containing basic information about each file.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create an infoshelf file for a bundle. Abort ' + + 'if the file already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create an infoshelf file for a bundle. Replace ' + + 'the file if it already exists. 
If a single ' + + 'file is specified, such as one archive file in ' + + 'a bundle set, then only information about that ' + + 'file is re-initialized.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every file in a bundle against the ' + + 'contents of its infoshelf file. If a single ' + + 'file is specified, such as an archive file in ' + + 'a bundle set, then only information about that ' + + 'file is validated') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every file in a bundle against the ' + + 'contents of its infoshelf file. If any file ' + + 'has changed, the infoshelf file is replaced. ' + + 'If a single file is specified, such as an ' + + 'archive file in a bundle set, then only ' + + 'information about that file is repaired. If any '+ + 'of the files checked are newer than the shelf ' + + 'file, update the shelf file\'s modification ' + + 'date.') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their information to the infoshelf file. ' + + 'Information about pre-existing files is not ' + + 'updated. If any of the files checked are newer ' + + 'than the shelf file, update the shelf file\'s ' + + 'modification date.') + + parser.add_argument('bundle', nargs='+', type=str, + help='The path to the root of the bundle or bundle ' + + 'set. For a bundle set, all the bundle ' + + 'directories inside it are handled in sequence.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdsinfoshelf" subdirectory of each log root ' + + 'directory.' 
+ ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + parser.add_argument('--archives', '-a', default=False, action='store_true', + help='Instead of referring to a bundle, refer to the ' + + 'the archive file for that bundle.') + + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdsinfoshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdsinfoshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Prepare the list of paths + abspaths = [] + for path in args.bundle: + + # Make sure path makes sense + path = os.path.abspath(path) + parts = path.partition('/pds4-holdings/') + if not parts[1]: + print('Not a holdings subdirectory: ' + path) + sys.exit(1) + + if parts[2].startswith('checksums-'): + print('No infoshelves for checksum files: ' + path) + sys.exit(1) + + # Convert to an archives path if necessary + if args.archives and not parts[2].startswith('archives-'): + path = parts[0] + '/pds4-holdings/archives-' + parts[2] + + # Convert to a list of absolute paths that exist (bundlsets or bundles) + try: + pdsf = pdsfile.Pds4File.from_abspath(path, must_exist=True) + abspaths.append(pdsf.abspath) + + except (ValueError, IOError): + # Allow a bundle name to stand in for a .tar.gz archive + (dir, basename) = os.path.split(path) + pdsdir = pdsfile.Pds4File.from_abspath(dir) + if pdsdir.archives_ and '.' not in basename: + if pdsdir.voltype_ == 'bundles/': + basename += '.tar.gz' + else: + basename += '_%s.tar.gz' % pdsdir.voltype_[:-1] + + newpaths = glob.glob(os.path.join(dir, basename)) + if len(newpaths) == 0: + raise + + abspaths += newpaths + continue + else: + raise + + # Generate a list of tuples (pdsfile, selection) + info = [] + for path in abspaths: + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.is_bundleset_dir: + # Info about archive directories is stored by bundleset + if pdsf.archives_: + info.append((pdsf, None)) + + # Others are checksumed by bundle + else: + children = [pdsf.child(c) for c in pdsf.childnames] + info += [(c, None) for c in children if c.isdir] + # "if c.isdir" is False for bundleset level readme files + + elif pdsf.is_bundle_dir: + # Shelve one bundle + info.append((pdsf, None)) + + elif pdsf.isdir: + print('Invalid directory for an infoshelf: ' + pdsf.logical_path) + sys.exit(1) + + else: + pdsdir = pdsf.parent() + if pdsf.is_bundle_file: + # Shelve one archive file + info.append((pdsdir, pdsf.basename)) + elif pdsdir.is_bundle_dir: + # Shelve one top-level file in bundle + info.append((pdsdir, pdsf.basename)) + else: + print('Invalid file for an infoshelf: ' + pdsf.logical_path) + sys.exit(1) + + # Open logger and loop through tuples... 
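+    # Each (pdsdir, selection) pair below gets its own sub-log: per-bundle (or
+    # per-bundleset) log files are opened, LOGDIRS is re-populated so that
+    # move_old_info() can copy superseded shelf files into the log directories,
+    # and the selected task function is dispatched.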
+ logger.open(' '.join(sys.argv)) + try: + for (pdsdir, selection) in info: + + info_path = pdsdir.shelf_path_and_lskip('info')[0] + + if selection: + pdsf = pdsdir.child(os.path.basename(selection)) + else: + pdsf = pdsdir + + # Save logs in up to two places + if pdsf.bundlename: + logfiles = set([pdsf.log_path_for_bundle('_info', + task=args.task, + dir='pdsinfoshelf'), + pdsf.log_path_for_bundle('_info', + task=args.task, + dir='pdsinfoshelf', + place='parallel')]) + else: + logfiles = set([pdsf.log_path_for_bundleset('_info', + task=args.task, + dir='pdsinfoshelf'), + pdsf.log_path_for_bundleset('_info', + task=args.task, + dir='pdsinfoshelf', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_info() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(info) > 1: + logger.blankline() + + if selection: + logger.open('Task "' + args.task + '" for selection ' + + selection, pdsdir.abspath, handler=local_handlers) + else: + logger.open('Task "' + args.task + '" for', pdsdir.abspath, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsdir, selection) + + elif args.task == 'reinitialize': + if selection: # don't erase everything else! + update(pdsdir, selection) + else: + reinitialize(pdsdir, selection) + + elif args.task == 'validate': + validate(pdsdir, selection) + + elif args.task == 'repair': + repair(pdsdir, selection) + + else: # update + update(pdsdir, selection) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + print(sys.exc_info()[2]) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: + status = 1 + + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py new file mode 100755 index 0000000..775c99a --- /dev/null +++ b/holdings_maintenance/pds4/pds4linkshelf.py @@ -0,0 +1,1219 @@ +#!/usr/bin/env python3 +################################################################################ +# # pdslinkshelf.py library and main program +# +# Syntax: +# pdslinkshelf.py --task path [path ...] +# +# Enter the --help option to see more information. +################################################################################ + +import argparse +import csv +import datetime +import glob +import os +import pickle +import re +import shutil +import sys + +import pdslogger +import pdsfile +import translator + +LOGNAME = 'pds.validation.links' +LOGROOT_ENV = 'PDS_LOG_ROOT' + +# Holds log file directories temporarily, used by move_old_links() +LOGDIRS = [] + +REPAIRS = translator.TranslatorByRegex([]) + +KNOWN_MISSING_LABELS = translator.TranslatorByRegex([]) + +# Match pattern for any file name, but possibly things that are not file names +PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?' 
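+# e.g. 'IMAGE_001.IMG' or "data_raw.xml" (surrounding quotes optional); group 1
+# captures the bare name. The regexes below compile this pattern with re.I, so
+# lower-case names match as well.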
+
+# Match pattern for the file name in anything of the form
+# "<file_name>file name</file_name>" in the PDS4 label
+TARGET_REGEX1 = re.compile(r'^ *\<file_name\>' + PATTERN + r'\<\/file_name\>',
+                           re.I)
+
+# Match pattern for a file name on a line by itself
+TARGET_REGEX2 = re.compile(r'^ *,? *' + PATTERN, re.I)
+
+# Match pattern for one or more file names embedded in a row of a text file.
+# A file name begins with a letter, followed by any number of letters, digits,
+# underscore or dash. Unless the name is "Makefile", it must have one or more
+# extensions, each containing one or more characters. It can also have any
+# number of directory prefixes separated by slashes.
+
+LINK_REGEX = re.compile(r'(?:|.*?[^/@\w\.])/?(?:\.\./)*(([A-Z0-9][-\w]+/)*' +
+                        r'(makefile\.?|[A-Z0-9][\w-]*(\.[\w-]+)+))', re.I)
+
+EXTS_WO_LABELS = set(['.XML', '.CAT', '.FMT', '.SFD'])
+
+################################################################################
+
+class LinkInfo(object):
+    """Used internally to describe a link within a specified record of a file.
+    """
+
+    def __init__(self, recno, linkname, is_target):
+
+        self.recno = recno          # record number
+        self.linktext = linkname    # substring within this record that looks
+                                    # like a link.
+        self.linkname = linkname    # link text after possible repair for known
+                                    # errors.
+        self.is_target = is_target  # True if, based on the local context, this
+                                    # might be a target of a label file
+        self.target = ''            # abspath to target of link, if any.
+                                    # If not blank, this file must exist.
+
+    def remove_path(self):
+        """Remove any leading directory path from this LinkInfo object."""
+
+        if '/' in self.linktext:
+            self.linktext = self.linktext.rpartition('/')[2]
+            self.linkname = self.linktext
+
+    def __str__(self):
+        return ('%d %s %s %s' % (self.recno, self.linktext, str(self.is_target),
+                                 self.target or '[' + self.linkname + ']'))
+
+def generate_links(dirpath, old_links={},
+                   limits={'info':-1, 'debug':500, 'ds_store':10}, logger=None):
+    """Generate a dictionary keyed by the absolute file path for files in the
+    given directory tree, which must correspond to a bundle.
+
+    Keys ending in .XML, .CAT and .TXT return a list of tuples
+        (recno, link, target)
+    for each link found. Here,
+        recno = record number in file;
+        link = the text of the link;
+        target = absolute path to the target of the link.
+
+    Other keys return a single string, which indicates the absolute path to the
+    label file describing this file.
+
+    Unlabeled files not ending in .XML, .CAT or .TXT return an empty string.
+
+    Also return the latest modification date among all the files checked.
+    """
+
+    dirpath = os.path.abspath(dirpath)
+    pdsdir = pdsfile.Pds4File.from_abspath(dirpath)
+
+    logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+    logger.replace_root(pdsdir.root_)
+    logger.open('Finding link shelf files', dirpath, limits)
+
+    try:
+
+        linkinfo_dict = old_links.copy()    # abspath: list of LinkInfo objects
+        label_dict = {k:v for k,v in old_links.items() if isinstance(v,str)}
+                                            # abspath: label for this file
+        abspaths = []                       # list of all abspaths
+
+        latest_mtime = 0.
+        collection_basename_dict = {}
+        # Walk the directory tree, one subdirectory "root" at a time...
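+        # For each directory visited below: the first loop records file
+        # basenames, tracks the latest modification time, and caches the LID
+        # basenames listed in any collection*.csv inventory; later loops use
+        # these to resolve links and decide which files need labels.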
+ for (root, dirs, files) in os.walk(dirpath): + + local_basenames = [] # Tracks the basenames in this directory + local_basenames_uc = [] # Same as above, but upper case + for basename in files: + abspath = os.path.join(root, basename) + latest_mtime = max(latest_mtime, os.path.getmtime(abspath)) + + if basename == '.DS_Store': # skip .DS_Store files + logger.ds_store('.DS_Store file skipped', abspath) + continue + + if basename.startswith('._'): # skip dot_underscore files + logger.dot_underscore('dot_underscore file skipped', + abspath) + continue + + if basename.startswith('.'): # skip invisible files + logger.invisible('Invisible file skipped', abspath) + continue + + # collection_basename_dict: a dictonary with the abspath of a collection + # csv file as the key and the set of basenames of its corresponding + # entries as the value. + # Create collection_basename_dict and use it to check whether a file + # is listed in the csv later. + if (basename.startswith('collection') and + basename.endswith('.csv') and + not abspath in collection_basename_dict): + logger.debug('Construct collection basename dictionary from', abspath) + csv_basenames = set() + with open(abspath, 'r') as file: + csv_lines = csv.reader(file) + for line in csv_lines: + # skip the empty line + if not line: + continue + if '::' in line[-1]: + lid = line[-1].rpartition('::')[0] + else: + lid = line[-1] + csv_basename = lid.rpartition(':')[-1] + csv_basenames.add(csv_basename) + + collection_basename_dict[abspath] = csv_basenames + + abspaths.append(abspath) + local_basenames.append(basename) + local_basenames_uc.append(basename.upper()) + + local_labels = [f for f in local_basenames if '.xml' in f] + local_labels_abspath = [os.path.join(root, f) for f in local_labels] + + # Update linkinfo_dict, searching each relevant file for possible links. + # If the linking file is a label and the target file has a matching + # name, update the label_dict entry for the target. + candidate_labels = {} # {target: list of possible label basenames} + for basename in local_basenames: + + abspath = os.path.join(root, basename) + if abspath in linkinfo_dict: # for update op, skip existing links + continue + + basename_uc = basename.upper() + + # Only check XML, CAT etc. 
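+                # i.e. only files whose extensions appear in EXTS_WO_LABELS
+                # (.XML, .CAT, .FMT, .SFD) are scanned for outgoing links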
+ ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext not in EXTS_WO_LABELS: + continue + + # Get list of link info for all possible linked filenames + logger.info('*** Get link info and review', abspath) + linkinfo_list = read_links(abspath, logger=logger) + + # Apply repairs + repairs = REPAIRS.all(abspath) + for info in linkinfo_list: + for repair in repairs: + linkname = repair.first(info.linktext) + if linkname is None: + + # Attempt repair with leading directory path removed + if '/' in info.linktext: + info.remove_path() + linkname = repair.first(info.linktext) + + if linkname is None: + continue # no repair found + + info.linkname = linkname + if linkname == '': + logger.info('Ignoring link "%s"' % + info.linktext, abspath, force=True) + else: + logger.info('Repairing link "%s"->"%s"' % + (info.linktext, linkname), + abspath, force=True) + + # Validate non-local targets of repairs + if '/' in linkname: + target = os.path.join(root, linkname) + if os.path.exists(target): + info.target = os.path.abspath(target) + else: + logger.error('Target of repaired link is missing', target) + + break # apply only one repair per found link + + # Validate or remove other targets + new_linkinfo_list = [] + baseroot_uc = basename_uc.partition('.')[0] + ltest = len(baseroot_uc) + for info in linkinfo_list: + if info.target: # Non-local, repaired links have targets + new_linkinfo_list.append(info) + continue + + # A blank linkname is from a repair; indicates to ignore + if info.linkname == '': + continue + + # Ignore self-references + linkname_uc = info.linkname.upper() + if linkname_uc == basename_uc: + continue + + # Check for target inside this directory + try: + match_index = local_basenames_uc.index(linkname_uc) + except ValueError: + match_index = None + + # If not found, maybe it is a non-local reference (.FMT perhaps) + if match_index is None: + + # It's easy to pick up floats as link candidates; ignore + try: + _ = float(info.linkname) + continue # Yup, it's just a float + except ValueError: + pass + + if info.linkname[-1] in ('e', 'E'): + try: + _ = float(info.linkname[:-1]) + continue # Float with exponent + except ValueError: + pass + + # Also ignore format specifications (e.g., "F10.3") + if info.linkname[0] in ('F', 'E', 'G'): + try: + _ = float(info.linkname[1:]) + continue # Format + except ValueError: + pass + + # Search non-locally + if '/' in info.linkname: + nonlocal_target = locate_link_with_path(abspath, + info.linkname) + else: + nonlocal_target = locate_nonlocal_link(abspath, + info.linkname) + + # Report the outcome + if nonlocal_target: + logger.debug('Located "%s"' % info.linkname, + nonlocal_target) + info.target = nonlocal_target + new_linkinfo_list.append(info) + continue + + if linkname_uc.endswith('.FMT'): + logger.error('Unable to locate .FMT file "%s"' % + info.linkname, abspath) + elif linkname_uc.endswith('.CAT'): + logger.error('Unable to locate .CAT file "%s"' % + info.linkname, abspath) + else: + logger.debug('Substring "%s" is not a link, ignored' % + info.linkname, abspath) + + continue + + # Save the match + info.linkname = local_basenames[match_index] # update case + info.target = os.path.join(root, info.linkname) + new_linkinfo_list.append(info) + + # Could this be the label? 
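+                    # Only an .XML file can serve as a label. If the target's
+                    # name matches this file's name up to the extension, the
+                    # label is assigned immediately; otherwise this file is
+                    # only recorded as a candidate label for the target.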
+ if ext != '.XML': # nope + continue + + # If names match up to '.XML', then yes + if (len(linkname_uc) > ltest and + linkname_uc[:ltest] == baseroot_uc and + linkname_uc[ltest] == '.'): + label_dict[info.target] = abspath + logger.info('Label identified (by name) for %s' % + info.linkname, abspath) + continue + + # Otherwise, then maybe + if info.is_target: + if info.linkname in candidate_labels: + if basename not in candidate_labels[info.linkname]: + candidate_labels[info.linkname].append(basename) + else: + candidate_labels[info.linkname] = [basename] + + logger.debug('Candidate label found for ' + + info.linkname, abspath) + + linkinfo_dict[abspath] = new_linkinfo_list + + parent_root = root.rpartition('/')[0] + local_collection_csv_prefix = f'{root}/collection' + parent_collection_csv_prefix = f'{parent_root}/collection' + + # Identify labels for files + for basename in local_basenames: + + basename_uc = basename.upper() + ext = basename_uc[-4:] if len(basename) >= 4 else '' + if ext in (".XML", ".FMT"): # these can't have labels + continue + + abspath = os.path.join(root, basename) + + if abspath in label_dict: + logger.info('Label already found for %s' % abspath) + continue # label already found + + # linkinfo_dict: a dictionary with the abspath of a label file as the key + # and a list of its corresponding files (LinkInfo objects) under file_name + # tags as the value. + # label_dict: a dictionary with the abspath of a file as the key and the + # abspath of its corresponding label as the value. + # At the current directory, if a file basename is in the list of a label's + # (in same directory) file_name tags in linkinfo_dict, create an entry of + # that file basename in label_dict. This will make sure the file is + # pointing to its correct corresponding label. + is_label_found = False + for label_abspath, link_info_list in linkinfo_dict.items(): + + # if the label is not at the same directory, skip it. + if label_abspath not in local_labels_abspath: + continue + + for info in link_info_list: + if info.linktext == basename and abspath not in label_dict: + label_dict[abspath] = label_abspath + logger.info('Label identified (by file_name tag) for %s' % + info.linktext, label_abspath) + is_label_found = True + break + if is_label_found: + break + + # label found by searching linkinfo_dict + if is_label_found: + continue + + # Maybe we already know the label is missing + test = KNOWN_MISSING_LABELS.first(abspath) + if test == 'unneeded': + logger.debug('Label is not neeeded', abspath) + continue + + if test == 'missing': + logger.debug('Label is known to be missing', abspath) + continue + + # Determine if a label is required + label_is_required = (ext not in EXTS_WO_LABELS) + + # Get the list of candidate labels in this directory + candidates = candidate_labels.get(basename, []) + + # Determine if the obvious label file exists + label_guess_uc = basename_uc.partition('.')[0] + '.XML' + if label_guess_uc in local_basenames_uc: + k = local_basenames_uc.index(label_guess_uc) + obvious_label_basename = local_basenames[k] + else: + obvious_label_basename = '' + + # Simplest case... + if obvious_label_basename in candidates: + if not label_is_required: + logger.debug('Unnecessary label found', abspath, force=True) + + label_abspath = os.path.join(root, obvious_label_basename) + label_dict[abspath] = label_abspath + logger.info('Label found for %s' % abspath, label_abspath) + continue + + # More cases... 
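+                # The simple name match failed. Fall back on the candidate
+                # labels gathered from file_name references; files absent from
+                # the collection*.csv inventory (e.g. checksum or errata files)
+                # are not part of the archive and need no label.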
+ if not label_is_required: + continue # leave abspath out of label_dict + + # Report a phantom label + if obvious_label_basename: + logger.error('Label %s does not point to file' % + local_basenames[k], abspath) + + if len(candidates) == 1: + logger.debug('Label found as ' + candidates[0], abspath, + force=True) + label_dict[abspath] = os.path.join(root, candidates[0]) + continue + + # Before raising an error, check this: + # For files like errata.txt, or checksum files that don't exist in the + # label nor exist in the csv, they are not part of the archive, so they + # don't have labels + is_basename_in_csv = False + logger.info('Check if %s is in the collection csv' % basename) + for col_abspath, csv_basenames in collection_basename_dict.items(): + if (col_abspath.startswith(parent_collection_csv_prefix) or + col_abspath.startswith(local_collection_csv_prefix)): + if basename.rpartition('.')[0] in csv_basenames: + is_basename_in_csv = True + break + + if not is_basename_in_csv: + continue + + # or errors... + label_dict[abspath] = "" + if len(candidates) == 0: + logger.error('Label is missing', abspath) + else: + logger.error('Ambiguous label found as %s' % candidates[0], + abspath, force=True) + for candidate in candidates[1:]: + logger.debug('Alternative label found as %s' % candidate, + abspath, force=True) + + # Merge the dictionaries + # There are cases where a file can have both a list of links and a label. + # This occurs when a .TXT or .CAT file has a label, even though it didn't + # need one. In the returned dictionary, link lists take priority. + link_dict = {} + + for key in abspaths: + if key in linkinfo_dict: + # If this is a new entry, it's a list of LinkInfo objects + # If this was copied from old_links, it's already a list of tuples + values = linkinfo_dict[key] + if isinstance(values, list): + new_list = [] + for item in values: + if isinstance(item, LinkInfo): + new_list.append((item.recno, item.linktext, item.target)) + else: + new_list.append(item) + link_dict[key] = new_list + else: + link_dict[key] = values + elif key in label_dict: + link_dict[key] = label_dict[key] + else: + link_dict[key] = '' + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('Lastest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + return (link_dict, latest_mtime) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +def read_links(abspath, logger=None): + """Return a list of LinkInfo objects for anything linked or labeled by this + file. + """ + + with open(abspath, 'r', encoding='latin-1') as f: + recs = f.readlines() + + links = [] + multiple_targets = False + for recno,rec in enumerate(recs): + + while True: + + # Search for the target of a link + is_target = True + matchobj = TARGET_REGEX1.match(rec) + + if matchobj: + subrec = rec[:matchobj.end()] + if '(' in subrec or '{' in subrec: + multiple_targets = True + + # ... on the same line or the next line + elif multiple_targets: + matchobj = TARGET_REGEX2.match(rec) + + # No more matches in this record + if not matchobj: + break + + linktext = matchobj.group(1) + links.append(LinkInfo(recno, linktext, is_target)) + + rec = rec[matchobj.end():] + + return links + +def locate_nonlocal_link(abspath, filename): + """Return the absolute path associated with a link in a PDS file. 
This is + done by searching up the tree and also by looking inside the LABEL, + CATALOG and INCLUDE directories if they exist.""" + + filename_uc = filename.upper() + + parts = abspath.split('/')[:-1] + + # parts are [..., 'holdings', 'volumes', volset, volname, ...] + # Therefore, if 'holdings' is in parts[:-3], then there's a volname in this + # path. + while 'pds4-holdings' in parts[:-3]: + testpath = '/'.join(parts) + basenames = os.listdir(testpath) + basenames_uc = [b.upper() for b in basenames] + try: + k = basenames_uc.index(filename_uc) + return testpath + '/' + basenames[k] + except ValueError: + pass + + for dirname in ['LABEL', 'CATALOG', 'INCLUDE', 'INDEX', 'DOCUMENT', + 'DATA', 'CALIB', 'EXTRAS', 'SOFTWARE']: + try: + k = basenames_uc.index(dirname) + subnames = os.listdir(testpath + '/' + basenames[k]) + subupper = [s.upper() for s in subnames] + try: + kk = subupper.index(filename_uc) + return testpath + '/' + basenames[k] + '/' + subnames[kk] + except ValueError: + pass + except ValueError: + pass + + parts = parts[:-1] + + return '' + +def locate_link_with_path(abspath, filename): + """Return the absolute path associated with a link that contains a leading + directory path. + """ + + parts = filename.split('/') + link_path = locate_nonlocal_link(abspath, parts[0]) + if not link_path: + return '' + + for part in parts[1:]: + basenames = os.listdir(link_path) + if part in basenames: + link_path += '/' + part + else: + basenames_uc = [b.upper() for b in basenames] + part_uc = part.upper() + if part_uc in basenames_uc: + k = basenames_uc.index(part_uc) + link_path += '/' + basenames[k] + else: + return '' + + return link_path + +################################################################################ + +def load_links(dirpath, limits={}, logger=None): + """Load link dictionary from a shelf file, converting interior paths to + absolute paths.""" + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + dirpath_ = dirpath.rstrip('/') + '/' + + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Reading link shelf file for', dirpath, limits) + + try: + (link_path, lskip) = pdsdir.shelf_path_and_lskip('link') + prefix_ = pdsdir.volume_abspath() + '/' + + logger.info('Link shelf file', link_path) + + if not os.path.exists(link_path): + raise IOError('File not found: ' + link_path) + + # Read the shelf file and convert to a dictionary + with open(link_path, 'rb') as f: + interior_dict = pickle.load(f) + + # Convert interior paths to absolute paths + link_dict = {} + for (key, values) in interior_dict.items(): + long_key = dirpath_ + key + + if isinstance(values, list): + new_list = [] + for (recno, basename, interior_path) in values: + abspath = dirpath_ + str(interior_path) + if '../' in abspath: + abspath = os.path.abspath(abspath) + + new_list.append((recno, str(basename), abspath)) + + link_dict[long_key] = new_list + else: + values = str(values) + if values == '': + link_dict[long_key] = '' + else: + link_dict[long_key] = dirpath_ + values + + return link_dict + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def write_linkdict(dirpath, link_dict, limits={}, logger=None): + """Write a new link shelf file for a directory tree.""" + + # Initialize + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) 
+ + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Writing link shelf file for', dirpath, limits) + + try: + (link_path, lskip) = pdsdir.shelf_path_and_lskip('link') + logger.info('Link shelf file', link_path) + + # Create a dictionary using interior paths instead of absolute paths + interior_dict = {} + prefix = (dirpath + '/')[:lskip] + for (key, values) in link_dict.items(): + if isinstance(values, list): + new_list = [] + for (basename, recno, link_abspath) in values: + if link_abspath[:lskip] == prefix: + new_list.append((basename, recno, link_abspath[lskip:])) + else: # link outside this volume + link = pdsfile.Pds4File.from_abspath(link_abspath) + if (link.category_ == pdsdir.category_ and + link.bundleset == pdsdir.bundleset and + link.suffix == pdsdir.suffix): + link_relpath = '../' + link.bundlename_ + link.interior + elif link.category_ == pdsdir.category_: + link_relpath = ('../../' + link.bundleset_ + + link.bundlename_ + link.interior) + else: + link_relpath = ('../../../' + link.category_ + + link.bundleset_ + + link.bundlename_ + link.interior) + new_list.append((basename, recno, link_relpath)) + + interior_dict[key[lskip:]] = new_list + else: + interior_dict[key[lskip:]] = values[lskip:] + + # Create parent directory if necessary + parent = os.path.split(link_path)[0] + if not os.path.exists(parent): + logger.normal('Creating directory', parent) + os.makedirs(parent) + + # Write the shelf + with open(link_path, 'wb') as f: + pickle.dump(interior_dict, f) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + logger.open('Writing Python dictionary', dirpath) + try: + # Determine the maximum length of the file path and basename + len_key = 0 + len_base = 0 + for (key, value) in interior_dict.items(): + len_key = max(len_key, len(key)) + if isinstance(value, list): + tuples = value + for (recno, basename, interior_path) in tuples: + len_base = max(len_base, len(basename)) + + len_key = min(len_key, 60) + + # Write the python dictionary version + python_path = link_path.rpartition('.')[0] + '.py' + name = os.path.basename(python_path) + parts = name.split('_') + name = '_'.join(parts[:2]) + '_links' + keys = list(interior_dict.keys()) + keys.sort() + + with open(python_path, 'w', encoding='latin-1') as f: + f.write(name + ' = {\n') + for valtype in (list, str): + for key in keys: + if not isinstance(interior_dict[key], valtype): continue + + f.write(' "%s"' % key) + if len(key) < len_key: + f.write((len_key - len(key)) * ' ') + f.write(': ') + tuple_indent = max(len(key),len_key) + 7 + + values = interior_dict[key] + if isinstance(values, str): + f.write('"%s",\n' % values) + elif len(values) == 0: + f.write('[],\n') + else: + f.write('[') + for k in range(len(values)): + (recno, basename, interior_path) = values[k] + f.write('(%4d, ' % recno) + f.write('"%s, ' % (basename + '"' + + (len_base-len(basename)) * ' ')) + f.write('"%s")' % interior_path) + + if k < len(values) - 1: + f.write(',\n' + tuple_indent * ' ') + else: + f.write('],\n') + + f.write('}\n\n') + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + +################################################################################ + +def validate_links(dirpath, dirdict, shelfdict, limits={}, logger=None): + + dirpath = os.path.abspath(dirpath) + pdsdir = pdsfile.Pds4File.from_abspath(dirpath) + + logger = logger or 
pdslogger.PdsLogger.get_logger(LOGNAME) + logger.replace_root(pdsdir.root_) + logger.open('Validating link shelf file for', dirpath, limits=limits) + + try: + keys = list(dirdict.keys()) + for key in keys: + if key in shelfdict: + dirinfo = dirdict[key] + shelfinfo = shelfdict[key] + + if type(dirinfo) == list: + dirinfo.sort() + + if type(shelfinfo) == list: + shelfinfo.sort() + + if dirinfo != shelfinfo: + logger.error('Link target mismatch', key) + + del shelfdict[key] + del dirdict[key] + + keys = list(dirdict.keys()) + keys.sort() + for key in keys: + logger.error('Missing link shelf file entry for', key) + + keys = list(shelfdict.keys()) + keys.sort() + for key in keys: + logger.error('Link shelf file entry found for missing file', key) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + return logger.close() + +################################################################################ + +def move_old_links(shelf_file, logger=None): + """Move a file to the /logs/ directory tree and append a time tag.""" + + if not os.path.exists(shelf_file): return + + shelf_basename = os.path.basename(shelf_file) + (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename) + + if logger is None: + logger = pdslogger.PdsLogger.get_logger(LOGNAME) + + from_logged = False + for log_dir in LOGDIRS: + dest_template = log_dir + '/' + shelf_prefix + '_v???' + shelf_ext + version_paths = glob.glob(dest_template) + + max_version = 0 + lskip = len(shelf_ext) + for version_path in version_paths: + version = int(version_path[-lskip-3:-lskip]) + max_version = max(max_version, version) + + new_version = max_version + 1 + dest = dest_template.replace('???', '%03d' % new_version) + shutil.copy(shelf_file, dest) + + if not from_logged: + logger.info('Link shelf file moved from: ' + shelf_file) + from_logged = True + + logger.info('Link shelf file moved to ' + dest) + + python_src = shelf_file.rpartition('.')[0] + '.py' + python_dest = dest.rpartition('.')[0] + '.py' + shutil.copy(python_src, python_dest) + + pickle_src = shelf_file.rpartition('.')[0] + '.pickle' + pickle_dest = dest.rpartition('.')[0] + '.pickle' + shutil.copy(pickle_src, pickle_dest) + +################################################################################ +# Simplified functions to perform tasks +################################################################################ + +def initialize(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file does not exist + if os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Link shelf file already exists', link_path) + return + + # Generate link info + (link_dict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Move old file if necessary + if os.path.exists(link_path): + move_old_links(link_path, logger=logger) + + # Save link files + write_linkdict(pdsdir.abspath, link_dict, logger=logger) + +def reinitialize(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Warn if shelf file does not exist + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + initialize(pdsdir, logger=logger) + return + + # Generate link info + (link_dict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Move old file if necessary + if os.path.exists(link_path): + move_old_links(link_path, logger=logger) + + # Save 
link files + write_linkdict(pdsdir.abspath, link_dict, logger=logger) + +def validate(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.error('Link shelf file does not exist', link_path) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, _) = generate_links(pdsdir.abspath, logger=logger) + + # Validate + validate_links(pdsdir.abspath, dir_linkdict, shelf_linkdict, logger=logger) + +def repair(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, latest_mtime) = generate_links(pdsdir.abspath, logger=logger) + + # Compare + canceled = (dir_linkdict == shelf_linkdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + + link_pypath = link_path.replace('.pickle', '.py') + link_mtime = min(os.path.getmtime(link_path), + os.path.getmtime(link_pypath)) + if latest_mtime > link_mtime: + logger.info('!!! Link shelf file content is up to date', + link_path, force=True) + + dt = datetime.datetime.fromtimestamp(latest_mtime) + logger.info('!!! Latest holdings file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + dt = datetime.datetime.fromtimestamp(link_mtime) + logger.info('!!! Link shelf file modification date', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + + delta = latest_mtime - link_mtime + if delta >= 86400/10: + logger.info('!!! Link shelf file is out of date %.1f days' % + (delta / 86400.), force=True) + else: + logger.info('!!! Link shelf file is out of date %.1f minutes' % + (delta / 60.), force=True) + + dt = datetime.datetime.now() + os.utime(link_path) + os.utime(link_pypath) + logger.info('!!! Time tag on link shelf files set to', + dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True) + else: + logger.info(f'!!! Link shelf file is up to date; repair canceled', + link_path, force=True) + return + + # Move files and write new links + move_old_links(link_path, logger=logger) + write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger) + +def update(pdsdir, logger=None): + + link_path = pdsdir.shelf_path_and_lskip('link')[0] + + # Make sure link shelf file exists + if not os.path.exists(link_path): + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.warn('Link shelf file does not exist; initializing', link_path) + initialize(pdsdir, logger=logger) + return + + # Read link shelf file + shelf_linkdict = load_links(pdsdir.abspath, logger=logger) + + # Generate link dict + (dir_linkdict, + latest_mtime) = generate_links(pdsdir.abspath, shelf_linkdict, + logger=logger) + + # Compare + canceled = (dir_linkdict == shelf_linkdict) + if canceled: + logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME) + logger.info('!!! 
Link shelf file content is complete; update canceled', + link_path, force=True) + return + + # Move files and write new links + move_old_links(link_path, logger=logger) + write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger) + +################################################################################ + +def main(): + + # Set up parser + parser = argparse.ArgumentParser( + description='pdslinkshelf: Create, maintain and validate shelves of ' + + 'links between files.') + + parser.add_argument('--initialize', '--init', const='initialize', + default='', action='store_const', dest='task', + help='Create a link shelf file for a volume. Abort ' + + 'if the checksum file already exists.') + + parser.add_argument('--reinitialize', '--reinit', const='reinitialize', + default='', action='store_const', dest='task', + help='Create a link shelf file for a volume. Replace ' + + 'the file if it already exists.') + + parser.add_argument('--validate', const='validate', + default='', action='store_const', dest='task', + help='Validate every link in a volume directory tree ' + + 'against its link shelf file.') + + parser.add_argument('--repair', const='repair', + default='', action='store_const', dest='task', + help='Validate every link in a volume directory tree ' + + 'against its link shelf file. If any ' + + 'disagreement is found, replace the shelf ' + + 'file; otherwise leave it unchanged. If any of ' + + 'the files checked are newer than the link shelf '+ + 'file, update shelf file\'s modification date') + + parser.add_argument('--update', const='update', + default='', action='store_const', dest='task', + help='Search a directory for any new files and add ' + + 'their links to the link shelf file. Links of ' + + 'pre-existing files are not checked.') + + parser.add_argument('bundle', nargs='+', type=str, + help='The path to the root directory of a bundle.') + + parser.add_argument('--log', '-l', type=str, default='', + help='Optional root directory for a duplicate of the ' + + 'log files. If not specified, the value of ' + + 'environment variable "%s" ' % LOGROOT_ENV + + 'is used. In addition, individual logs are ' + + 'written into the "logs" directory parallel to ' + + '"holdings". Logs are created inside the ' + + '"pdslinkshelf" subdirectory of each log root ' + + 'directory.' 
+ ) + + parser.add_argument('--quiet', '-q', action='store_true', + help='Do not also log to the terminal.') + + # Parse and validate the command line + args = parser.parse_args() + + if not args.task: + print('pdslinkshelf error: Missing task') + sys.exit(1) + + status = 0 + + # Define the logging directory + if args.log == '': + try: + args.log = os.environ[LOGROOT_ENV] + except KeyError: + args.log = None + + # Initialize the logger + logger = pdslogger.PdsLogger(LOGNAME) + pdsfile.Pds4File.set_log_root(args.log) + + if not args.quiet: + logger.add_handler(pdslogger.stdout_handler) + + if args.log: + path = os.path.join(args.log, 'pdslinkshelf') + warning_handler = pdslogger.warning_handler(path) + logger.add_handler(warning_handler) + + error_handler = pdslogger.error_handler(path) + logger.add_handler(error_handler) + + # Generate a list of file paths before logging + paths = [] + for path in args.bundle: + + if not os.path.exists(path): + print('No such file or directory: ' + path) + sys.exit(1) + + path = os.path.abspath(path) + pdsf = pdsfile.Pds4File.from_abspath(path) + + if pdsf.checksums_: + print('No link shelf files for checksum files: ' + path) + sys.exit(1) + + if pdsf.archives_: + print('No link shelf files for archive files: ' + path) + sys.exit(1) + + if pdsf.is_bundleset_dir: + paths += [os.path.join(path, c) for c in pdsf.childnames] + + else: + paths.append(os.path.abspath(path)) + + # Loop through tuples... + logger.open(' '.join(sys.argv)) + try: + for path in paths: + + pdsdir = pdsfile.Pds4File.from_abspath(path) + # skip volset-level readme files and *_support dirctiory + # if not pdsdir.isdir or '_support' in pdsdir.abspath: + if not pdsdir.isdir: + continue + + # Save logs in up to two places + logfiles = set([pdsdir.log_path_for_bundle('_links', + task=args.task, + dir='pdslinkshelf'), + pdsdir.log_path_for_bundle('_links', + task=args.task, + dir='pdslinkshelf', + place='parallel')]) + + # Create all the handlers for this level in the logger + local_handlers = [] + LOGDIRS = [] # used by move_old_links() + for logfile in logfiles: + local_handlers.append(pdslogger.file_handler(logfile)) + logdir = os.path.split(logfile)[0] + LOGDIRS.append(os.path.split(logfile)[0]) + + # These handlers are only used if they don't already exist + warning_handler = pdslogger.warning_handler(logdir) + error_handler = pdslogger.error_handler(logdir) + local_handlers += [warning_handler, error_handler] + + # Open the next level of the log + if len(paths) > 1: + logger.blankline() + + logger.open('Task "' + args.task + '" for', path, + handler=local_handlers) + + try: + for logfile in logfiles: + logger.info('Log file', logfile) + + if args.task == 'initialize': + initialize(pdsdir) + + elif args.task == 'reinitialize': + reinitialize(pdsdir) + + elif args.task == 'validate': + validate(pdsdir) + + elif args.task == 'repair': + repair(pdsdir) + + else: # update + update(pdsdir) + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + raise + + finally: + _ = logger.close() + + except (Exception, KeyboardInterrupt) as e: + logger.exception(e) + status = 1 + raise + + finally: + (fatal, errors, warnings, tests) = logger.close() + if fatal or errors: status = 1 + + sys.exit(status) + +if __name__ == '__main__': + main() diff --git a/pdsfile/pds3file/__init__.py b/pdsfile/pds3file/__init__.py index 255d449..e8591b1 100644 --- a/pdsfile/pds3file/__init__.py +++ b/pdsfile/pds3file/__init__.py @@ -11,6 +11,7 @@ from pdsfile.pdsfile import PdsFile from . 
import rules from pdsfile.preload_and_cache import cache_lifetime_for_class +import re class Pds3File(PdsFile): @@ -87,6 +88,9 @@ class Pds3File(PdsFile): OPUS_ID_TO_SUBCLASS = rules.OPUS_ID_TO_SUBCLASS FILESPEC_TO_BUNDLESET = rules.FILESPEC_TO_BUNDLESET + IDX_EXT = '.tab' + LBL_EXT = '.lbl' + def __init__(self): super().__init__() diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py index c4c6c66..01f326f 100644 --- a/pdsfile/pds4file/__init__.py +++ b/pdsfile/pds4file/__init__.py @@ -13,7 +13,16 @@ class Pds4File(PdsFile): - BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$') + PDS_HOLDINGS = 'pds4-holdings' + BUNDLE_DIR_NAME = 'bundles' + + # TODO: Generalize PDS4 bundlenames in the future once we have more bundles + # REGEX + BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|' + + r'^cassini_iss.*|' + + r'^cassini_vims.*|' + + r'^cassini_uvis.*)$') + BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + r'(_v[0-9]+\.[0-9]+\.[0-9]+|' + r'_v[0-9]+\.[0-9]+|_v[0-9]+|' + @@ -22,15 +31,18 @@ class Pds4File(PdsFile): r'((|_calibrated|_diagrams|_metadata|_previews)' + r'(|_md5\.txt|\.tar\.gz))$') BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) + BUNDLENAME_REGEX = re.compile(r'^([a-zA-z\_].+)$') - BUNDLENAME_REGEX = re.compile(r'((^uranus_occ_u\d{0,4}._[a-z]*_(fos|\d{2,3}cm))'+ - r'|(^cassini\_[a-z]{3,4}\_cruise))$') + BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) - - PDS_HOLDINGS = 'pds4-holdings' - BUNDLE_DIR_NAME = 'bundles' + BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + + r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ + r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ + r'_in_prep|_prelim|_peer_review|'+ + r'_lien_resolution)$') + BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) # Logger LOGGER = pdslogger.NullLogger() @@ -68,6 +80,9 @@ class Pds4File(PdsFile): LOCAL_PRELOADED = [] SUBCLASSES = {} + IDX_EXT = '.csv' + LBL_EXT = '.xml' + def __init__(self): super().__init__() diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py index 50c710f..0a3cebd 100644 --- a/pdsfile/pdsfile.py +++ b/pdsfile/pdsfile.py @@ -259,30 +259,6 @@ class PdsFile(object): VIEWABLE_EXTS = set(['jpg', 'png', 'gif', 'tif', 'tiff', 'jpeg', 'jpeg_small']) DATAFILE_EXTS = set(['dat', 'img', 'cub', 'qub', 'fit', 'fits']) - # REGEX - BUNDLESET_REGEX = re.compile(r'^([A-Z][A-Z0-9x]{1,5}_[0-9x]{3}x)$') - BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I) - BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution|)' + - r'((|_calibrated|_diagrams|_metadata|_previews)' + - r'(|_md5\.txt|\.tar\.gz))$') - BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I) - - BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$') - BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I) - BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$') - BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I) - BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] + - r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+ - r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+ - 
r'_in_prep|_prelim|_peer_review|'+ - r'_lien_resolution)$') - BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I) - CATEGORY_REGEX = re.compile(r'^(|checksums\-)(|archives\-)(\w+)$') CATEGORY_REGEX_I = re.compile(CATEGORY_REGEX.pattern, re.I) @@ -1320,11 +1296,11 @@ def os_path_exists(cls, abspath, force_case_sensitive=False): return os.path.exists(abspath) # Handle index rows - if '.tab/' in abspath: - parts = abspath.partition('.tab/') - if not cls.os_path_exists(parts[0] + '.tab'): + if f'{cls.IDX_EXT}/' in abspath: + parts = abspath.partition(f'{cls.IDX_EXT}/') + if not cls.os_path_exists(parts[0] + cls.IDX_EXT): return False - pdsf = cls.from_abspath(parts[0] + '.tab') + pdsf = cls.from_abspath(parts[0] + cls.IDX_EXT) return (pdsf.exists and pdsf.child_of_index(parts[2], flag='').exists) @@ -1865,14 +1841,14 @@ def indexshelf_abspath(self): cls = type(self) if self._indexshelf_abspath is None: - if self.extension not in ('.tab', '.TAB'): + if self.extension not in (cls.IDX_EXT, cls.IDX_EXT.upper()): self._indexshelf_abspath = '' else: abspath = self.abspath abspath = abspath.replace(f'/{cls.PDS_HOLDINGS}/', f'/{cls.PDS_HOLDINGS}/_indexshelf-') - abspath = abspath.replace('.tab', '.pickle') - abspath = abspath.replace('.TAB', '.pickle') + abspath = abspath.replace(cls.IDX_EXT, '.pickle') + abspath = abspath.replace(cls.IDX_EXT.upper(), '.pickle') self._indexshelf_abspath = abspath self._recache() @@ -1885,6 +1861,7 @@ def is_index(self): presence of the corresponding indexshelf file. """ + cls = type(self) if self._is_index is None: abspath = self.indexshelf_abspath if abspath and os.path.exists(abspath): @@ -1895,7 +1872,7 @@ def is_index(self): # file is being created. # XXX This is a real hack and should be looked at again later if ('/metadata/' in self.abspath - and self.abspath.lower().endswith('.tab')): + and self.abspath.lower().endswith(cls.IDX_EXT)): return True # this value is not cached self._is_index = False @@ -1911,9 +1888,11 @@ def index_pdslabel(self): if not self.is_index: return None + cls = type(self) if self._index_pdslabel is None: - label_abspath = self.abspath.replace ('.tab', '.lbl') - label_abspath = label_abspath.replace('.TAB', '.LBL') + label_abspath = self.abspath.replace (cls.IDX_EXT, cls.LBL_EXT) + label_abspath = label_abspath.replace(cls.IDX_EXT.upper(), + cls.LBL_EXT.upper()) try: self._index_pdslabel = pdsparser.PdsLabel.from_file(label_abspath) except: @@ -2626,9 +2605,9 @@ def label_basename(self): # Take a first guess at the label filename; PDS3 only! if self.extension.isupper(): - ext_guesses = ('.LBL', '.lbl') + ext_guesses = (cls.LBL_EXT.upper(), cls.LBL_EXT) else: - ext_guesses = ('.lbl', '.LBL') + ext_guesses = (cls.LBL_EXT, cls.LBL_EXT.upper()) rootname = self.basename[:-len(self.extension)] test_basenames = [rootname + ext for ext in ext_guesses] @@ -4832,7 +4811,7 @@ def checksum_path_and_lskip(self): raise ValueError('No checksums of checksum files: ' + self.logical_path) - if self.voltype_ == 'volumes/': + if self.voltype_ == 'volumes/' or self.voltype_ == 'bundles/': suffix = '' else: suffix = '_' + self.voltype_[:-1] @@ -5473,7 +5452,8 @@ def basename_is_label(self, basename): basename -- basename of a file """ - return (len(basename) > 4) and (basename[-4:].lower() == '.lbl') + cls = type(self) + return (len(basename) > 4) and (basename[-4:].lower() == cls.LBL_EXT) def basename_is_viewable(self, basename=None): """Return True if this basename is viewable. 
Override if viewable files can @@ -5966,8 +5946,8 @@ def associated_abspaths(self, category, must_exist=True): for pattern in patterns: # Handle an index row by separating the filepath from the suffix - if '.tab/' in pattern: - parts = pattern.rpartition('.tab') + if f'{cls.IDX_EXT}/' in pattern: + parts = pattern.rpartition(cls.IDX_EXT) pattern = parts[0] + parts[1] suffix = parts[2][1:] else: