diff --git a/holdings_maintenance/pds3/pdschecksums.py b/holdings_maintenance/pds3/pdschecksums.py
index cd99d31..2a16513 100755
--- a/holdings_maintenance/pds3/pdschecksums.py
+++ b/holdings_maintenance/pds3/pdschecksums.py
@@ -614,7 +614,7 @@ def main():
'their MD5 checksums to the checksum file. ' +
'Checksums of pre-existing files are not checked.')
- parser.add_argument('volume', nargs='+', type=str,
+ parser.add_argument('--volume', nargs='+', type=str,
help='The path to the root directory of a volume or ' +
'volume set. For a volume set, all the volume ' +
'directories inside it are handled in sequence. ' +
@@ -768,21 +768,21 @@ def main():
# Save logs in up to two places
if pdsf.volname:
- logfiles = set([pdsf.log_path_for_volume('_md5',
+ logfiles = set([pdsf.log_path_for_bundle('_md5',
task=args.task,
dir='pdschecksums'),
- pdsf.log_path_for_volume('_md5',
+ pdsf.log_path_for_bundle('_md5',
task=args.task,
dir='pdschecksums',
place='parallel')])
else:
- logfiles = set([pdsf.log_path_for_volset('_md5',
- task=args.task,
- dir='pdschecksums'),
- pdsf.log_path_for_volset('_md5',
- task=args.task,
- dir='pdschecksums',
- place='parallel')])
+ logfiles = set([pdsf.log_path_for_bundleset('_md5',
+ task=args.task,
+ dir='pdschecksums'),
+ pdsf.log_path_for_bundleset('_md5',
+ task=args.task,
+ dir='pdschecksums',
+ place='parallel')])
# Create all the handlers for this level in the logger
local_handlers = []
diff --git a/holdings_maintenance/pds3/pdsinfoshelf.py b/holdings_maintenance/pds3/pdsinfoshelf.py
index 7396946..cbf4a12 100755
--- a/holdings_maintenance/pds3/pdsinfoshelf.py
+++ b/holdings_maintenance/pds3/pdsinfoshelf.py
@@ -662,7 +662,7 @@ def main():
'than the shelf file, update the shelf file\'s ' +
'modification date.')
- parser.add_argument('volume', nargs='+', type=str,
+ parser.add_argument('--volume', nargs='+', type=str,
help='The path to the root of the volume or volume ' +
'set. For a volume set, all the volume ' +
'directories inside it are handled in sequence.')
diff --git a/holdings_maintenance/pds4/pds4checksums.py b/holdings_maintenance/pds4/pds4checksums.py
new file mode 100755
index 0000000..9f68127
--- /dev/null
+++ b/holdings_maintenance/pds4/pds4checksums.py
@@ -0,0 +1,859 @@
+#!/usr/bin/env python3
+################################################################################
+# pds4checksums.py library and main program
+#
+# Syntax:
+#   pds4checksums.py --task path [path ...]
+#
+# Enter the --help option to see more information.
+################################################################################
+
+import argparse
+import datetime
+import glob
+import hashlib
+import os
+import shutil
+import sys
+
+import pdslogger
+import pdsfile
+
+# Holds log file directories temporarily, used by move_old_checksums()
+LOGDIRS = []
+
+LOGNAME = 'pds.validation.checksums'
+LOGROOT_ENV = 'PDS_LOG_ROOT'
+
+################################################################################
+
+# From http://stackoverflow.com/questions/3431825/-
+# generating-an-md5-checksum-of-a-file
+
+def hashfile(fname, blocksize=65536):
+    """Return the MD5 hex digest of a file, read in blocks of the given size."""
+    hasher = hashlib.md5()
+    with open(fname, 'rb') as f:        # close the file even on error
+        buf = f.read(blocksize)
+        while len(buf) > 0:
+            hasher.update(buf)
+            buf = f.read(blocksize)
+    return hasher.hexdigest()
+
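+# Illustrative usage of hashfile(); the path below is hypothetical, but the
+# digest is the well-known MD5 of zero bytes, so any empty file must hash to
+# exactly this value:
+#
+#     md5 = hashfile('/tmp/empty.dat')        # any zero-byte file
+#     assert md5 == 'd41d8cd98f00b204e9800998ecf8427e'
+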
+################################################################################
+
+def generate_checksums(pdsdir, selection=None, oldpairs=[], regardless=True,
+ limits={'normal':-1}, logger=None):
+ """Generate a list of tuples (abspath, checksum) recursively from the given
+ directory tree.
+
+ If a selection is specified, it is interpreted as the basename of a file,
+ and only that file is processed.
+
+    The optional oldpairs is a list of (abspath, checksum) pairs. For any file
+    that already has a checksum in this list, the checksum is copied from the
+    list rather than re-calculated. When a selection is specified, its newly
+    calculated checksum is merged into this list.
+
+    If regardless is True, then the checksum of a selection is calculated
+    regardless of whether it is already in oldpairs.
+
+ Also return the latest modification date among all the files checked.
+ """
+
+ dirpath = pdsdir.abspath
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Generating MD5 checksums', dirpath, limits=limits)
+
+ latest_mtime = 0.
+ try:
+ md5_dict = {}
+ for (abspath, hex) in oldpairs:
+ md5_dict[abspath] = hex
+
+ newtuples = []
+ for (path, dirs, files) in os.walk(dirpath):
+ for file in files:
+ abspath = os.path.join(path, file)
+ latest_mtime = max(latest_mtime, os.path.getmtime(abspath))
+
+ if selection and file != selection:
+ continue
+
+ if file == '.DS_Store': # skip .DS_Store files
+ logger.ds_store('.DS_Store skipped', abspath)
+ continue
+
+ if file.startswith('._'): # skip dot-underscore files
+ logger.dot_underscore('._* file skipped', abspath)
+ continue
+
+ if '/.' in abspath: # flag invisible files
+ logger.invisible('Invisible file', abspath)
+
+ if regardless and selection:
+ md5 = hashfile(abspath)
+ newtuples.append((abspath, md5, file))
+ logger.normal('Selected MD5=%s' % md5, abspath)
+
+ elif abspath in md5_dict:
+ newtuples.append((abspath, md5_dict[abspath], file))
+ logger.debug('MD5 copied', abspath)
+
+ else:
+ md5 = hashfile(abspath)
+ newtuples.append((abspath, md5, file))
+ logger.normal('MD5=%s' % md5, abspath)
+
+ if selection:
+ if len(newtuples) == 0:
+ logger.error('File selection not found', selection)
+ return ({}, latest_mtime)
+
+ if len(newtuples) > 1:
+ logger.error('Multiple copies of file selection found',
+ selection)
+ return ({}, latest_mtime)
+
+ # Add new values to dictionary
+ for (abspath, md5, _) in newtuples:
+ md5_dict[abspath] = md5
+
+ # Restore original order, old keys then new
+ old_keys = [p[0] for p in oldpairs]
+
+ newpairs = []
+ for key in old_keys:
+ newpairs.append((key, md5_dict[key]))
+ del md5_dict[key]
+
+        for (key, new_md5, new_file) in newtuples:
+            if key in md5_dict:     # if not already copied to list of pairs
+                newpairs.append((key, md5_dict[key]))
+                del md5_dict[key]   # so a repeated key is not appended twice
+
+ dt = datetime.datetime.fromtimestamp(latest_mtime)
+        logger.info('Latest holdings file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ return (newpairs, latest_mtime)
+
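+# Sketch of the value returned by generate_checksums(), with hypothetical
+# paths and digests. The returned list preserves the order of oldpairs, with
+# newly hashed files appended:
+#
+#     (pairs, latest_mtime) = generate_checksums(pdsdir)
+#     # pairs        -> [('/.../bundles/bundle_xxx/readme.txt',
+#     #                   'd41d8cd98f00b204e9800998ecf8427e'), ...]
+#     # latest_mtime -> POSIX timestamp of the newest file visited
+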
+################################################################################
+
+def read_checksums(check_path, selection=None, limits={}, logger=None):
+
+ """Return a list of tuples (abspath, checksum) from a checksum file.
+
+ If a selection is specified, then only the checksum with this file name
+ is returned."""
+
+ check_path = os.path.abspath(check_path)
+ pdscheck = pdsfile.Pds4File.from_abspath(check_path)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdscheck.root_)
+ logger.open('Reading MD5 checksums', check_path, limits=limits)
+
+ try:
+ logger.info('MD5 checksum file', check_path)
+
+ if not os.path.exists(check_path):
+ logger.error('MD5 checksum file not found', check_path)
+ return []
+
+ prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1]
+
+ # Read the pairs
+ abspairs = []
+ with open(check_path, 'r') as f:
+ for rec in f:
+ hexval = rec[:32]
+ filepath = rec[34:].rstrip()
+
+ if selection and os.path.basename(filepath) != selection:
+ continue
+
+ basename = os.path.basename(filepath)
+ if basename == '.DS_Store':
+ logger.error('.DS_Store found in checksum file', filepath)
+ continue
+
+ if basename.startswith('._'):
+ logger.error('._* file found in checksum file', filepath)
+ continue
+
+ if basename[0] == '.':
+ logger.invisible('Checksum for invisible file', filepath)
+
+ abspairs.append((prefix_ + filepath, hexval))
+ logger.debug('Read', filepath)
+
+ if selection and len(abspairs) == 0:
+ logger.error('File selection not found', selection)
+ return []
+
+ except Exception as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ return abspairs
+
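+# Each record in the checksum file uses the conventional md5sum layout: a
+# 32-character hex digest, two spaces, then the path relative to the prefix
+# (hypothetical example):
+#
+#     d41d8cd98f00b204e9800998ecf8427e  bundle_xxx/readme.txt
+#
+# which is why the digest is sliced as rec[:32] and the path as rec[34:].
+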
+################################################################################
+
+def checksum_dict(dirpath, logger=None):
+
+ dirpath = os.path.abspath(dirpath)
+ pdsdir = pdsfile.Pds4File.from_abspath(dirpath)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.info('Loading checksums for', dirpath, force=True)
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+ abspairs = read_checksums(check_path, logger=logger)
+
+ pair_dict = {}
+ for (abspath, checksum) in abspairs:
+ pair_dict[abspath] = checksum
+
+ logger.info('Checksum load completed', dirpath, force=True)
+ return pair_dict
+
+################################################################################
+
+def write_checksums(check_path, abspairs,
+ limits={'dot_':-1, 'ds_store':-1, 'invisible':100},
+ logger=None):
+ """Write a checksum table containing the given pairs (abspath, checksum)."""
+
+ check_path = os.path.abspath(check_path)
+ pdscheck = pdsfile.Pds4File.from_abspath(check_path)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdscheck.root_)
+ logger.open('Writing MD5 checksums', check_path, limits=limits)
+
+ try:
+ # Create parent directory if necessary
+ parent = os.path.split(check_path)[0]
+ if not os.path.exists(parent):
+ logger.normal('Creating directory', parent)
+ os.makedirs(parent)
+
+ prefix_ = pdscheck.dirpath_and_prefix_for_checksum()[1]
+ lskip = len(prefix_)
+
+ # Write file
+ f = open(check_path, 'w')
+ for pair in abspairs:
+ (abspath, hex) = pair
+
+ if abspath.endswith('/.DS_Store'): # skip .DS_Store files
+ logger.ds_store('.DS_Store skipped', abspath)
+ continue
+
+ if '/._' in abspath: # skip dot-underscore files
+ logger.dot_underscore('._* file skipped', abspath)
+ continue
+
+ if '/.' in abspath: # flag invisible files
+ logger.invisible('Invisible file', abspath)
+
+            # Two-space separator, matching the rec[34:] slice in
+            # read_checksums()
+            f.write('%s  %s\n' % (hex, abspath[lskip:]))
+ logger.debug('Written', abspath)
+
+ f.close()
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+################################################################################
+
+def validate_pairs(pairs1, pairs2, selection=None, limits={}, logger=None):
+ """Validate the first checksum list against the second.
+
+ If a selection is specified, only a file with that basename is checked."""
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.open('Validating checksums', limits=limits)
+
+ success = True
+ try:
+ md5_dict = {}
+ for (abspath, hex) in pairs2:
+ md5_dict[abspath] = hex
+
+ for (abspath, hex) in pairs1:
+ if selection and selection != os.path.basename(abspath):
+ continue
+
+ if abspath not in md5_dict:
+ logger.error('Missing checksum', abspath)
+ success = False
+
+ elif hex != md5_dict[abspath]:
+ del md5_dict[abspath]
+ logger.error('Checksum mismatch', abspath)
+ success = False
+
+ else:
+ del md5_dict[abspath]
+ logger.normal('Validated', abspath)
+
+ if not selection:
+ abspaths = list(md5_dict.keys())
+ abspaths.sort()
+ for abspath in abspaths:
+ logger.error('Extra file', abspath)
+ success = False
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+    finally:
+        logger.close()
+
+    return success  # outside "finally", so a re-raised exception propagates
+
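+# Behavior sketch with hypothetical inputs:
+#
+#     pairs1 = [('/h/x.dat', 'aaaa...'), ('/h/y.dat', 'bbbb...')]
+#     pairs2 = [('/h/x.dat', 'aaaa...'), ('/h/z.dat', 'cccc...')]
+#
+# validate_pairs(pairs1, pairs2) logs 'Missing checksum' for /h/y.dat and
+# 'Extra file' for /h/z.dat, then returns False.
+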
+################################################################################
+
+def move_old_checksums(check_path, logger=None):
+ """Appends a version number to an existing checksum file and moves it to
+ the associated log directory."""
+
+ if not os.path.exists(check_path): return
+
+ check_basename = os.path.basename(check_path)
+ (check_prefix, check_ext) = os.path.splitext(check_basename)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ from_logged = False
+ for log_dir in LOGDIRS:
+ dest_template = log_dir + '/' + check_prefix + '_v???' + check_ext
+ version_paths = glob.glob(dest_template)
+
+ max_version = 0
+ lskip = len(check_ext)
+ for version_path in version_paths:
+ version = int(version_path[-lskip-3:-lskip])
+ max_version = max(max_version, version)
+
+ new_version = max_version + 1
+ dest = dest_template.replace('???', '%03d' % new_version)
+ shutil.copy(check_path, dest)
+
+ if not from_logged:
+ logger.info('Checksum file moved from: ' + check_path)
+ from_logged = True
+
+ logger.info('Checksum file moved to', dest)
+
+################################################################################
+# Simplified functions to perform tasks
+################################################################################
+
+def initialize(pdsdir, selection=None, logger=None):
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+
+ # Make sure checksum file does not exist
+ if os.path.exists(check_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Checksum file already exists', check_path)
+ return False
+
+ # Check selection
+ if selection:
+ raise ValueError('File selection is disallowed for task ' +
+ '"initialize": ' + selection)
+
+ # Generate checksums
+ (pairs, _) = generate_checksums(pdsdir, logger=logger)
+ if not pairs:
+ return False
+
+ # Write new checksum file
+ write_checksums(check_path, pairs, logger=logger)
+ return True
+
+def reinitialize(pdsdir, selection=None, logger=None):
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+
+ # Warn if checksum file does not exist
+ if not os.path.exists(check_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ if selection:
+ logger.error('Checksum file does not exist', check_path)
+ return False
+ else:
+ logger.warn('Checksum file does not exist; initializing', check_path)
+ return initialize(pdsdir, selection=selection, logger=logger)
+
+ # Re-initialize just the selection; preserve others
+ if selection:
+ oldpairs = read_checksums(check_path, logger=logger)
+ if not oldpairs:
+ return False
+ else:
+ oldpairs = []
+
+ # Generate new checksums
+ (pairs, _) = generate_checksums(pdsdir, selection, oldpairs,
+ regardless=True, logger=logger)
+ if not pairs:
+ return False
+
+ # Write new checksum file
+ move_old_checksums(check_path, logger=logger)
+ write_checksums(check_path, pairs, logger=logger)
+ return True
+
+def validate(pdsdir, selection=None, logger=None):
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+
+ # Make sure checksum file exists
+ if not os.path.exists(check_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Checksum file does not exist', check_path)
+ return False
+
+ # Read checksum file
+ md5pairs = read_checksums(check_path, selection, logger=logger)
+ if not md5pairs:
+ return False
+
+ # Generate checksums
+ (dirpairs, _) = generate_checksums(pdsdir, selection, logger=logger)
+ if not dirpairs:
+ return False
+
+ # Validate
+ return validate_pairs(dirpairs, md5pairs, selection, logger=logger)
+
+def repair(pdsdir, selection=None, logger=None):
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+
+ # Make sure checksum file exists
+ if not os.path.exists(check_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ if selection:
+ logger.error('Checksum file does not exist', check_path)
+ return False
+ else:
+ logger.warn('Checksum file does not exist; initializing', check_path)
+ return initialize(pdsdir, selection=selection, logger=logger)
+
+ # Read checksums file
+ md5pairs = read_checksums(check_path, logger=logger)
+ if not md5pairs:
+ return False
+
+ # Generate new checksums
+ if selection:
+ (dirpairs,
+ latest_mtime) = generate_checksums(pdsdir, selection, md5pairs,
+ regardless=True, logger=logger)
+ else:
+ (dirpairs,
+ latest_mtime) = generate_checksums(pdsdir, logger=logger)
+
+ if not dirpairs:
+ return False
+
+ # Compare checksums
+ md5pairs.sort()
+ dirpairs.sort()
+ canceled = (dirpairs == md5pairs)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ check_mtime = os.path.getmtime(check_path)
+ if latest_mtime > check_mtime:
+ logger.info('!!! Checksum file content is up to date',
+ check_path, force=True)
+
+ dt = datetime.datetime.fromtimestamp(latest_mtime)
+ logger.info('!!! Latest holdings file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ check_mtime = os.path.getmtime(check_path)
+ dt = datetime.datetime.fromtimestamp(check_mtime)
+ logger.info('!!! Checksum file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ delta = latest_mtime - check_mtime
+ if delta >= 86400/10:
+ logger.info('!!! Checksum file is out of date %.1f days' %
+ (delta / 86400.), force=True)
+ else:
+ logger.info('!!! Checksum file is out of date %.1f minutes' %
+ (delta / 60.), force=True)
+
+ dt = datetime.datetime.now()
+ os.utime(check_path)
+ logger.info('!!! Time tag on checksum file set to',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ else:
+ logger.info('!!! Checksum file is up to date; repair canceled',
+ check_path, force=True)
+ return True
+
+ # Write checksum file
+ move_old_checksums(check_path, logger=logger)
+ write_checksums(check_path, dirpairs, logger=logger)
+ return True
+
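+# Threshold sketch for the staleness report above: 86400/10 is 8640 seconds,
+# i.e. 2.4 hours. A checksum file 3 hours older than the newest holdings file
+# is reported as '0.1 days'; one 30 minutes older as '30.0 minutes'.
+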
+def update(pdsdir, selection=None, logger=None):
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+
+ # Make sure file exists
+ if not os.path.exists(check_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ if selection:
+ logger.error('Checksum file does not exist', check_path)
+ return False
+ else:
+ logger.warn('Checksum file does not exist; initializing', check_path)
+ return initialize(pdsdir, selection=selection, logger=logger)
+
+ # Read checksums file
+ md5pairs = read_checksums(check_path, logger=logger)
+ if not md5pairs:
+ return False
+
+ # Generate new checksums if necessary
+ (dirpairs,
+ latest_mtime) = generate_checksums(pdsdir, selection, md5pairs,
+ regardless=False, logger=logger)
+ if not dirpairs:
+ return False
+
+ # Compare checksums
+ md5pairs.sort()
+ dirpairs.sort()
+ canceled = (dirpairs == md5pairs)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.info('!!! Checksum file content is complete; update canceled',
+ check_path)
+ return True
+
+ # Write checksum file
+ move_old_checksums(check_path, logger=logger)
+ write_checksums(check_path, dirpairs, logger=logger)
+ return True
+
+################################################################################
+# Executable program
+################################################################################
+
+def main():
+
+ # Set up parser
+ parser = argparse.ArgumentParser(
+        description='pds4checksums: Create, maintain and validate MD5 ' +
+                    'checksum files for PDS4 bundles and bundle sets.')
+
+ parser.add_argument('--initialize', '--init', const='initialize',
+ default='', action='store_const', dest='task',
+                        help='Create an MD5 checksum file for a bundle or ' +
+                             'bundle set. Abort if the checksum file ' +
+                             'already exists.')
+
+ parser.add_argument('--reinitialize', '--reinit', const='reinitialize',
+ default='', action='store_const', dest='task',
+                        help='Create an MD5 checksum file for a bundle or ' +
+                             'bundle set. Replace the checksum file if it ' +
+                             'already exists. If a single file is specified, ' +
+                             'such as one archive file in a bundle set, only ' +
+                             'that single checksum is re-initialized.')
+
+ parser.add_argument('--validate', const='validate',
+ default='', action='store_const', dest='task',
+                        help='Validate every file in a bundle directory tree ' +
+                             'against its MD5 checksum. If a single file ' +
+                             'is specified, such as one archive file in a ' +
+                             'bundle set, only that single checksum is ' +
+                             'validated.')
+
+ parser.add_argument('--repair', const='repair',
+ default='', action='store_const', dest='task',
+                        help='Validate every file in a bundle directory tree ' +
+                             'against its MD5 checksum. If any disagreement ' +
+                             'is found, the checksum file is replaced; ' +
+                             'otherwise it is unchanged. If a single file is ' +
+                             'specified, such as one archive file of a ' +
+                             'bundle set, then only that single checksum is ' +
+                             'repaired. If any of the files checked are ' +
+                             'newer than the checksum file, update the ' +
+                             'checksum file\'s modification date.')
+
+ parser.add_argument('--update', const='update',
+ default='', action='store_const', dest='task',
+ help='Search a directory for any new files and add ' +
+ 'their MD5 checksums to the checksum file. ' +
+ 'Checksums of pre-existing files are not checked.')
+
+    parser.add_argument('bundle', nargs='+', type=str,
+                        help='The path to the root directory of a bundle or ' +
+                             'bundle set. For a bundle set, all the bundle ' +
+                             'directories inside it are handled in sequence. ' +
+                             'Note that, for archive directories, checksums ' +
+                             'are grouped into one file for the entire ' +
+                             'bundle set.')
+
+ parser.add_argument('--log', '-l', type=str, default='',
+ help='Optional root directory for a duplicate of the ' +
+ 'log files. If not specified, the value of ' +
+ 'environment variable "%s" ' % LOGROOT_ENV +
+ 'is used. In addition, individual logs are ' +
+ 'written into the "logs" directory parallel to ' +
+ '"holdings". Logs are created inside the ' +
+ '"pdschecksums" subdirectory of each log root ' +
+ 'directory.')
+
+ parser.add_argument('--quiet', '-q', action='store_true',
+ help='Do not also log to the terminal.')
+
+ parser.add_argument('--archives', '-a', default=False, action='store_true',
+                        help='Instead of referring to a bundle, refer to ' +
+                             'the archive file for that bundle.')
+
+ parser.add_argument('--infoshelf', '-i', dest='infoshelf',
+ default=False, action='store_true',
+ help='After a successful run, also execute the ' +
+ 'equivalent pdsinfoshelf command.')
+
+
+ # Parse and validate the command line
+ args = parser.parse_args()
+
+ if not args.task:
+        print('pds4checksums error: Missing task')
+ sys.exit(1)
+
+ # Define the logging directory
+ if args.log == '':
+ try:
+ args.log = os.environ[LOGROOT_ENV]
+ except KeyError:
+ args.log = None
+
+ # Initialize the logger
+ logger = pdslogger.PdsLogger(LOGNAME)
+ pdsfile.Pds4File.set_log_root(args.log)
+
+ if not args.quiet:
+ logger.add_handler(pdslogger.stdout_handler)
+
+ if args.log:
+ path = os.path.join(args.log, 'pdschecksums')
+ warning_handler = pdslogger.warning_handler(path)
+ logger.add_handler(warning_handler)
+
+ error_handler = pdslogger.error_handler(path)
+ logger.add_handler(error_handler)
+
+ # Prepare the list of paths
+ abspaths = []
+ for path in args.bundle:
+
+ # Make sure path makes sense
+ path = os.path.abspath(path)
+ parts = path.partition('/pds4-holdings/')
+ if not parts[1]:
+ print('Not a holdings subdirectory: ' + path)
+ sys.exit(1)
+
+ if parts[2].startswith('checksums-'):
+ print('No checksums for checksum files: ' + path)
+ sys.exit(1)
+
+ # Convert to an archives path if necessary
+ if args.archives and not parts[2].startswith('archives-'):
+ path = parts[0] + '/pds4-holdings/archives-' + parts[2]
+
+ # Convert to a list of absolute paths that exist (volsets or volumes)
+ try:
+ pdsf = pdsfile.Pds4File.from_abspath(path, must_exist=True)
+ abspaths.append(pdsf.abspath)
+
+ except (ValueError, IOError):
+            # Allow a bundle name to stand in for a .tar.gz archive
+ (dir, basename) = os.path.split(path)
+ pdsdir = pdsfile.Pds4File.from_abspath(dir)
+ if pdsdir.archives_ and '.' not in basename:
+ if pdsdir.voltype_ == 'volumes/':
+ basename += '.tar.gz'
+ else:
+ basename += '_%s.tar.gz' % pdsdir.voltype_[:-1]
+
+ newpaths = glob.glob(os.path.join(dir, basename))
+ if len(newpaths) == 0:
+ raise
+
+ abspaths += newpaths
+ continue
+ else:
+ raise
+
+ # Generate a list of tuples (pdsfile, selection)
+ info = []
+ for path in abspaths:
+ pdsf = pdsfile.Pds4File.from_abspath(path)
+
+        if pdsf.is_bundleset_dir:
+            # Archive directories are checksummed by bundle set
+            if pdsf.archives_:
+                info.append((pdsf, None))
+
+            # Others are checksummed by bundle
+            else:
+                children = [pdsf.child(c) for c in pdsf.childnames]
+                info += [(c, None) for c in children if c.isdir]
+                # "if c.isdir" is False for bundle-set-level readme files
+
+        elif pdsf.is_bundle_dir:
+            # Checksum one bundle
+            info.append((pdsf, None))
+
+ elif pdsf.isdir:
+ print('Invalid directory for checksumming: ' + pdsf.logical_path)
+ sys.exit(1)
+
+ else:
+ pdsdir = pdsf.parent()
+ if pdsf.is_volume_file:
+ # Checksum one archive file
+ info.append((pdsdir, pdsf.basename))
+ elif pdsdir.is_bundle_dir:
+            # Checksum one top-level file in bundle
+ info.append((pdsdir, pdsf.basename))
+ else:
+ print('Invalid file for checksumming: ' + pdsf.logical_path)
+ sys.exit(1)
+
+ # Begin logging and loop through tuples...
+ logger.open(' '.join(sys.argv))
+ try:
+ for (pdsdir, selection) in info:
+ path = pdsdir.abspath
+
+ if selection:
+ pdsf = pdsdir.child(os.path.basename(selection))
+ else:
+ pdsf = pdsdir
+
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+
+ # Save logs in up to two places
+ if pdsf.bundlename:
+ logfiles = set([pdsf.log_path_for_bundle('_md5',
+ task=args.task,
+ dir='pdschecksums'),
+ pdsf.log_path_for_bundle('_md5',
+ task=args.task,
+ dir='pdschecksums',
+ place='parallel')])
+ else:
+ logfiles = set([pdsf.log_path_for_bundleset('_md5',
+ task=args.task,
+ dir='pdschecksums'),
+ pdsf.log_path_for_bundleset('_md5',
+ task=args.task,
+ dir='pdschecksums',
+ place='parallel')])
+
+ # Create all the handlers for this level in the logger
+ local_handlers = []
+            global LOGDIRS  # module-level list read by move_old_checksums()
+            LOGDIRS = []
+ for logfile in logfiles:
+ local_handlers.append(pdslogger.file_handler(logfile))
+ logdir = os.path.split(logfile)[0]
+ LOGDIRS.append(os.path.split(logfile)[0])
+
+ # These handlers are only used if they don't already exist
+ warning_handler = pdslogger.warning_handler(logdir)
+ error_handler = pdslogger.error_handler(logdir)
+ local_handlers += [warning_handler, error_handler]
+
+ # Open the next level of the log
+ if len(info) > 1:
+ logger.blankline()
+
+ if selection:
+ logger.open('Task "' + args.task + '" for selection ' +
+ selection, path, handler=local_handlers)
+ else:
+ logger.open('Task "' + args.task + '" for', path,
+ handler=local_handlers)
+
+ try:
+ for logfile in logfiles:
+ logger.info('Log file', logfile)
+
+ if args.task == 'initialize':
+ proceed = initialize(pdsdir, selection)
+
+ elif args.task == 'reinitialize':
+ if selection: # don't erase everything else!
+ proceed = update(pdsdir, selection)
+ else:
+ proceed = reinitialize(pdsdir, selection)
+
+ elif args.task == 'validate':
+ proceed = validate(pdsdir, selection)
+
+ elif args.task == 'repair':
+ proceed = repair(pdsdir, selection)
+
+ else: # update
+ proceed = update(pdsdir, selection)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ proceed = False
+ raise
+
+ finally:
+ _ = logger.close()
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ proceed = False
+ raise
+
+ finally:
+ (fatal, errors, warnings, tests) = logger.close()
+ if fatal or errors:
+ proceed = False
+
+ # If everything went well, execute pdsinfoshelf too
+ if proceed and args.infoshelf:
+        new_list = [a.replace('pds4checksums', 'pds4infoshelf')
+                    for a in sys.argv]
+ new_list = [a for a in new_list if a not in ('--infoshelf', '-i')]
+ status = os.system(' '.join(new_list))
+ sys.exit(status)
+
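+# Hypothetical invocation sketch: after a successful run,
+#     pds4checksums.py --initialize --infoshelf /pds4-holdings/bundles/xxx
+# re-executes itself as
+#     pds4infoshelf.py --initialize /pds4-holdings/bundles/xxx
+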
+if __name__ == '__main__':
+ main()
diff --git a/holdings_maintenance/pds4/pds4indexshelf.py b/holdings_maintenance/pds4/pds4indexshelf.py
new file mode 100755
index 0000000..177e419
--- /dev/null
+++ b/holdings_maintenance/pds4/pds4indexshelf.py
@@ -0,0 +1,499 @@
+#!/usr/bin/env python3
+################################################################################
+# pds4indexshelf.py library and main program
+#
+# Syntax:
+#   pds4indexshelf.py --task index_path.csv [index_path.csv ...]
+#
+# Enter the --help option to see more information.
+################################################################################
+
+import argparse
+import datetime
+import glob
+import os
+import pickle
+import sys
+
+import pdslogger
+import pdsfile
+import pdstable
+
+LOGNAME = 'pds.validation.indexshelf'
+LOGROOT_ENV = 'PDS_LOG_ROOT'
+
+################################################################################
+
+def generate_indexdict(pdsf, logger=None):
+ """Generate a dictionary keyed by row key for each row in the given table.
+ The value returned is a list containing all the associated row indices.
+ """
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsf.root_)
+ logger.open('Tabulating index rows for', pdsf.abspath)
+
+ try:
+ table = pdstable.PdsTable(pdsf.label_abspath,
+ filename_keylen=pdsf.filename_keylen)
+
+ table.index_rows_by_filename_key() # fills in table.filename_keys
+ childnames = table.filename_keys
+ index_dict = {c:table.row_indices_by_filename_key(c)
+ for c in childnames}
+
+ logger.info('Rows tabulated', str(len(index_dict)), force=True)
+
+ latest_mtime = max(os.path.getmtime(pdsf.abspath),
+ os.path.getmtime(pdsf.label_abspath))
+ dt = datetime.datetime.fromtimestamp(latest_mtime)
+ logger.info('Latest index file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ except (OSError, ValueError) as e:
+ logger.error(str(e))
+ raise e
+
+ finally:
+ _ = logger.close()
+
+ return (index_dict, latest_mtime)
+
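+# Sketch of the value returned above, with hypothetical keys: each filename
+# key maps to the list of row indices at which it occurs in the table, e.g.
+#
+#     ({'u0001': [0], 'u0002': [1, 2]}, latest_mtime)
+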
+################################################################################
+
+def write_indexdict(pdsf, index_dict, logger=None):
+ """Write a new shelf file for the rows of this index."""
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsf.root_)
+ logger.open('Writing index shelf file info for', pdsf.abspath)
+
+ try:
+ pdsfile.Pds4File.close_all_shelves() # prevents using a cached shelf file
+
+ shelf_path = pdsf.indexshelf_abspath
+ logger.info('Index shelf file', shelf_path)
+
+ # Create parent directory if necessary
+ parent = os.path.split(shelf_path)[0]
+ if not os.path.exists(parent):
+ logger.info('Creating parent directory', parent)
+ os.makedirs(parent)
+
+ # Write the pickle file
+ with open(shelf_path, 'wb') as f:
+ pickle.dump(index_dict, f)
+
+ # Write the Python file
+ python_path = shelf_path.rpartition('.')[0] + '.py'
+ logger.info('Writing Python file', python_path)
+
+ # Determine the maximum length of the keys
+ len_path = 0
+ for key in index_dict:
+ len_path = max(len_path, len(key))
+
+ name = os.path.basename(shelf_path).rpartition('.')[0]
+ with open(python_path, 'w', encoding='latin-1') as f:
+ f.write(name + ' = {\n')
+ for key in index_dict:
+ f.write(' "%s: ' % (key + '"' + (len_path-len(key)) * ' '))
+
+ rows = index_dict[key]
+ if len(rows) == 1:
+ f.write('%d,\n' % rows[0])
+ else:
+ f.write('(')
+ for row in rows[:-1]:
+ f.write('%d, ' % row)
+ f.write('%d),\n' % rows[-1])
+
+ f.write('}\n\n')
+
+ logger.info('Two files written')
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
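+# The companion .py file written above spells out the same mapping as a
+# Python literal; a hypothetical two-row index would read:
+#
+#     bundle_xxx_index = {
+#         "u0001": 0,
+#         "u0002": (1, 2),
+#     }
+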
+################################################################################
+
+def load_indexdict(pdsf, logger=None):
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsf.root_)
+ logger.open('Reading index shelf file for', pdsf.abspath)
+
+ try:
+ shelf_path = pdsf.indexshelf_abspath
+ logger.info('Index shelf file', shelf_path)
+
+ if not os.path.exists(shelf_path):
+ logger.error('Index shelf file not found', shelf_path)
+ return {}
+
+ with open(shelf_path, 'rb') as f:
+ index_dict = pickle.load(f)
+
+ logger.info('Shelf records loaded', str(len(index_dict)))
+
+ except pickle.PickleError as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ logger.close()
+
+ return index_dict
+
+################################################################################
+
+def validate_infodict(pdsf, tabdict, shelfdict, logger=None):
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsf.root_)
+ logger.info('Validating index file for', pdsf.abspath)
+
+ if tabdict == shelfdict:
+ logger.info('Validation complete')
+ else:
+ logger.error('Validation failed for', pdsf.abspath)
+
+################################################################################
+# Simplified functions to perform tasks
+################################################################################
+
+def initialize(pdsf, logger=None):
+
+ shelf_path = pdsf.indexshelf_abspath
+
+ # Make sure file does not exist
+ if os.path.exists(pdsf.indexshelf_abspath):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Index shelf file already exists', shelf_path)
+ return
+
+ # Generate info
+ (index_dict, _) = generate_indexdict(pdsf, logger=logger)
+ if index_dict is None:
+ return
+
+ # Save info file
+ write_indexdict(pdsf, index_dict, logger=logger)
+
+def reinitialize(pdsf, logger=None):
+
+ shelf_path = pdsf.indexshelf_abspath
+
+ # Warn if shelf file does not exist
+ if not os.path.exists(shelf_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.warn('Index shelf file does not exist; initializing', shelf_path)
+ initialize(pdsf, logger=logger)
+ return
+
+ # Generate info
+ (index_dict, _) = generate_indexdict(pdsf, logger=logger)
+ if not index_dict:
+ return
+
+ # Save info file
+ write_indexdict(pdsf, index_dict, logger=logger)
+
+def validate(pdsf, logger=None):
+
+ shelf_path = pdsf.indexshelf_abspath
+
+ # Make sure file exists
+ if not os.path.exists(shelf_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Index shelf file does not exist', shelf_path)
+ return
+
+ (table_indexdict, _) = generate_indexdict(pdsf, logger=logger)
+ if table_indexdict is None:
+ return
+
+ shelf_indexdict = load_indexdict(pdsf, logger=logger)
+ if not shelf_indexdict:
+ return
+
+ # Validate
+ validate_infodict(pdsf, table_indexdict, shelf_indexdict,
+ logger=logger)
+
+def repair(pdsf, logger=None, op='repair'):
+
+ shelf_path = pdsf.indexshelf_abspath
+
+ # Make sure file exists
+ if not os.path.exists(shelf_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.warn('Index shelf file does not exist; initializing',
+ shelf_path)
+ initialize(pdsf, logger=logger)
+ return
+
+ (table_indexdict, latest_mtime) = generate_indexdict(pdsf, logger=logger)
+ if not table_indexdict:
+ return
+
+ shelf_indexdict = load_indexdict(pdsf, logger=logger)
+ if not shelf_indexdict:
+ return
+
+ # Compare
+ canceled = (table_indexdict == shelf_indexdict)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ shelf_pypath = shelf_path.replace('.pickle', '.py')
+ shelf_mtime = min(os.path.getmtime(shelf_path),
+ os.path.getmtime(shelf_pypath))
+ if latest_mtime > shelf_mtime:
+ logger.info('!!! Index shelf file content is up to date',
+ shelf_path, force=True)
+
+ dt = datetime.datetime.fromtimestamp(latest_mtime)
+ logger.info('!!! Index file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ dt = datetime.datetime.fromtimestamp(shelf_mtime)
+ logger.info('!!! Index shelf file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ delta = latest_mtime - shelf_mtime
+ if delta >= 86400/10:
+ logger.info('!!! Index shelf file is out of date %.1f days' %
+ (delta / 86400.), force=True)
+ else:
+ logger.info('!!! Index shelf file is out of date %.1f minutes' %
+ (delta / 60.), force=True)
+
+ dt = datetime.datetime.now()
+ os.utime(shelf_path)
+ os.utime(shelf_pypath)
+ logger.info('!!! Time tag on index shelf files set to',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ else:
+ logger.info('!!! Index shelf file is up to date; repair canceled',
+ shelf_path, force=True)
+
+ return
+
+ # Write new info
+ write_indexdict(pdsf, table_indexdict, logger=logger)
+
+def update(pdsf, selection=None, logger=None):
+
+ shelf_path = pdsf.indexshelf_abspath
+ if os.path.exists(shelf_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.info('!!! Index shelf file exists; not updated', pdsf.abspath)
+
+ else:
+ initialize(pdsf, logger)
+
+################################################################################
+################################################################################
+
+def main():
+
+ # Set up parser
+ parser = argparse.ArgumentParser(
+        description='pds4indexshelf: Create, maintain and validate shelf ' +
+                    'files containing row lookup information for index files.')
+
+ parser.add_argument('--initialize', '--init', const='initialize',
+ default='', action='store_const', dest='task',
+ help='Create an indexshelf file for an index or for ' +
+ 'an entire metadata directory. Abort if the file '+
+ 'already exists.')
+
+ parser.add_argument('--reinitialize', '--reinit', const='reinitialize',
+ default='', action='store_const', dest='task',
+                        help='Create an indexshelf file for an index or for ' +
+                             'an entire metadata directory. Replace any ' +
+                             'files that already exist.')
+
+ parser.add_argument('--validate', const='validate',
+ default='', action='store_const', dest='task',
+ help='Validate an indexshelf file or metadata ' +
+ 'directory.')
+
+ parser.add_argument('--repair', const='repair',
+ default='', action='store_const', dest='task',
+ help='Validate an index shelf file; replace only if ' +
+ 'necessary. If the shelf file content is correct '+
+ 'but it is older than either the file or the ' +
+ 'label, update the shelf file\'s modification ' +
+ 'date.')
+
+ parser.add_argument('--update', const='update',
+ default='', action='store_const', dest='task',
+                        help='Search a metadata directory for any new index ' +
+                             'files and create an index shelf file for ' +
+                             'each one. Existing index shelf files are not ' +
+                             'checked.')
+
+ parser.add_argument('--table', nargs='+', type=str,
+ help='Path to an index file or metadata directory.')
+
+ parser.add_argument('--log', '-l', type=str, default='',
+ help='Optional root directory for a duplicate of the ' +
+ 'log files. If not specified, the value of ' +
+ 'environment variable "%s" ' % LOGROOT_ENV +
+ 'is used. In addition, individual logs are ' +
+ 'written into the "logs" directory parallel to ' +
+ '"holdings". Logs are created inside the "index" '+
+ 'subdirectory of each log root directory.')
+
+ parser.add_argument('--quiet', '-q', action='store_true',
+ help='Do not also log to the terminal.')
+
+ # Parse and validate the command line
+ args = parser.parse_args()
+
+ if not args.task:
+        print('pds4indexshelf error: Missing task')
+ sys.exit(1)
+
+ status = 0
+
+ # Define the logging directory
+ if args.log == '':
+ try:
+ args.log = os.environ[LOGROOT_ENV]
+ except KeyError:
+ args.log = None
+
+ # Initialize the logger
+ logger = pdslogger.PdsLogger(LOGNAME)
+ pdsfile.Pds4File.set_log_root(args.log)
+
+ if not args.quiet:
+ logger.add_handler(pdslogger.stdout_handler)
+
+ if args.log:
+ path = os.path.join(args.log, 'pdsindexshelf')
+ warning_handler = pdslogger.warning_handler(path)
+ logger.add_handler(warning_handler)
+
+ error_handler = pdslogger.error_handler(path)
+ logger.add_handler(error_handler)
+
+ # Generate a list of Pds4File objects before logging
+ pdsfiles = []
+
+ for path in args.table:
+
+ if not os.path.exists(path):
+
+ print('No such file or directory: ' + path)
+ sys.exit(1)
+
+ path = os.path.abspath(path)
+ pdsf = pdsfile.Pds4File.from_abspath(path)
+
+ if pdsf.isdir:
+ if not '/metadata/' in path:
+ print('Not a metadata directory: ' + path)
+ sys.exit(1)
+
+ tables = glob.glob(os.path.join(path, '*.csv'))
+ if not tables:
+ tables = glob.glob(os.path.join(path, '*/*.csv'))
+
+ if not tables:
+ print('No .csv files in directory: ' + path)
+ sys.exit(1)
+
+ pdsfiles += pdsfile.Pds4File.pdsfiles_for_abspaths(tables)
+
+ else:
+ if not '/metadata/' in path:
+ print('Not a metadata file: ' + path)
+ sys.exit(1)
+ if not path.endswith('.csv'):
+ print('Not a table file: ' + path)
+ sys.exit(1)
+
+ pdsfiles.append(pdsf)
+
+ # Open logger and loop through tables...
+ logger.open(' '.join(sys.argv))
+ try:
+ for pdsf in pdsfiles:
+
+ # Save logs in up to two places
+ logfiles = [pdsf.log_path_for_index(task=args.task,
+ dir='pdsindexshelf'),
+ pdsf.log_path_for_index(task=args.task,
+ dir='pdsindexshelf',
+ place='parallel')]
+ if logfiles[0] == logfiles[1]:
+ logfiles = logfiles[:-1]
+
+ # Create all the handlers for this level in the logger
+ local_handlers = []
+ for logfile in logfiles:
+ local_handlers.append(pdslogger.file_handler(logfile))
+ logdir = (logfile.rpartition('/pdsindexshelf/')[0] +
+ '/pdsindexshelf')
+
+ # These handlers are only used if they don't already exist
+ warning_handler = pdslogger.warning_handler(logdir)
+ error_handler = pdslogger.error_handler(logdir)
+ local_handlers += [warning_handler, error_handler]
+
+ # Open the next level of the log
+ if len(pdsfiles) > 1:
+ logger.blankline()
+
+ logger.open('Task "' + args.task + '" for', pdsf.abspath,
+ handler=local_handlers)
+
+ try:
+ for logfile in logfiles:
+ logger.info('Log file', logfile)
+
+ if args.task == 'initialize':
+ initialize(pdsf)
+
+ elif args.task == 'reinitialize':
+ reinitialize(pdsf)
+
+ elif args.task == 'validate':
+ validate(pdsf)
+
+ elif args.task == 'repair':
+ repair(pdsf)
+
+ else: # update
+ update(pdsf)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+        status = 1
+ raise
+
+ finally:
+ (fatal, errors, warnings, tests) = logger.close()
+ if fatal or errors: status = 1
+
+ sys.exit(status)
+
+if __name__ == '__main__':
+ main()
diff --git a/holdings_maintenance/pds4/pds4infoshelf.py b/holdings_maintenance/pds4/pds4infoshelf.py
new file mode 100755
index 0000000..7238ba6
--- /dev/null
+++ b/holdings_maintenance/pds4/pds4infoshelf.py
@@ -0,0 +1,896 @@
+#!/usr/bin/env python3
+################################################################################
+# pds4infoshelf.py library and main program
+#
+# Syntax:
+#   pds4infoshelf.py --task path [path ...]
+#
+# Enter the --help option to see more information.
+################################################################################
+
+import argparse
+import datetime
+import glob
+import os
+from pathlib import Path
+import pickle
+import shutil
+import sys
+from PIL import Image
+
+import pdslogger
+import pdsfile
+
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(REPO_ROOT))
+
+from holdings_maintenance.pds4 import pds4checksums
+
+# Holds log file directories temporarily, used by move_old_info()
+LOGDIRS = []
+
+LOGNAME = 'pds.validation.fileinfo'
+LOGROOT_ENV = 'PDS_LOG_ROOT'
+
+PREVIEW_EXTS = set(['.jpg', '.png', '.gif', '.tif', '.tiff',
+ '.jpeg', '.jpeg_small'])
+
+
+################################################################################
+
+def generate_infodict(pdsdir, selection, old_infodict={},
+ limits={'normal':-1}, logger=None):
+ """Generate a dictionary keyed by absolute file path for each file in the
+ directory tree. Value returned is a tuple (nbytes, child_count, modtime,
+ checksum, preview size).
+
+ If a selection is specified, it is interpreted as the basename of a file,
+ and only that file is processed.
+
+ The optional old_infodict overrides information found in the directory.
+ This dictionary is merged with the new information assembled. However, if
+ a selection is specified, information about the selection is always updated.
+
+ Also return the latest modification date among all the files checked.
+ """
+
+ ### Internal function
+
+ def get_info_for_file(abspath):
+
+ nbytes = os.path.getsize(abspath)
+ children = 0
+ mtime = os.path.getmtime(abspath)
+ dt = datetime.datetime.fromtimestamp(mtime)
+ modtime = dt.strftime('%Y-%m-%d %H:%M:%S.%f')
+ try:
+ checksum = checkdict[abspath]
+ except KeyError:
+ logger.error('Missing entry in checksum file', abspath)
+ checksum = ''
+
+ size = (0,0)
+ ext = os.path.splitext(abspath)[1]
+ if ext.lower() in PREVIEW_EXTS:
+ try:
+ im = Image.open(abspath)
+ size = im.size
+ im.close()
+ except Exception:
+ logger.error('Preview size not found', abspath)
+
+ return (nbytes, children, modtime, checksum, size)
+
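+    # Sketch of the tuple returned above, with hypothetical values:
+    #     (1024, 0, '2024-01-02 03:04:05.000000',
+    #      'd41d8cd98f00b204e9800998ecf8427e', (0, 0))
+    # i.e. (nbytes, child_count, modtime, MD5 checksum, preview (w, h)).
+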
+ def get_info(abspath, infodict, old_infodict, checkdict):
+ """Info about the given abspath."""
+
+ if os.path.isdir(abspath):
+ nbytes = 0
+ children = 0
+ modtime = ''
+
+ files = os.listdir(abspath)
+ for file in files:
+ absfile = os.path.join(abspath, file)
+
+ if file == '.DS_Store': # skip .DS_Store files
+ logger.ds_store('.DS_Store skipped', absfile)
+ continue
+
+ if file.startswith('._'): # skip dot-underscore files
+ logger.dot_underscore('._* file skipped', absfile)
+ continue
+
+ if '/.' in abspath: # flag invisible files
+ logger.invisible('Invisible file', absfile)
+
+ info = get_info(absfile, infodict, old_infodict, checkdict)
+ nbytes += info[0]
+ children += 1
+ modtime = max(modtime, info[2])
+
+ info = (nbytes, children, modtime, '', (0,0))
+
+ elif abspath in old_infodict:
+ info = old_infodict[abspath]
+
+ else:
+ info = get_info_for_file(abspath)
+ logger.normal('File info generated', abspath)
+
+ infodict[abspath] = info
+ return info
+
+ ################################
+ # Begin executable code
+ ################################
+
+ dirpath = pdsdir.abspath
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+
+ if selection:
+ logger.open('Generating file info for selection "%s"' % selection,
+ dirpath, limits)
+ else:
+ logger.open('Generating file info', dirpath, limits)
+
+ try:
+ # Load checksum dictionary
+ checkdict = pds4checksums.checksum_dict(dirpath, logger=logger)
+# Removed... because we can't ignore empty directories
+# if not checkdict:
+# return ({}, 0.)
+
+ # Generate info recursively
+ infodict = {}
+ if selection:
+ root = os.path.join(dirpath, selection)
+ else:
+ root = pdsdir.abspath
+
+ info = get_info(root, infodict, old_infodict, checkdict)
+ latest_modtime = info[2]
+
+ # Merge dictionaries
+ merged = old_infodict.copy()
+
+ if selection:
+ merged[root] = infodict[root]
+
+ else:
+ for (key, value) in infodict.items():
+ if key not in merged:
+ info = infodict[key]
+ merged[key] = info
+ latest_modtime = max(latest_modtime, info[2])
+
+ if not merged:
+ logger.info('No files found')
+ latest_modtime = ''
+ else:
+ logger.info('Latest holdings file modification date = '
+ + latest_modtime[:19], force=True)
+
+ # We also have to check the modtime of the checksum file!
+ check_path = pdsdir.checksum_path_and_lskip()[0]
+ timestamp = os.path.getmtime(check_path)
+ check_datetime = datetime.datetime.fromtimestamp(timestamp)
+ check_modtime = check_datetime.strftime('%Y-%m-%d %H:%M:%S.%f')
+ logger.info('Checksum file modification date = ' + check_modtime[:19],
+ check_path, force=True)
+ if check_modtime > latest_modtime:
+ latest_modtime = check_modtime
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ return (merged, latest_modtime)
+
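+# Example of one entry in the returned dictionary (path and values
+# hypothetical):
+#
+#     merged['/pds4-holdings/bundles/bundle_xxx/data/file.dat'] = (
+#         2048, 0, '2024-01-02 03:04:05.000000',
+#         'd41d8cd98f00b204e9800998ecf8427e', (0, 0))
+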
+################################################################################
+
+def load_infodict(pdsdir, logger=None):
+
+ dirpath = pdsdir.abspath
+ dirpath_ = dirpath.rstrip('/') + '/'
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Reading info shelf file for', dirpath_[:-1])
+
+ try:
+ (info_path, lskip) = pdsdir.shelf_path_and_lskip('info')
+ logger.info('Info shelf file', info_path)
+
+ if not os.path.exists(info_path):
+ logger.error('Info shelf file not found', info_path)
+ return {}
+
+ # Read the shelf file and convert to a dictionary
+ with open(info_path, 'rb') as f:
+ shelf = pickle.load(f)
+
+ infodict = {}
+ for (key,info) in shelf.items():
+ # Remove a 'null' checksum indicated by a string of dashes
+ # (Directories do not have checksums.)
+ if info[3] and info[3][0] == '-':
+ info = info[:3] + ('',) + info[4:]
+
+ if key == '':
+ infodict[dirpath_[:-1]] = info
+ else:
+ infodict[dirpath_[:lskip] + key] = info
+
+ return infodict
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+################################################################################
+
+def write_infodict(pdsdir, infodict, limits={}, logger=None):
+ """Write a new info shelf file for a directory tree."""
+
+ # Initialize
+ dirpath = pdsdir.abspath
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Writing info file info for', dirpath, limits=limits)
+
+ try:
+ (info_path, lskip) = pdsdir.shelf_path_and_lskip('info')
+ logger.info('Info shelf file', info_path)
+
+ # Create parent directory if necessary
+ parent = os.path.split(info_path)[0]
+ if not os.path.exists(parent):
+ logger.info('Creating parent directory', parent)
+ os.makedirs(parent)
+
+ # Write the pickle file
+ pickle_dict = {}
+ for (key, values) in infodict.items():
+ short_key = key[lskip:]
+ pickle_dict[short_key] = values
+
+ with open(info_path, 'wb') as f:
+ pickle.dump(pickle_dict, f)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ logger.open('Writing Python dictionary', dirpath, limits=limits)
+ try:
+ # Determine the maximum length of the file path
+ len_path = 0
+ for (abspath, values) in infodict.items():
+ len_path = max(len_path, len(abspath))
+
+ len_path -= lskip
+
+ # Write the python dictionary version
+ python_path = info_path.rpartition('.')[0] + '.py'
+ name = os.path.basename(python_path)
+ parts = name.split('_')
+ name = '_'.join(parts[:2]) + '_info'
+ abspaths = list(infodict.keys())
+ abspaths.sort()
+
+ with open(python_path, 'w', encoding='latin-1') as f:
+ f.write(name + ' = {\n')
+ for abspath in abspaths:
+ path = abspath[lskip:]
+ (nbytes, children, modtime, checksum, size) = infodict[abspath]
+ f.write(' "%s: ' % (path + '"' + (len_path-len(path)) * ' '))
+ f.write('(%11d, %3d, ' % (nbytes, children))
+ f.write('"%s", ' % modtime)
+ f.write('"%-33s, ' % (checksum + '"'))
+ f.write('(%4d,%4d)),\n' % size)
+
+ f.write('}\n\n')
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+################################################################################
+
+def validate_infodict(pdsdir, dirdict, shelfdict, selection,
+ limits={'normal': 0}, logger=None):
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+
+ if selection:
+ logger.open('Validating file info for selection %s' % selection,
+ pdsdir.abspath, limits=limits)
+ else:
+ logger.open('Validating file info for', pdsdir.abspath, limits=limits)
+
+ # Prune the shelf dictionary if necessary
+ if selection:
+ keys = list(shelfdict.keys())
+ full_path = os.path.join(pdsdir.abspath, selection)
+ for key in keys:
+ if key != full_path:
+ del shelfdict[key]
+
+ try:
+ keys = list(dirdict.keys())
+ for key in keys:
+ if key in shelfdict:
+ dirinfo = dirdict[key]
+ shelfinfo = shelfdict[key]
+
+ (bytes1, count1, modtime1, checksum1, size1) = dirinfo
+ (bytes2, count2, modtime2, checksum2, size2) = shelfinfo
+
+ # Truncate modtimes to seconds
+ modtime1 = modtime1.rpartition('.')[0]
+ modtime2 = modtime2.rpartition('.')[0]
+
+ agreement = True
+ if bytes1 != bytes2:
+ logger.error('File size mismatch %d %d' %
+ (bytes1, bytes2), key)
+ agreement = False
+
+ if count1 != count2:
+                    logger.error('Child count mismatch %d %d' %
+                                    (count1, count2), key)
+ agreement = False
+
+                if modtime1 != modtime2:   # both truncated to whole seconds
+ logger.error('Modification time mismatch "%s" "%s"' %
+ (modtime1, modtime2), key)
+ agreement = False
+
+                if checksum1 != checksum2:
+ logger.error('Checksum mismatch', key)
+ agreement = False
+
+ if size1 != size2:
+ logger.error('Display size mismatch', key)
+ agreement = False
+
+ if agreement:
+ logger.normal('File info matches', key)
+
+ del shelfdict[key]
+ del dirdict[key]
+
+ keys = list(dirdict.keys())
+ keys.sort()
+ for key in keys:
+ logger.error('Missing shelf info for', key)
+
+ keys = list(shelfdict.keys())
+ keys.sort()
+ for key in keys:
+ logger.error('Shelf info for missing file', key)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+    finally:
+        counts = logger.close()
+
+    # Returning inside "finally" would swallow a re-raised exception, so the
+    # counts are returned here instead.
+    return counts
+
+################################################################################
+
+def move_old_info(shelf_file, logger=None):
+ """Move a file to the /logs/ directory tree and append a time tag."""
+
+ if not os.path.exists(shelf_file): return
+
+ shelf_basename = os.path.basename(shelf_file)
+ (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ from_logged = False
+ for log_dir in LOGDIRS:
+ dest_template = log_dir + '/' + shelf_prefix + '_v???' + shelf_ext
+ version_paths = glob.glob(dest_template)
+
+ max_version = 0
+ lskip = len(shelf_ext)
+ for version_path in version_paths:
+ version = int(version_path[-lskip-3:-lskip])
+ max_version = max(max_version, version)
+
+ new_version = max_version + 1
+ dest = dest_template.replace('???', '%03d' % new_version)
+ shutil.copy(shelf_file, dest)
+
+ if not from_logged:
+ logger.info('Info shelf file moved from: ' + shelf_file)
+ from_logged = True
+
+ logger.info('Info shelf file moved to', dest)
+
+ python_file = shelf_file.rpartition('.')[0] + '.py'
+ dest = dest.rpartition('.')[0] + '.py'
+ shutil.copy(python_file, dest)
+
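+# As with the checksum files, each call appends the next three-digit version:
+# '..._info.pickle' is copied to '..._info_v001.pickle', '_v002', and so on,
+# together with its .py companion (names hypothetical).
+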
+################################################################################
+# Simplified functions to perform tasks
+################################################################################
+
+def initialize(pdsdir, selection=None, logger=None):
+
+ info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+ # Make sure file does not exist
+ if os.path.exists(info_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Info shelf file already exists', info_path)
+ return
+
+    # Check selection
+    if selection:
+        logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+        logger.error('File selection is disallowed for task "initialize"',
+                     selection)
+        return
+
+ # Generate info
+ (infodict, _) = generate_infodict(pdsdir, selection, logger=logger)
+
+ # Save info file
+ write_infodict(pdsdir, infodict, logger=logger)
+
+def reinitialize(pdsdir, selection=None, logger=None):
+
+ info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+ # Warn if shelf file does not exist
+ if not os.path.exists(info_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ if selection:
+ logger.error('Info shelf file does not exist', info_path)
+ else:
+ logger.warn('Info shelf file does not exist; initializing',
+ info_path)
+ initialize(pdsdir, selection=selection, logger=logger)
+ return
+
+ # Generate info
+ (infodict, _) = generate_infodict(pdsdir, selection, logger=logger)
+ if not infodict:
+ return
+
+ # Move old file if necessary
+ if os.path.exists(info_path):
+ move_old_info(info_path, logger=logger)
+
+ # Save info file
+ write_infodict(pdsdir, infodict, logger=logger)
+
+def validate(pdsdir, selection=None, logger=None):
+
+ info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+ # Make sure file exists
+ if not os.path.exists(info_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Info shelf file does not exist', info_path)
+ return
+
+ # Read info shelf file
+ shelf_infodict = load_infodict(pdsdir, logger=logger)
+
+ # Generate info
+ (dir_infodict, _) = generate_infodict(pdsdir, selection, logger=logger)
+
+ # Validate
+ validate_infodict(pdsdir, dir_infodict, shelf_infodict, selection=selection,
+ logger=logger)
+
+def repair(pdsdir, selection=None, logger=None):
+
+ info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+ # Make sure file exists
+ if not os.path.exists(info_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ if selection:
+ logger.error('Info shelf file does not exist', info_path)
+ else:
+ logger.warn('Info shelf file does not exist; initializing',
+ info_path)
+ initialize(pdsdir, selection=selection, logger=logger)
+ return
+
+ # Read info shelf file
+ shelf_infodict = load_infodict(pdsdir, logger=logger)
+
+ # Generate info
+ (dir_infodict, latest_modtime) = generate_infodict(pdsdir, selection,
+ logger=logger)
+ latest_iso = latest_modtime.replace(' ', 'T')
+ latest_datetime = datetime.datetime.fromisoformat(latest_iso)
+
+ # For a single selection, use the old information
+ if selection:
+ key = list(dir_infodict.keys())[0]
+ value = dir_infodict[key]
+ dir_infodict = shelf_infodict.copy()
+ dir_infodict[key] = value
+
+ # Compare
+ canceled = (dir_infodict == shelf_infodict)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ info_pypath = info_path.replace('.pickle', '.py')
+ timestamp = min(os.path.getmtime(info_path),
+ os.path.getmtime(info_pypath))
+ info_datetime = datetime.datetime.fromtimestamp(timestamp)
+ info_iso = info_datetime.isoformat(timespec='microseconds')
+
+ if latest_iso > info_iso:
+ logger.info('!!! Info shelf file content is up to date',
+ info_path, force=True)
+ logger.info('!!! Latest holdings file modification date',
+ latest_iso, force=True)
+ logger.info('!!! Info shelf file modification date',
+ info_iso, force=True)
+
+ delta = (latest_datetime - info_datetime).total_seconds()
+ if delta >= 86400/10:
+ logger.info('!!! Info shelf file is out of date %.1f days' %
+ (delta / 86400.), force=True)
+ else:
+ logger.info('!!! Info shelf file is out of date %.1f minutes' %
+ (delta / 60.), force=True)
+
+ dt = datetime.datetime.now()
+ os.utime(info_path)
+ os.utime(info_pypath)
+ logger.info('!!! Time tag on info shelf files set to',
+ dt.strftime('%Y-%m-%dT%H:%M:%S'), force=True)
+ else:
+ logger.info('!!! Info shelf file is up to date; repair canceled',
+ info_path, force=True)
+ return
+
+ # Move files and write new info
+ move_old_info(info_path, logger=logger)
+ write_infodict(pdsdir, dir_infodict, logger=logger)
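+
+# Note on the repair logic above: when the regenerated info matches the shelf
+# file, only the time tags are refreshed. For example (hypothetical
+# timestamps), if the newest holdings file is dated 2024-03-02T00:00:00 and
+# the shelf files 2024-03-01T00:00:00, delta is 86400 seconds (>= 8640), so
+# the shelf is reported as out of date by 1.0 days and its time tags are set
+# to the current time.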
+
+def update(pdsdir, selection=None, logger=None):
+
+ info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+ # Make sure info shelf file exists
+ if not os.path.exists(info_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ if selection:
+ logger.error('Info shelf file does not exist', info_path)
+ else:
+ logger.warn('Info shelf file does not exist; initializing',
+ info_path)
+ initialize(pdsdir, selection=selection, logger=logger)
+ return
+
+ # Read info shelf file
+ shelf_infodict = load_infodict(pdsdir, logger=logger)
+
+ # Generate info
+ (dir_infodict, _) = generate_infodict(pdsdir, selection, shelf_infodict,
+ logger=logger)
+
+ # Compare
+ canceled = (dir_infodict == shelf_infodict)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.info('!!! Info shelf file content is complete; update canceled',
+ info_path, force=True)
+ return
+
+ # Write checksum file
+ move_old_info(info_path, logger=logger)
+ write_infodict(pdsdir, dir_infodict, logger=logger)
+
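+
+# Example (a sketch with a hypothetical bundle path): driving the task
+# functions above for a single bundle.
+#
+#   import pdsfile
+#   pdsdir = pdsfile.Pds4File.from_abspath(
+#       '/pds4-holdings/bundles/uranus_occs_earthbased/uranus_occ_u0_kao_91cm')
+#   initialize(pdsdir)      # create the infoshelf file; aborts if present
+#   validate(pdsdir)        # later, confirm the tree still matches the shelf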
+################################################################################
+################################################################################
+
+def main():
+
+ # Set up parser
+ parser = argparse.ArgumentParser(
+ description='pdsinfoshelf: Create, maintain and validate shelf files ' +
+ 'containing basic information about each file.')
+
+ parser.add_argument('--initialize', '--init', const='initialize',
+ default='', action='store_const', dest='task',
+ help='Create an infoshelf file for a bundle. Abort ' +
+ 'if the file already exists.')
+
+ parser.add_argument('--reinitialize', '--reinit', const='reinitialize',
+ default='', action='store_const', dest='task',
+ help='Create an infoshelf file for a bundle. Replace ' +
+ 'the file if it already exists. If a single ' +
+ 'file is specified, such as one archive file in ' +
+ 'a bundle set, then only information about that ' +
+ 'file is re-initialized.')
+
+ parser.add_argument('--validate', const='validate',
+ default='', action='store_const', dest='task',
+ help='Validate every file in a bundle against the ' +
+ 'contents of its infoshelf file. If a single ' +
+ 'file is specified, such as an archive file in ' +
+ 'a bundle set, then only information about that ' +
+ 'file is validated')
+
+ parser.add_argument('--repair', const='repair',
+ default='', action='store_const', dest='task',
+ help='Validate every file in a bundle against the ' +
+ 'contents of its infoshelf file. If any file ' +
+ 'has changed, the infoshelf file is replaced. ' +
+ 'If a single file is specified, such as an ' +
+ 'archive file in a bundle set, then only ' +
+ 'information about that file is repaired. If any '+
+ 'of the files checked are newer than the shelf ' +
+ 'file, update the shelf file\'s modification ' +
+ 'date.')
+
+ parser.add_argument('--update', const='update',
+ default='', action='store_const', dest='task',
+ help='Search a directory for any new files and add ' +
+ 'their information to the infoshelf file. ' +
+ 'Information about pre-existing files is not ' +
+ 'updated. If any of the files checked are newer ' +
+ 'than the shelf file, update the shelf file\'s ' +
+ 'modification date.')
+
+ parser.add_argument('bundle', nargs='+', type=str,
+ help='The path to the root of the bundle or bundle ' +
+ 'set. For a bundle set, all the bundle ' +
+ 'directories inside it are handled in sequence.')
+
+ parser.add_argument('--log', '-l', type=str, default='',
+ help='Optional root directory for a duplicate of the ' +
+ 'log files. If not specified, the value of ' +
+ 'environment variable "%s" ' % LOGROOT_ENV +
+ 'is used. In addition, individual logs are ' +
+ 'written into the "logs" directory parallel to ' +
+                             '"pds4-holdings". Logs are created inside the ' +
+ '"pdsinfoshelf" subdirectory of each log root ' +
+ 'directory.'
+ )
+
+ parser.add_argument('--quiet', '-q', action='store_true',
+ help='Do not also log to the terminal.')
+
+ parser.add_argument('--archives', '-a', default=False, action='store_true',
+                        help='Instead of referring to a bundle, refer to ' +
+                             'the archive file for that bundle.')
+
+
+ # Parse and validate the command line
+ args = parser.parse_args()
+
+ if not args.task:
+ print('pdsinfoshelf error: Missing task')
+ sys.exit(1)
+
+ status = 0
+
+ # Define the logging directory
+ if args.log == '':
+ try:
+ args.log = os.environ[LOGROOT_ENV]
+ except KeyError:
+ args.log = None
+
+ # Initialize the logger
+ logger = pdslogger.PdsLogger(LOGNAME)
+ pdsfile.Pds4File.set_log_root(args.log)
+
+ if not args.quiet:
+ logger.add_handler(pdslogger.stdout_handler)
+
+ if args.log:
+ path = os.path.join(args.log, 'pdsinfoshelf')
+ warning_handler = pdslogger.warning_handler(path)
+ logger.add_handler(warning_handler)
+
+ error_handler = pdslogger.error_handler(path)
+ logger.add_handler(error_handler)
+
+ # Prepare the list of paths
+ abspaths = []
+ for path in args.bundle:
+
+ # Make sure path makes sense
+ path = os.path.abspath(path)
+ parts = path.partition('/pds4-holdings/')
+ if not parts[1]:
+            print('Not a pds4-holdings subdirectory: ' + path)
+ sys.exit(1)
+
+ if parts[2].startswith('checksums-'):
+ print('No infoshelves for checksum files: ' + path)
+ sys.exit(1)
+
+ # Convert to an archives path if necessary
+ if args.archives and not parts[2].startswith('archives-'):
+ path = parts[0] + '/pds4-holdings/archives-' + parts[2]
+
+        # Convert to a list of absolute paths that exist (bundlesets or
+        # bundles)
+ try:
+ pdsf = pdsfile.Pds4File.from_abspath(path, must_exist=True)
+ abspaths.append(pdsf.abspath)
+
+ except (ValueError, IOError):
+ # Allow a bundle name to stand in for a .tar.gz archive
+ (dir, basename) = os.path.split(path)
+ pdsdir = pdsfile.Pds4File.from_abspath(dir)
+ if pdsdir.archives_ and '.' not in basename:
+ if pdsdir.voltype_ == 'bundles/':
+ basename += '.tar.gz'
+ else:
+ basename += '_%s.tar.gz' % pdsdir.voltype_[:-1]
+
+ newpaths = glob.glob(os.path.join(dir, basename))
+ if len(newpaths) == 0:
+ raise
+
+ abspaths += newpaths
+ continue
+ else:
+ raise
+
+ # Generate a list of tuples (pdsfile, selection)
+ info = []
+ for path in abspaths:
+ pdsf = pdsfile.Pds4File.from_abspath(path)
+
+ if pdsf.is_bundleset_dir:
+ # Info about archive directories is stored by bundleset
+ if pdsf.archives_:
+ info.append((pdsf, None))
+
+            # Others are shelved by bundle
+ else:
+ children = [pdsf.child(c) for c in pdsf.childnames]
+ info += [(c, None) for c in children if c.isdir]
+ # "if c.isdir" is False for bundleset level readme files
+
+ elif pdsf.is_bundle_dir:
+ # Shelve one bundle
+ info.append((pdsf, None))
+
+ elif pdsf.isdir:
+ print('Invalid directory for an infoshelf: ' + pdsf.logical_path)
+ sys.exit(1)
+
+ else:
+ pdsdir = pdsf.parent()
+ if pdsf.is_bundle_file:
+ # Shelve one archive file
+ info.append((pdsdir, pdsf.basename))
+ elif pdsdir.is_bundle_dir:
+ # Shelve one top-level file in bundle
+ info.append((pdsdir, pdsf.basename))
+ else:
+ print('Invalid file for an infoshelf: ' + pdsf.logical_path)
+ sys.exit(1)
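+
+    # Sketch of the expansion above (hypothetical paths): a bundleset
+    # directory such as .../bundles/uranus_occs_earthbased yields one
+    # (bundle_dir, None) tuple per bundle inside it, while a single archive
+    # file such as .../archives-bundles/uranus_occs_earthbased/my_bundle.tar.gz
+    # yields (parent_dir, 'my_bundle.tar.gz').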
+
+ # Open logger and loop through tuples...
+ logger.open(' '.join(sys.argv))
+ try:
+ for (pdsdir, selection) in info:
+
+ info_path = pdsdir.shelf_path_and_lskip('info')[0]
+
+ if selection:
+ pdsf = pdsdir.child(os.path.basename(selection))
+ else:
+ pdsf = pdsdir
+
+ # Save logs in up to two places
+ if pdsf.bundlename:
+ logfiles = set([pdsf.log_path_for_bundle('_info',
+ task=args.task,
+ dir='pdsinfoshelf'),
+ pdsf.log_path_for_bundle('_info',
+ task=args.task,
+ dir='pdsinfoshelf',
+ place='parallel')])
+ else:
+ logfiles = set([pdsf.log_path_for_bundleset('_info',
+ task=args.task,
+ dir='pdsinfoshelf'),
+ pdsf.log_path_for_bundleset('_info',
+ task=args.task,
+ dir='pdsinfoshelf',
+ place='parallel')])
+
+ # Create all the handlers for this level in the logger
+ local_handlers = []
+            # used by move_old_info(); declared 'global' so the module-level
+            # list that move_old_info() reads is the one updated here
+            global LOGDIRS
+            LOGDIRS = []
+            for logfile in logfiles:
+                local_handlers.append(pdslogger.file_handler(logfile))
+                logdir = os.path.split(logfile)[0]
+                LOGDIRS.append(logdir)
+
+ # These handlers are only used if they don't already exist
+ warning_handler = pdslogger.warning_handler(logdir)
+ error_handler = pdslogger.error_handler(logdir)
+ local_handlers += [warning_handler, error_handler]
+
+ # Open the next level of the log
+ if len(info) > 1:
+ logger.blankline()
+
+ if selection:
+ logger.open('Task "' + args.task + '" for selection ' +
+ selection, pdsdir.abspath, handler=local_handlers)
+ else:
+ logger.open('Task "' + args.task + '" for', pdsdir.abspath,
+ handler=local_handlers)
+
+ try:
+ for logfile in logfiles:
+ logger.info('Log file', logfile)
+
+ if args.task == 'initialize':
+ initialize(pdsdir, selection)
+
+ elif args.task == 'reinitialize':
+ if selection: # don't erase everything else!
+ update(pdsdir, selection)
+ else:
+ reinitialize(pdsdir, selection)
+
+ elif args.task == 'validate':
+ validate(pdsdir, selection)
+
+ elif args.task == 'repair':
+ repair(pdsdir, selection)
+
+ else: # update
+ update(pdsdir, selection)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ status = 1
+ raise
+
+ finally:
+ (fatal, errors, warnings, tests) = logger.close()
+ if fatal or errors:
+ status = 1
+
+ sys.exit(status)
+
+if __name__ == '__main__':
+ main()
diff --git a/holdings_maintenance/pds4/pds4linkshelf.py b/holdings_maintenance/pds4/pds4linkshelf.py
new file mode 100755
index 0000000..775c99a
--- /dev/null
+++ b/holdings_maintenance/pds4/pds4linkshelf.py
@@ -0,0 +1,1219 @@
+#!/usr/bin/env python3
+################################################################################
+# pds4linkshelf.py library and main program
+#
+# Syntax:
+#   pds4linkshelf.py --task path [path ...]
+#
+# Enter the --help option to see more information.
+################################################################################
+
+import argparse
+import csv
+import datetime
+import glob
+import os
+import pickle
+import re
+import shutil
+import sys
+
+import pdslogger
+import pdsfile
+import translator
+
+LOGNAME = 'pds.validation.links'
+LOGROOT_ENV = 'PDS_LOG_ROOT'
+
+# Holds log file directories temporarily, used by move_old_links()
+LOGDIRS = []
+
+REPAIRS = translator.TranslatorByRegex([])
+
+KNOWN_MISSING_LABELS = translator.TranslatorByRegex([])
+
+# Match pattern for anything that looks like a file name, though it may also
+# match some strings that are not file names
+PATTERN = r'\'?\"?([A-Z0-9][-\w]*\.[A-Z0-9][-\w\.]*)\'?\"?'
+
+# Match pattern for the file name in anything of the form
+# <file_name>file name</file_name> in the PDS4 label
+TARGET_REGEX1 = re.compile(r'^ *<file_name>' + PATTERN + r'</file_name>',
+                           re.I)
+
+# Match pattern for a file name on a line by itself
+TARGET_REGEX2 = re.compile(r'^ *,? *' + PATTERN, re.I)
+
+# Match pattern for one or more file names embedded in a row of a text file.
+# A file name begins with a letter, followed by any number of letters, digits,
+# underscore or dash. Unless the name is "Makefile", it must have one or more
+# extensions, each containing one or more characters. It can also have any
+# number of directory prefixes separated by slashes.
+
+LINK_REGEX = re.compile(r'(?:|.*?[^/@\w\.])/?(?:\.\./)*(([A-Z0-9][-\w]+/)*' +
+ r'(makefile\.?|[A-Z0-9][\w-]*(\.[\w-]+)+))', re.I)
+
+EXTS_WO_LABELS = set(['.XML', '.CAT', '.FMT', '.SFD'])
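+
+# Illustration of the patterns above (hypothetical names; TARGET_REGEX1 as
+# reconstructed above):
+#
+#   >>> bool(TARGET_REGEX1.match('  <file_name>image_001.img</file_name>'))
+#   True
+#   >>> LINK_REGEX.search('see ../calib/flatfield.fmt for details').group(1)
+#   'calib/flatfield.fmt'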
+
+################################################################################
+
+class LinkInfo(object):
+ """Used internally to describe a link within a specified record of a file.
+ """
+
+ def __init__(self, recno, linkname, is_target):
+
+ self.recno = recno # record number
+ self.linktext = linkname # substring within this record that looks
+ # like a link.
+ self.linkname = linkname # link text after possible repair for known
+ # errors.
+ self.is_target = is_target # True if, based on the local context, this
+ # might be a target of a label file
+ self.target = '' # abspath to target of link, if any.
+ # If not blank, this file must exist.
+
+ def remove_path(self):
+ """Remove any leading directory path from this LinkInfo object."""
+
+ if '/' in self.linktext:
+ self.linktext = self.linktext.rpartition('/')[2]
+ self.linkname = self.linktext
+
+ def __str__(self):
+ return ('%d %s %s %s' % (self.recno, self.linktext, str(self.is_target),
+ self.target or '[' + self.linkname + ']'))
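+
+# Sketch of a LinkInfo lifecycle under the definitions above (hypothetical
+# link text):
+#
+#   info = LinkInfo(12, 'calib/flatfield.fmt', is_target=False)
+#   info.remove_path()   # linktext and linkname both become 'flatfield.fmt'
+#   str(info)            # -> '12 flatfield.fmt False [flatfield.fmt]'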
+
+def generate_links(dirpath, old_links={},
+ limits={'info':-1, 'debug':500, 'ds_store':10}, logger=None):
+ """Generate a dictionary keyed by the absolute file path for files in the
+    given directory tree, which must correspond to a bundle.
+
+    Keys ending in .XML, .CAT, .FMT and .SFD return a list of tuples
+ (recno, link, target)
+ for each link found. Here,
+ recno = record number in file;
+ link = the text of the link;
+ target = absolute path to the target of the link.
+
+ Other keys return a single string, which indicates the absolute path to the
+ label file describing this file.
+
+    Unlabeled files not ending in .XML, .CAT, .FMT or .SFD return an empty
+    string.
+
+ Also return the latest modification date among all the files checked.
+ """
+
+ dirpath = os.path.abspath(dirpath)
+ pdsdir = pdsfile.Pds4File.from_abspath(dirpath)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Finding link shelf files', dirpath, limits)
+
+ try:
+
+ linkinfo_dict = old_links.copy() # abspath: list of LinkInfo objects
+ label_dict = {k:v for k,v in old_links.items() if isinstance(v,str)}
+ # abspath: label for this file
+ abspaths = [] # list of all abspaths
+
+ latest_mtime = 0.
+ collection_basename_dict = {}
+ # Walk the directory tree, one subdirectory "root" at a time...
+ for (root, dirs, files) in os.walk(dirpath):
+
+ local_basenames = [] # Tracks the basenames in this directory
+ local_basenames_uc = [] # Same as above, but upper case
+ for basename in files:
+ abspath = os.path.join(root, basename)
+ latest_mtime = max(latest_mtime, os.path.getmtime(abspath))
+
+ if basename == '.DS_Store': # skip .DS_Store files
+ logger.ds_store('.DS_Store file skipped', abspath)
+ continue
+
+ if basename.startswith('._'): # skip dot_underscore files
+ logger.dot_underscore('dot_underscore file skipped',
+ abspath)
+ continue
+
+ if basename.startswith('.'): # skip invisible files
+ logger.invisible('Invisible file skipped', abspath)
+ continue
+
+                # collection_basename_dict: a dictionary keyed by the abspath
+                # of a collection csv file, whose value is the set of
+                # basenames of the entries it lists.
+                # Build collection_basename_dict here; it is used below to
+                # check whether a file is listed in the csv.
+ if (basename.startswith('collection') and
+ basename.endswith('.csv') and
+ not abspath in collection_basename_dict):
+ logger.debug('Construct collection basename dictionary from', abspath)
+ csv_basenames = set()
+ with open(abspath, 'r') as file:
+ csv_lines = csv.reader(file)
+ for line in csv_lines:
+                            # skip empty lines
+ if not line:
+ continue
+ if '::' in line[-1]:
+ lid = line[-1].rpartition('::')[0]
+ else:
+ lid = line[-1]
+ csv_basename = lid.rpartition(':')[-1]
+ csv_basenames.add(csv_basename)
+
+ collection_basename_dict[abspath] = csv_basenames
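+                    # Example (hypothetical csv row): for the entry
+                    #   P,urn:nasa:pds:uranus_occs:data:u0_kao_91cm::1.0
+                    # lid is 'urn:nasa:pds:uranus_occs:data:u0_kao_91cm'
+                    # and csv_basename is 'u0_kao_91cm'.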
+
+ abspaths.append(abspath)
+ local_basenames.append(basename)
+ local_basenames_uc.append(basename.upper())
+
+ local_labels = [f for f in local_basenames if '.xml' in f]
+ local_labels_abspath = [os.path.join(root, f) for f in local_labels]
+
+ # Update linkinfo_dict, searching each relevant file for possible links.
+ # If the linking file is a label and the target file has a matching
+ # name, update the label_dict entry for the target.
+ candidate_labels = {} # {target: list of possible label basenames}
+ for basename in local_basenames:
+
+ abspath = os.path.join(root, basename)
+ if abspath in linkinfo_dict: # for update op, skip existing links
+ continue
+
+ basename_uc = basename.upper()
+
+ # Only check XML, CAT etc.
+ ext = basename_uc[-4:] if len(basename) >= 4 else ''
+ if ext not in EXTS_WO_LABELS:
+ continue
+
+ # Get list of link info for all possible linked filenames
+ logger.info('*** Get link info and review', abspath)
+ linkinfo_list = read_links(abspath, logger=logger)
+
+ # Apply repairs
+ repairs = REPAIRS.all(abspath)
+ for info in linkinfo_list:
+ for repair in repairs:
+ linkname = repair.first(info.linktext)
+ if linkname is None:
+
+ # Attempt repair with leading directory path removed
+ if '/' in info.linktext:
+ info.remove_path()
+ linkname = repair.first(info.linktext)
+
+ if linkname is None:
+ continue # no repair found
+
+ info.linkname = linkname
+ if linkname == '':
+ logger.info('Ignoring link "%s"' %
+ info.linktext, abspath, force=True)
+ else:
+ logger.info('Repairing link "%s"->"%s"' %
+ (info.linktext, linkname),
+ abspath, force=True)
+
+ # Validate non-local targets of repairs
+ if '/' in linkname:
+ target = os.path.join(root, linkname)
+ if os.path.exists(target):
+ info.target = os.path.abspath(target)
+ else:
+ logger.error('Target of repaired link is missing', target)
+
+ break # apply only one repair per found link
+
+ # Validate or remove other targets
+ new_linkinfo_list = []
+ baseroot_uc = basename_uc.partition('.')[0]
+ ltest = len(baseroot_uc)
+ for info in linkinfo_list:
+ if info.target: # Non-local, repaired links have targets
+ new_linkinfo_list.append(info)
+ continue
+
+ # A blank linkname is from a repair; indicates to ignore
+ if info.linkname == '':
+ continue
+
+ # Ignore self-references
+ linkname_uc = info.linkname.upper()
+ if linkname_uc == basename_uc:
+ continue
+
+ # Check for target inside this directory
+ try:
+ match_index = local_basenames_uc.index(linkname_uc)
+ except ValueError:
+ match_index = None
+
+ # If not found, maybe it is a non-local reference (.FMT perhaps)
+ if match_index is None:
+
+ # It's easy to pick up floats as link candidates; ignore
+ try:
+ _ = float(info.linkname)
+ continue # Yup, it's just a float
+ except ValueError:
+ pass
+
+ if info.linkname[-1] in ('e', 'E'):
+ try:
+ _ = float(info.linkname[:-1])
+ continue # Float with exponent
+ except ValueError:
+ pass
+
+ # Also ignore format specifications (e.g., "F10.3")
+ if info.linkname[0] in ('F', 'E', 'G'):
+ try:
+ _ = float(info.linkname[1:])
+ continue # Format
+ except ValueError:
+ pass
+
+ # Search non-locally
+ if '/' in info.linkname:
+ nonlocal_target = locate_link_with_path(abspath,
+ info.linkname)
+ else:
+ nonlocal_target = locate_nonlocal_link(abspath,
+ info.linkname)
+
+ # Report the outcome
+ if nonlocal_target:
+ logger.debug('Located "%s"' % info.linkname,
+ nonlocal_target)
+ info.target = nonlocal_target
+ new_linkinfo_list.append(info)
+ continue
+
+ if linkname_uc.endswith('.FMT'):
+ logger.error('Unable to locate .FMT file "%s"' %
+ info.linkname, abspath)
+ elif linkname_uc.endswith('.CAT'):
+ logger.error('Unable to locate .CAT file "%s"' %
+ info.linkname, abspath)
+ else:
+ logger.debug('Substring "%s" is not a link, ignored' %
+ info.linkname, abspath)
+
+ continue
+
+ # Save the match
+ info.linkname = local_basenames[match_index] # update case
+ info.target = os.path.join(root, info.linkname)
+ new_linkinfo_list.append(info)
+
+ # Could this be the label?
+ if ext != '.XML': # nope
+ continue
+
+ # If names match up to '.XML', then yes
+ if (len(linkname_uc) > ltest and
+ linkname_uc[:ltest] == baseroot_uc and
+ linkname_uc[ltest] == '.'):
+ label_dict[info.target] = abspath
+ logger.info('Label identified (by name) for %s' %
+ info.linkname, abspath)
+ continue
+
+ # Otherwise, then maybe
+ if info.is_target:
+ if info.linkname in candidate_labels:
+ if basename not in candidate_labels[info.linkname]:
+ candidate_labels[info.linkname].append(basename)
+ else:
+ candidate_labels[info.linkname] = [basename]
+
+ logger.debug('Candidate label found for ' +
+ info.linkname, abspath)
+
+ linkinfo_dict[abspath] = new_linkinfo_list
+
+ parent_root = root.rpartition('/')[0]
+ local_collection_csv_prefix = f'{root}/collection'
+ parent_collection_csv_prefix = f'{parent_root}/collection'
+
+ # Identify labels for files
+ for basename in local_basenames:
+
+ basename_uc = basename.upper()
+ ext = basename_uc[-4:] if len(basename) >= 4 else ''
+ if ext in (".XML", ".FMT"): # these can't have labels
+ continue
+
+ abspath = os.path.join(root, basename)
+
+ if abspath in label_dict:
+ logger.info('Label already found for %s' % abspath)
+ continue # label already found
+
+ # linkinfo_dict: a dictionary with the abspath of a label file as the key
+ # and a list of its corresponding files (LinkInfo objects) under file_name
+ # tags as the value.
+ # label_dict: a dictionary with the abspath of a file as the key and the
+ # abspath of its corresponding label as the value.
+ # At the current directory, if a file basename is in the list of a label's
+ # (in same directory) file_name tags in linkinfo_dict, create an entry of
+ # that file basename in label_dict. This will make sure the file is
+ # pointing to its correct corresponding label.
+ is_label_found = False
+ for label_abspath, link_info_list in linkinfo_dict.items():
+
+ # if the label is not at the same directory, skip it.
+ if label_abspath not in local_labels_abspath:
+ continue
+
+ for info in link_info_list:
+ if info.linktext == basename and abspath not in label_dict:
+ label_dict[abspath] = label_abspath
+ logger.info('Label identified (by file_name tag) for %s' %
+ info.linktext, label_abspath)
+ is_label_found = True
+ break
+ if is_label_found:
+ break
+
+ # label found by searching linkinfo_dict
+ if is_label_found:
+ continue
+
+ # Maybe we already know the label is missing
+ test = KNOWN_MISSING_LABELS.first(abspath)
+ if test == 'unneeded':
+                    logger.debug('Label is not needed', abspath)
+ continue
+
+ if test == 'missing':
+ logger.debug('Label is known to be missing', abspath)
+ continue
+
+ # Determine if a label is required
+ label_is_required = (ext not in EXTS_WO_LABELS)
+
+ # Get the list of candidate labels in this directory
+ candidates = candidate_labels.get(basename, [])
+
+ # Determine if the obvious label file exists
+ label_guess_uc = basename_uc.partition('.')[0] + '.XML'
+ if label_guess_uc in local_basenames_uc:
+ k = local_basenames_uc.index(label_guess_uc)
+ obvious_label_basename = local_basenames[k]
+ else:
+ obvious_label_basename = ''
+
+ # Simplest case...
+ if obvious_label_basename in candidates:
+ if not label_is_required:
+ logger.debug('Unnecessary label found', abspath, force=True)
+
+ label_abspath = os.path.join(root, obvious_label_basename)
+ label_dict[abspath] = label_abspath
+ logger.info('Label found for %s' % abspath, label_abspath)
+ continue
+
+ # More cases...
+ if not label_is_required:
+ continue # leave abspath out of label_dict
+
+ # Report a phantom label
+ if obvious_label_basename:
+ logger.error('Label %s does not point to file' %
+ local_basenames[k], abspath)
+
+ if len(candidates) == 1:
+ logger.debug('Label found as ' + candidates[0], abspath,
+ force=True)
+ label_dict[abspath] = os.path.join(root, candidates[0])
+ continue
+
+                # Before reporting an error, check the collection csv files:
+                # files such as errata.txt or checksum files that appear
+                # neither in a label nor in any csv are not part of the
+                # archive, so they do not need labels
+ is_basename_in_csv = False
+ logger.info('Check if %s is in the collection csv' % basename)
+ for col_abspath, csv_basenames in collection_basename_dict.items():
+ if (col_abspath.startswith(parent_collection_csv_prefix) or
+ col_abspath.startswith(local_collection_csv_prefix)):
+ if basename.rpartition('.')[0] in csv_basenames:
+ is_basename_in_csv = True
+ break
+
+ if not is_basename_in_csv:
+ continue
+
+ # or errors...
+ label_dict[abspath] = ""
+ if len(candidates) == 0:
+ logger.error('Label is missing', abspath)
+ else:
+ logger.error('Ambiguous label found as %s' % candidates[0],
+ abspath, force=True)
+ for candidate in candidates[1:]:
+ logger.debug('Alternative label found as %s' % candidate,
+ abspath, force=True)
+
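+        # Summary of the label search above: for each file, a label is
+        # sought, in order, by (1) a matching <file_name> tag in a local
+        # label, (2) the KNOWN_MISSING_LABELS table, (3) a same-named .xml
+        # file that also references the file, (4) a unique candidate label;
+        # files found in no collection csv are assumed to need no label.
+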
+ # Merge the dictionaries
+ # There are cases where a file can have both a list of links and a label.
+ # This occurs when a .TXT or .CAT file has a label, even though it didn't
+ # need one. In the returned dictionary, link lists take priority.
+ link_dict = {}
+
+ for key in abspaths:
+ if key in linkinfo_dict:
+ # If this is a new entry, it's a list of LinkInfo objects
+ # If this was copied from old_links, it's already a list of tuples
+ values = linkinfo_dict[key]
+ if isinstance(values, list):
+ new_list = []
+ for item in values:
+ if isinstance(item, LinkInfo):
+ new_list.append((item.recno, item.linktext, item.target))
+ else:
+ new_list.append(item)
+ link_dict[key] = new_list
+ else:
+ link_dict[key] = values
+ elif key in label_dict:
+ link_dict[key] = label_dict[key]
+ else:
+ link_dict[key] = ''
+
+ dt = datetime.datetime.fromtimestamp(latest_mtime)
+        logger.info('Latest holdings file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ return (link_dict, latest_mtime)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+def read_links(abspath, logger=None):
+ """Return a list of LinkInfo objects for anything linked or labeled by this
+ file.
+ """
+
+ with open(abspath, 'r', encoding='latin-1') as f:
+ recs = f.readlines()
+
+ links = []
+ multiple_targets = False
+ for recno,rec in enumerate(recs):
+
+ while True:
+
+ # Search for the target of a link
+ is_target = True
+ matchobj = TARGET_REGEX1.match(rec)
+
+ if matchobj:
+ subrec = rec[:matchobj.end()]
+ if '(' in subrec or '{' in subrec:
+ multiple_targets = True
+
+ # ... on the same line or the next line
+ elif multiple_targets:
+ matchobj = TARGET_REGEX2.match(rec)
+
+ # No more matches in this record
+ if not matchobj:
+ break
+
+ linktext = matchobj.group(1)
+ links.append(LinkInfo(recno, linktext, is_target))
+
+ rec = rec[matchobj.end():]
+
+ return links
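+
+# Sketch: for a label record such as
+#   <file_name>image_001.img</file_name>
+# read_links() returns one LinkInfo whose recno is that record's index,
+# whose linktext is 'image_001.img', and whose is_target flag is True.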
+
+def locate_nonlocal_link(abspath, filename):
+    """Return the absolute path associated with a link in a PDS file. This is
+    done by searching up the tree and also by looking inside the LABEL,
+    CATALOG, INCLUDE and other standard directories if they exist."""
+
+ filename_uc = filename.upper()
+
+ parts = abspath.split('/')[:-1]
+
+    # parts are [..., 'pds4-holdings', 'bundles', bundleset, bundlename, ...]
+    # Therefore, if 'pds4-holdings' is in parts[:-3], then there's a
+    # bundlename in this path.
+ while 'pds4-holdings' in parts[:-3]:
+ testpath = '/'.join(parts)
+ basenames = os.listdir(testpath)
+ basenames_uc = [b.upper() for b in basenames]
+ try:
+ k = basenames_uc.index(filename_uc)
+ return testpath + '/' + basenames[k]
+ except ValueError:
+ pass
+
+ for dirname in ['LABEL', 'CATALOG', 'INCLUDE', 'INDEX', 'DOCUMENT',
+ 'DATA', 'CALIB', 'EXTRAS', 'SOFTWARE']:
+ try:
+ k = basenames_uc.index(dirname)
+ subnames = os.listdir(testpath + '/' + basenames[k])
+ subupper = [s.upper() for s in subnames]
+ try:
+ kk = subupper.index(filename_uc)
+ return testpath + '/' + basenames[k] + '/' + subnames[kk]
+ except ValueError:
+ pass
+ except ValueError:
+ pass
+
+ parts = parts[:-1]
+
+ return ''
+
+def locate_link_with_path(abspath, filename):
+ """Return the absolute path associated with a link that contains a leading
+ directory path.
+ """
+
+ parts = filename.split('/')
+ link_path = locate_nonlocal_link(abspath, parts[0])
+ if not link_path:
+ return ''
+
+ for part in parts[1:]:
+ basenames = os.listdir(link_path)
+ if part in basenames:
+ link_path += '/' + part
+ else:
+ basenames_uc = [b.upper() for b in basenames]
+ part_uc = part.upper()
+ if part_uc in basenames_uc:
+ k = basenames_uc.index(part_uc)
+ link_path += '/' + basenames[k]
+ else:
+ return ''
+
+ return link_path
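+
+# Sketch (hypothetical link): locate_link_with_path(abspath,
+# 'calib/flatfield.fmt') first resolves 'calib' with locate_nonlocal_link(),
+# then descends one component at a time, matching names case-insensitively
+# at each level and returning '' if any component is missing.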
+
+################################################################################
+
+def load_links(dirpath, limits={}, logger=None):
+ """Load link dictionary from a shelf file, converting interior paths to
+ absolute paths."""
+
+ dirpath = os.path.abspath(dirpath)
+ pdsdir = pdsfile.Pds4File.from_abspath(dirpath)
+
+ dirpath_ = dirpath.rstrip('/') + '/'
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Reading link shelf file for', dirpath, limits)
+
+ try:
+ (link_path, lskip) = pdsdir.shelf_path_and_lskip('link')
+ prefix_ = pdsdir.volume_abspath() + '/'
+
+ logger.info('Link shelf file', link_path)
+
+ if not os.path.exists(link_path):
+ raise IOError('File not found: ' + link_path)
+
+ # Read the shelf file and convert to a dictionary
+ with open(link_path, 'rb') as f:
+ interior_dict = pickle.load(f)
+
+ # Convert interior paths to absolute paths
+ link_dict = {}
+ for (key, values) in interior_dict.items():
+ long_key = dirpath_ + key
+
+ if isinstance(values, list):
+ new_list = []
+ for (recno, basename, interior_path) in values:
+ abspath = dirpath_ + str(interior_path)
+ if '../' in abspath:
+ abspath = os.path.abspath(abspath)
+
+ new_list.append((recno, str(basename), abspath))
+
+ link_dict[long_key] = new_list
+ else:
+ values = str(values)
+ if values == '':
+ link_dict[long_key] = ''
+ else:
+ link_dict[long_key] = dirpath_ + values
+
+ return link_dict
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+################################################################################
+
+def write_linkdict(dirpath, link_dict, limits={}, logger=None):
+ """Write a new link shelf file for a directory tree."""
+
+ # Initialize
+ dirpath = os.path.abspath(dirpath)
+ pdsdir = pdsfile.Pds4File.from_abspath(dirpath)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Writing link shelf file for', dirpath, limits)
+
+ try:
+ (link_path, lskip) = pdsdir.shelf_path_and_lskip('link')
+ logger.info('Link shelf file', link_path)
+
+ # Create a dictionary using interior paths instead of absolute paths
+ interior_dict = {}
+ prefix = (dirpath + '/')[:lskip]
+ for (key, values) in link_dict.items():
+ if isinstance(values, list):
+ new_list = []
+                for (recno, basename, link_abspath) in values:
+                    if link_abspath[:lskip] == prefix:
+                        new_list.append((recno, basename,
+                                         link_abspath[lskip:]))
+                    else:       # link outside this bundle
+                        link = pdsfile.Pds4File.from_abspath(link_abspath)
+                        if (link.category_ == pdsdir.category_ and
+                            link.bundleset == pdsdir.bundleset and
+                            link.suffix == pdsdir.suffix):
+                            link_relpath = ('../' + link.bundlename_ +
+                                            link.interior)
+                        elif link.category_ == pdsdir.category_:
+                            link_relpath = ('../../' + link.bundleset_ +
+                                            link.bundlename_ + link.interior)
+                        else:
+                            link_relpath = ('../../../' + link.category_ +
+                                            link.bundleset_ +
+                                            link.bundlename_ + link.interior)
+                        new_list.append((recno, basename, link_relpath))
+
+ interior_dict[key[lskip:]] = new_list
+ else:
+ interior_dict[key[lskip:]] = values[lskip:]
+
+ # Create parent directory if necessary
+ parent = os.path.split(link_path)[0]
+ if not os.path.exists(parent):
+ logger.normal('Creating directory', parent)
+ os.makedirs(parent)
+
+ # Write the shelf
+ with open(link_path, 'wb') as f:
+ pickle.dump(interior_dict, f)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ logger.open('Writing Python dictionary', dirpath)
+ try:
+ # Determine the maximum length of the file path and basename
+ len_key = 0
+ len_base = 0
+ for (key, value) in interior_dict.items():
+ len_key = max(len_key, len(key))
+ if isinstance(value, list):
+ tuples = value
+ for (recno, basename, interior_path) in tuples:
+ len_base = max(len_base, len(basename))
+
+ len_key = min(len_key, 60)
+
+ # Write the python dictionary version
+ python_path = link_path.rpartition('.')[0] + '.py'
+ name = os.path.basename(python_path)
+ parts = name.split('_')
+ name = '_'.join(parts[:2]) + '_links'
+ keys = list(interior_dict.keys())
+ keys.sort()
+
+ with open(python_path, 'w', encoding='latin-1') as f:
+ f.write(name + ' = {\n')
+ for valtype in (list, str):
+ for key in keys:
+ if not isinstance(interior_dict[key], valtype): continue
+
+ f.write(' "%s"' % key)
+ if len(key) < len_key:
+ f.write((len_key - len(key)) * ' ')
+ f.write(': ')
+ tuple_indent = max(len(key),len_key) + 7
+
+ values = interior_dict[key]
+ if isinstance(values, str):
+ f.write('"%s",\n' % values)
+ elif len(values) == 0:
+ f.write('[],\n')
+ else:
+ f.write('[')
+ for k in range(len(values)):
+ (recno, basename, interior_path) = values[k]
+ f.write('(%4d, ' % recno)
+ f.write('"%s, ' % (basename + '"' +
+ (len_base-len(basename)) * ' '))
+ f.write('"%s")' % interior_path)
+
+ if k < len(values) - 1:
+ f.write(',\n' + tuple_indent * ' ')
+ else:
+ f.write('],\n')
+
+ f.write('}\n\n')
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
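+
+# Sketch of the interior-path conversion above (hypothetical paths, with
+# lskip pointing just past '.../bundles/my_bundleset/my_bundle/'):
+#   target inside the same bundle      -> 'data/image_001.img'
+#   same bundleset, different bundle   -> '../other_bundle/data/image_001.img'
+#   same category, different bundleset ->
+#       '../../other_set/other_bundle/data/image_001.img'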
+
+################################################################################
+
+def validate_links(dirpath, dirdict, shelfdict, limits={}, logger=None):
+
+ dirpath = os.path.abspath(dirpath)
+ pdsdir = pdsfile.Pds4File.from_abspath(dirpath)
+
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.replace_root(pdsdir.root_)
+ logger.open('Validating link shelf file for', dirpath, limits=limits)
+
+ try:
+ keys = list(dirdict.keys())
+ for key in keys:
+ if key in shelfdict:
+ dirinfo = dirdict[key]
+ shelfinfo = shelfdict[key]
+
+ if type(dirinfo) == list:
+ dirinfo.sort()
+
+ if type(shelfinfo) == list:
+ shelfinfo.sort()
+
+ if dirinfo != shelfinfo:
+ logger.error('Link target mismatch', key)
+
+ del shelfdict[key]
+ del dirdict[key]
+
+ keys = list(dirdict.keys())
+ keys.sort()
+ for key in keys:
+ logger.error('Missing link shelf file entry for', key)
+
+ keys = list(shelfdict.keys())
+ keys.sort()
+ for key in keys:
+ logger.error('Link shelf file entry found for missing file', key)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ return logger.close()
+
+################################################################################
+
+def move_old_links(shelf_file, logger=None):
+    """Copy a shelf file into the /logs/ directory tree, appending a version
+    number to its name."""
+
+ if not os.path.exists(shelf_file): return
+
+ shelf_basename = os.path.basename(shelf_file)
+ (shelf_prefix, shelf_ext) = os.path.splitext(shelf_basename)
+
+ if logger is None:
+ logger = pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ from_logged = False
+ for log_dir in LOGDIRS:
+ dest_template = log_dir + '/' + shelf_prefix + '_v???' + shelf_ext
+ version_paths = glob.glob(dest_template)
+
+ max_version = 0
+ lskip = len(shelf_ext)
+ for version_path in version_paths:
+ version = int(version_path[-lskip-3:-lskip])
+ max_version = max(max_version, version)
+
+ new_version = max_version + 1
+ dest = dest_template.replace('???', '%03d' % new_version)
+ shutil.copy(shelf_file, dest)
+
+ if not from_logged:
+ logger.info('Link shelf file moved from: ' + shelf_file)
+ from_logged = True
+
+ logger.info('Link shelf file moved to ' + dest)
+
+ python_src = shelf_file.rpartition('.')[0] + '.py'
+ python_dest = dest.rpartition('.')[0] + '.py'
+ shutil.copy(python_src, python_dest)
+
+ pickle_src = shelf_file.rpartition('.')[0] + '.pickle'
+ pickle_dest = dest.rpartition('.')[0] + '.pickle'
+ shutil.copy(pickle_src, pickle_dest)
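+
+# Example of the versioning scheme above (hypothetical names): if a log
+# directory already holds my_bundle_links_v001.pickle and
+# my_bundle_links_v002.pickle, the next copy is written as
+# my_bundle_links_v003.pickle, and the companion .py and .pickle files are
+# copied alongside it.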
+
+################################################################################
+# Simplified functions to perform tasks
+################################################################################
+
+def initialize(pdsdir, logger=None):
+
+ link_path = pdsdir.shelf_path_and_lskip('link')[0]
+
+ # Make sure file does not exist
+ if os.path.exists(link_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Link shelf file already exists', link_path)
+ return
+
+ # Generate link info
+ (link_dict, _) = generate_links(pdsdir.abspath, logger=logger)
+
+ # Save link files
+ write_linkdict(pdsdir.abspath, link_dict, logger=logger)
+
+def reinitialize(pdsdir, logger=None):
+
+ link_path = pdsdir.shelf_path_and_lskip('link')[0]
+
+ # Warn if shelf file does not exist
+ if not os.path.exists(link_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.warn('Link shelf file does not exist; initializing', link_path)
+ initialize(pdsdir, logger=logger)
+ return
+
+ # Generate link info
+ (link_dict, _) = generate_links(pdsdir.abspath, logger=logger)
+
+ # Move old file if necessary
+ if os.path.exists(link_path):
+ move_old_links(link_path, logger=logger)
+
+ # Save link files
+ write_linkdict(pdsdir.abspath, link_dict, logger=logger)
+
+def validate(pdsdir, logger=None):
+
+ link_path = pdsdir.shelf_path_and_lskip('link')[0]
+
+ # Make sure file exists
+ if not os.path.exists(link_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.error('Link shelf file does not exist', link_path)
+ return
+
+ # Read link shelf file
+ shelf_linkdict = load_links(pdsdir.abspath, logger=logger)
+
+ # Generate link dict
+ (dir_linkdict, _) = generate_links(pdsdir.abspath, logger=logger)
+
+ # Validate
+ validate_links(pdsdir.abspath, dir_linkdict, shelf_linkdict, logger=logger)
+
+def repair(pdsdir, logger=None):
+
+ link_path = pdsdir.shelf_path_and_lskip('link')[0]
+
+ # Make sure file exists
+ if not os.path.exists(link_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.warn('Link shelf file does not exist; initializing', link_path)
+        initialize(pdsdir, logger=logger)
+        return
+
+ # Read link shelf file
+ shelf_linkdict = load_links(pdsdir.abspath, logger=logger)
+
+ # Generate link dict
+ (dir_linkdict, latest_mtime) = generate_links(pdsdir.abspath, logger=logger)
+
+ # Compare
+ canceled = (dir_linkdict == shelf_linkdict)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+
+ link_pypath = link_path.replace('.pickle', '.py')
+ link_mtime = min(os.path.getmtime(link_path),
+ os.path.getmtime(link_pypath))
+ if latest_mtime > link_mtime:
+ logger.info('!!! Link shelf file content is up to date',
+ link_path, force=True)
+
+ dt = datetime.datetime.fromtimestamp(latest_mtime)
+ logger.info('!!! Latest holdings file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ dt = datetime.datetime.fromtimestamp(link_mtime)
+ logger.info('!!! Link shelf file modification date',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+
+ delta = latest_mtime - link_mtime
+ if delta >= 86400/10:
+ logger.info('!!! Link shelf file is out of date %.1f days' %
+ (delta / 86400.), force=True)
+ else:
+ logger.info('!!! Link shelf file is out of date %.1f minutes' %
+ (delta / 60.), force=True)
+
+ dt = datetime.datetime.now()
+ os.utime(link_path)
+ os.utime(link_pypath)
+ logger.info('!!! Time tag on link shelf files set to',
+ dt.strftime('%Y-%m-%dT%H-%M-%S'), force=True)
+ else:
+            logger.info('!!! Link shelf file is up to date; repair canceled',
+ link_path, force=True)
+ return
+
+ # Move files and write new links
+ move_old_links(link_path, logger=logger)
+ write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger)
+
+def update(pdsdir, logger=None):
+
+ link_path = pdsdir.shelf_path_and_lskip('link')[0]
+
+ # Make sure link shelf file exists
+ if not os.path.exists(link_path):
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.warn('Link shelf file does not exist; initializing', link_path)
+ initialize(pdsdir, logger=logger)
+ return
+
+ # Read link shelf file
+ shelf_linkdict = load_links(pdsdir.abspath, logger=logger)
+
+ # Generate link dict
+ (dir_linkdict,
+ latest_mtime) = generate_links(pdsdir.abspath, shelf_linkdict,
+ logger=logger)
+
+ # Compare
+ canceled = (dir_linkdict == shelf_linkdict)
+ if canceled:
+ logger = logger or pdslogger.PdsLogger.get_logger(LOGNAME)
+ logger.info('!!! Link shelf file content is complete; update canceled',
+ link_path, force=True)
+ return
+
+ # Move files and write new links
+ move_old_links(link_path, logger=logger)
+ write_linkdict(pdsdir.abspath, dir_linkdict, logger=logger)
+
+################################################################################
+
+def main():
+
+ # Set up parser
+ parser = argparse.ArgumentParser(
+ description='pdslinkshelf: Create, maintain and validate shelves of ' +
+ 'links between files.')
+
+ parser.add_argument('--initialize', '--init', const='initialize',
+ default='', action='store_const', dest='task',
+                        help='Create a link shelf file for a bundle. Abort ' +
+                             'if the file already exists.')
+
+ parser.add_argument('--reinitialize', '--reinit', const='reinitialize',
+ default='', action='store_const', dest='task',
+                        help='Create a link shelf file for a bundle. Replace ' +
+ 'the file if it already exists.')
+
+ parser.add_argument('--validate', const='validate',
+ default='', action='store_const', dest='task',
+                        help='Validate every link in a bundle directory tree ' +
+ 'against its link shelf file.')
+
+ parser.add_argument('--repair', const='repair',
+ default='', action='store_const', dest='task',
+                        help='Validate every link in a bundle directory tree ' +
+                             'against its link shelf file. If any ' +
+                             'disagreement is found, replace the shelf ' +
+                             'file; otherwise leave it unchanged. If any of ' +
+                             'the files checked are newer than the link ' +
+                             'shelf file, update the shelf file\'s ' +
+                             'modification date.')
+
+ parser.add_argument('--update', const='update',
+ default='', action='store_const', dest='task',
+ help='Search a directory for any new files and add ' +
+ 'their links to the link shelf file. Links of ' +
+ 'pre-existing files are not checked.')
+
+ parser.add_argument('bundle', nargs='+', type=str,
+ help='The path to the root directory of a bundle.')
+
+ parser.add_argument('--log', '-l', type=str, default='',
+ help='Optional root directory for a duplicate of the ' +
+ 'log files. If not specified, the value of ' +
+ 'environment variable "%s" ' % LOGROOT_ENV +
+ 'is used. In addition, individual logs are ' +
+ 'written into the "logs" directory parallel to ' +
+                             '"pds4-holdings". Logs are created inside the ' +
+ '"pdslinkshelf" subdirectory of each log root ' +
+ 'directory.'
+ )
+
+ parser.add_argument('--quiet', '-q', action='store_true',
+ help='Do not also log to the terminal.')
+
+ # Parse and validate the command line
+ args = parser.parse_args()
+
+ if not args.task:
+ print('pdslinkshelf error: Missing task')
+ sys.exit(1)
+
+ status = 0
+
+ # Define the logging directory
+ if args.log == '':
+ try:
+ args.log = os.environ[LOGROOT_ENV]
+ except KeyError:
+ args.log = None
+
+ # Initialize the logger
+ logger = pdslogger.PdsLogger(LOGNAME)
+ pdsfile.Pds4File.set_log_root(args.log)
+
+ if not args.quiet:
+ logger.add_handler(pdslogger.stdout_handler)
+
+ if args.log:
+ path = os.path.join(args.log, 'pdslinkshelf')
+ warning_handler = pdslogger.warning_handler(path)
+ logger.add_handler(warning_handler)
+
+ error_handler = pdslogger.error_handler(path)
+ logger.add_handler(error_handler)
+
+ # Generate a list of file paths before logging
+ paths = []
+ for path in args.bundle:
+
+ if not os.path.exists(path):
+ print('No such file or directory: ' + path)
+ sys.exit(1)
+
+ path = os.path.abspath(path)
+ pdsf = pdsfile.Pds4File.from_abspath(path)
+
+ if pdsf.checksums_:
+ print('No link shelf files for checksum files: ' + path)
+ sys.exit(1)
+
+ if pdsf.archives_:
+ print('No link shelf files for archive files: ' + path)
+ sys.exit(1)
+
+ if pdsf.is_bundleset_dir:
+ paths += [os.path.join(path, c) for c in pdsf.childnames]
+
+ else:
+ paths.append(os.path.abspath(path))
+
+ # Loop through tuples...
+ logger.open(' '.join(sys.argv))
+ try:
+ for path in paths:
+
+ pdsdir = pdsfile.Pds4File.from_abspath(path)
+            # skip bundleset-level readme files and *_support directory
+ # if not pdsdir.isdir or '_support' in pdsdir.abspath:
+ if not pdsdir.isdir:
+ continue
+
+ # Save logs in up to two places
+ logfiles = set([pdsdir.log_path_for_bundle('_links',
+ task=args.task,
+ dir='pdslinkshelf'),
+ pdsdir.log_path_for_bundle('_links',
+ task=args.task,
+ dir='pdslinkshelf',
+ place='parallel')])
+
+ # Create all the handlers for this level in the logger
+ local_handlers = []
+            # used by move_old_links(); declared 'global' so the module-level
+            # list that move_old_links() reads is the one updated here
+            global LOGDIRS
+            LOGDIRS = []
+            for logfile in logfiles:
+                local_handlers.append(pdslogger.file_handler(logfile))
+                logdir = os.path.split(logfile)[0]
+                LOGDIRS.append(logdir)
+
+ # These handlers are only used if they don't already exist
+ warning_handler = pdslogger.warning_handler(logdir)
+ error_handler = pdslogger.error_handler(logdir)
+ local_handlers += [warning_handler, error_handler]
+
+ # Open the next level of the log
+ if len(paths) > 1:
+ logger.blankline()
+
+ logger.open('Task "' + args.task + '" for', path,
+ handler=local_handlers)
+
+ try:
+ for logfile in logfiles:
+ logger.info('Log file', logfile)
+
+ if args.task == 'initialize':
+ initialize(pdsdir)
+
+ elif args.task == 'reinitialize':
+ reinitialize(pdsdir)
+
+ elif args.task == 'validate':
+ validate(pdsdir)
+
+ elif args.task == 'repair':
+ repair(pdsdir)
+
+ else: # update
+ update(pdsdir)
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ raise
+
+ finally:
+ _ = logger.close()
+
+ except (Exception, KeyboardInterrupt) as e:
+ logger.exception(e)
+ status = 1
+ raise
+
+ finally:
+ (fatal, errors, warnings, tests) = logger.close()
+ if fatal or errors: status = 1
+
+ sys.exit(status)
+
+if __name__ == '__main__':
+ main()
diff --git a/pdsfile/pds3file/__init__.py b/pdsfile/pds3file/__init__.py
index 255d449..e8591b1 100644
--- a/pdsfile/pds3file/__init__.py
+++ b/pdsfile/pds3file/__init__.py
@@ -11,6 +11,7 @@
from pdsfile.pdsfile import PdsFile
from . import rules
from pdsfile.preload_and_cache import cache_lifetime_for_class
+import re
class Pds3File(PdsFile):
@@ -87,6 +88,9 @@ class Pds3File(PdsFile):
OPUS_ID_TO_SUBCLASS = rules.OPUS_ID_TO_SUBCLASS
FILESPEC_TO_BUNDLESET = rules.FILESPEC_TO_BUNDLESET
+ IDX_EXT = '.tab'
+ LBL_EXT = '.lbl'
+
def __init__(self):
super().__init__()
diff --git a/pdsfile/pds4file/__init__.py b/pdsfile/pds4file/__init__.py
index c4c6c66..01f326f 100644
--- a/pdsfile/pds4file/__init__.py
+++ b/pdsfile/pds4file/__init__.py
@@ -13,7 +13,16 @@
class Pds4File(PdsFile):
- BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|^cassini_iss|^cassini_vims)$')
+ PDS_HOLDINGS = 'pds4-holdings'
+ BUNDLE_DIR_NAME = 'bundles'
+
+ # TODO: Generalize PDS4 bundlenames in the future once we have more bundles
+ # REGEX
+ BUNDLESET_REGEX = re.compile(r'^(uranus_occs_earthbased|' +
+ r'^cassini_iss.*|' +
+ r'^cassini_vims.*|' +
+ r'^cassini_uvis.*)$')
+ BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I)
BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] +
r'(_v[0-9]+\.[0-9]+\.[0-9]+|' +
r'_v[0-9]+\.[0-9]+|_v[0-9]+|' +
@@ -22,15 +31,18 @@ class Pds4File(PdsFile):
r'((|_calibrated|_diagrams|_metadata|_previews)' +
r'(|_md5\.txt|\.tar\.gz))$')
BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I)
+    BUNDLENAME_REGEX = re.compile(r'^([a-zA-Z_].+)$')
- BUNDLENAME_REGEX = re.compile(r'((^uranus_occ_u\d{0,4}._[a-z]*_(fos|\d{2,3}cm))'+
- r'|(^cassini\_[a-z]{3,4}\_cruise))$')
+ BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I)
BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] +
r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$')
BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I)
-
- PDS_HOLDINGS = 'pds4-holdings'
- BUNDLE_DIR_NAME = 'bundles'
+ BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] +
+ r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+
+ r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+
+ r'_in_prep|_prelim|_peer_review|'+
+ r'_lien_resolution)$')
+ BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I)
# Logger
LOGGER = pdslogger.NullLogger()
@@ -68,6 +80,9 @@ class Pds4File(PdsFile):
LOCAL_PRELOADED = []
SUBCLASSES = {}
+ IDX_EXT = '.csv'
+ LBL_EXT = '.xml'
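+    # Note: IDX_EXT and LBL_EXT let shared PdsFile logic (e.g.
+    # indexshelf_abspath and label_basename) substitute '.csv'/'.xml' here
+    # for the '.tab'/'.lbl' used by Pds3File, without per-method overrides.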
+
def __init__(self):
super().__init__()
diff --git a/pdsfile/pdsfile.py b/pdsfile/pdsfile.py
index 50c710f..0a3cebd 100644
--- a/pdsfile/pdsfile.py
+++ b/pdsfile/pdsfile.py
@@ -259,30 +259,6 @@ class PdsFile(object):
VIEWABLE_EXTS = set(['jpg', 'png', 'gif', 'tif', 'tiff', 'jpeg', 'jpeg_small'])
DATAFILE_EXTS = set(['dat', 'img', 'cub', 'qub', 'fit', 'fits'])
- # REGEX
- BUNDLESET_REGEX = re.compile(r'^([A-Z][A-Z0-9x]{1,5}_[0-9x]{3}x)$')
- BUNDLESET_REGEX_I = re.compile(BUNDLESET_REGEX.pattern, re.I)
- BUNDLESET_PLUS_REGEX = re.compile(BUNDLESET_REGEX.pattern[:-1] +
- r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+
- r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+
- r'_in_prep|_prelim|_peer_review|'+
- r'_lien_resolution|)' +
- r'((|_calibrated|_diagrams|_metadata|_previews)' +
- r'(|_md5\.txt|\.tar\.gz))$')
- BUNDLESET_PLUS_REGEX_I = re.compile(BUNDLESET_PLUS_REGEX.pattern, re.I)
-
- BUNDLENAME_REGEX = re.compile(r'^([A-Z][A-Z0-9]{1,5}_(?:[0-9]{4}))$')
- BUNDLENAME_REGEX_I = re.compile(BUNDLENAME_REGEX.pattern, re.I)
- BUNDLENAME_PLUS_REGEX = re.compile(BUNDLENAME_REGEX.pattern[:-1] +
- r'(|_[a-z]+)(|_md5\.txt|\.tar\.gz)$')
- BUNDLENAME_PLUS_REGEX_I = re.compile(BUNDLENAME_PLUS_REGEX.pattern, re.I)
- BUNDLENAME_VERSION = re.compile(BUNDLENAME_REGEX.pattern[:-1] +
- r'(_v[0-9]+\.[0-9]+\.[0-9]+|'+
- r'_v[0-9]+\.[0-9]+|_v[0-9]+|'+
- r'_in_prep|_prelim|_peer_review|'+
- r'_lien_resolution)$')
- BUNDLENAME_VERSION_I = re.compile(BUNDLENAME_VERSION.pattern, re.I)
-
CATEGORY_REGEX = re.compile(r'^(|checksums\-)(|archives\-)(\w+)$')
CATEGORY_REGEX_I = re.compile(CATEGORY_REGEX.pattern, re.I)
@@ -1320,11 +1296,11 @@ def os_path_exists(cls, abspath, force_case_sensitive=False):
return os.path.exists(abspath)
# Handle index rows
- if '.tab/' in abspath:
- parts = abspath.partition('.tab/')
- if not cls.os_path_exists(parts[0] + '.tab'):
+ if f'{cls.IDX_EXT}/' in abspath:
+ parts = abspath.partition(f'{cls.IDX_EXT}/')
+ if not cls.os_path_exists(parts[0] + cls.IDX_EXT):
return False
- pdsf = cls.from_abspath(parts[0] + '.tab')
+ pdsf = cls.from_abspath(parts[0] + cls.IDX_EXT)
return (pdsf.exists and
pdsf.child_of_index(parts[2], flag='').exists)
@@ -1865,14 +1841,14 @@ def indexshelf_abspath(self):
cls = type(self)
if self._indexshelf_abspath is None:
- if self.extension not in ('.tab', '.TAB'):
+ if self.extension not in (cls.IDX_EXT, cls.IDX_EXT.upper()):
self._indexshelf_abspath = ''
else:
abspath = self.abspath
abspath = abspath.replace(f'/{cls.PDS_HOLDINGS}/',
f'/{cls.PDS_HOLDINGS}/_indexshelf-')
- abspath = abspath.replace('.tab', '.pickle')
- abspath = abspath.replace('.TAB', '.pickle')
+ abspath = abspath.replace(cls.IDX_EXT, '.pickle')
+ abspath = abspath.replace(cls.IDX_EXT.upper(), '.pickle')
self._indexshelf_abspath = abspath
self._recache()
@@ -1885,6 +1861,7 @@ def is_index(self):
presence of the corresponding indexshelf file.
"""
+ cls = type(self)
if self._is_index is None:
abspath = self.indexshelf_abspath
if abspath and os.path.exists(abspath):
@@ -1895,7 +1872,7 @@ def is_index(self):
# file is being created.
# XXX This is a real hack and should be looked at again later
if ('/metadata/' in self.abspath
- and self.abspath.lower().endswith('.tab')):
+ and self.abspath.lower().endswith(cls.IDX_EXT)):
return True # this value is not cached
self._is_index = False
@@ -1911,9 +1888,11 @@ def index_pdslabel(self):
if not self.is_index:
return None
+ cls = type(self)
if self._index_pdslabel is None:
- label_abspath = self.abspath.replace ('.tab', '.lbl')
- label_abspath = label_abspath.replace('.TAB', '.LBL')
+ label_abspath = self.abspath.replace (cls.IDX_EXT, cls.LBL_EXT)
+ label_abspath = label_abspath.replace(cls.IDX_EXT.upper(),
+ cls.LBL_EXT.upper())
try:
self._index_pdslabel = pdsparser.PdsLabel.from_file(label_abspath)
except:
@@ -2626,9 +2605,9 @@ def label_basename(self):
# Take a first guess at the label filename; PDS3 only!
if self.extension.isupper():
- ext_guesses = ('.LBL', '.lbl')
+ ext_guesses = (cls.LBL_EXT.upper(), cls.LBL_EXT)
else:
- ext_guesses = ('.lbl', '.LBL')
+ ext_guesses = (cls.LBL_EXT, cls.LBL_EXT.upper())
rootname = self.basename[:-len(self.extension)]
test_basenames = [rootname + ext for ext in ext_guesses]
@@ -4832,7 +4811,7 @@ def checksum_path_and_lskip(self):
raise ValueError('No checksums of checksum files: ' +
self.logical_path)
- if self.voltype_ == 'volumes/':
+ if self.voltype_ == 'volumes/' or self.voltype_ == 'bundles/':
suffix = ''
else:
suffix = '_' + self.voltype_[:-1]
@@ -5473,7 +5452,8 @@ def basename_is_label(self, basename):
basename -- basename of a file
"""
- return (len(basename) > 4) and (basename[-4:].lower() == '.lbl')
+ cls = type(self)
+ return (len(basename) > 4) and (basename[-4:].lower() == cls.LBL_EXT)
def basename_is_viewable(self, basename=None):
"""Return True if this basename is viewable. Override if viewable files can
@@ -5966,8 +5946,8 @@ def associated_abspaths(self, category, must_exist=True):
for pattern in patterns:
# Handle an index row by separating the filepath from the suffix
- if '.tab/' in pattern:
- parts = pattern.rpartition('.tab')
+ if f'{cls.IDX_EXT}/' in pattern:
+ parts = pattern.rpartition(cls.IDX_EXT)
pattern = parts[0] + parts[1]
suffix = parts[2][1:]
else: