From 8d41acbbd511e4e6ddb39dbe3c7e7701dd6d798a Mon Sep 17 00:00:00 2001 From: Duncan Macleod Date: Fri, 25 Aug 2023 03:51:40 -0700 Subject: [PATCH] remove all support for XML files closes #165 --- docs/utilities/merge.rst | 10 +---- docs/workflow/index.rst | 5 +-- docs/workflow/simulations.rst | 30 --------------- omicron/cli/merge_with_gaps.py | 65 +++----------------------------- omicron/cli/process.py | 38 +++---------------- omicron/cli/show.py | 20 ++-------- omicron/nagios.py | 4 +- omicron/parameters.py | 3 +- omicron/tests/test_io.py | 2 +- omicron/tests/test_parameters.py | 8 ++-- 10 files changed, 26 insertions(+), 159 deletions(-) diff --git a/docs/utilities/merge.rst b/docs/utilities/merge.rst index e95fa98..d4c10af 100644 --- a/docs/utilities/merge.rst +++ b/docs/utilities/merge.rst @@ -1,8 +1,8 @@ Merge trigger files ################### -In order to not end up with millions of small ``.root``, ``.hdf5``, and -``.xml`` files each representing a +In order to not end up with millions of small ``.root`` and ``.hdf5`` +files each representing a small chunk of processed time, the ``omicron-process`` workflow will merge contiguous files together using the ``omicron-merge-with-gaps`` command line utility. The purpose of this control utility is to detect any gaps in the expected list @@ -20,8 +20,6 @@ merge te contiguous trigger files: +------------+-----------+-------------------------------------------------+ | HDF5 | ``.hdf5`` | ``omicron-hdf5-merge`` | +------------+-----------+-------------------------------------------------+ -| ligolw | ``.xml `` | ``ligolw_add`` and ``gzip`` | -+------------+-----------+-------------------------------------------------+ | Text | ``.txt `` | ``?`` | +------------+-----------+-------------------------------------------------+ @@ -35,8 +33,6 @@ the :meth:`omicron.io.merge_hdf5_files` method: .. 
automethod:: omicron.io.merge_hdf5_files -The ``ligolw_add`` is an external program contained in the ``lscsoft-glue`` package. - -------------------- @@ -52,8 +48,6 @@ message of each program: .. command-output:: omicron-hdf5-merge --help -.. command-output:: ligolw_add --help - Reducing file count and disk space ################################## diff --git a/docs/workflow/index.rst b/docs/workflow/index.rst index 417ebeb..d7cba47 100644 --- a/docs/workflow/index.rst +++ b/docs/workflow/index.rst @@ -59,8 +59,7 @@ The ``omicron-process`` executable will do the following The DAG will normally do something like this: #. process raw data using ``omicron.exe`` -#. merge contiguous output files with ``.root``, ``.h5``, and ``.xml`` extensions -#. gzip ``.xml`` files to save space +#. merge contiguous output files with ``.root`` and ``.h5`` extensions #. the merged files are copied to the archive directory, nominally ``/home/detchar/triggers///`` #. if everything completes successfully, trigger and log files are deleted @@ -92,7 +91,7 @@ where the path components are as follows e.g.:: - ~/triggers/L1/GDS_CALIB_STRAIN_OMICRON/12345/L1-GDS_CALIB_STRAIN_OMICRON-1234567890-100.xml.gz + ~/triggers/L1/GDS_CALIB_STRAIN_OMICRON/12345/L1-GDS_CALIB_STRAIN_OMICRON-1234567890-100.h5 ----------------------------------- Processing a specific time interval diff --git a/docs/workflow/simulations.rst b/docs/workflow/simulations.rst index ed6af48..cd705b3 100644 --- a/docs/workflow/simulations.rst +++ b/docs/workflow/simulations.rst @@ -86,33 +86,3 @@ The results will be in the directory: .. code-block:: ./run/merge/Z1:SIM-SINE_GAUSS/ - -One way to view them is to use ligolw_print. The results, formatted for readability: - -.. 
code-block:: - - ligolw_print -t sngl_burst -c 'peak_time' -c 'snr' -c 'peak_frequency' \ - 'run/merge/Z1:SIM-SINE_GAUSS/Z1-SIM_SINE_GAUSS_OMICRON-1346050820-196.xml.gz' - - +------------+-----------+--------------+ - | peak_time | SNR | Frequency | - +============+===========+==============| - | 1346050868 | 37.1 | 236.4 | - | 1346050875 | 22.1 | 480.7 | - | 1346050882 | 15.2 | 771.3 | - | 1346050888 | 37.1 | 243.9 | - | 1346050894 | 21.4 | 459.1 | - | 1346050902 | 13.7 | 699.8 | - | 1346050908 | 36.5 | 266.1 | - | 1346050915 | 22.4 | 480.7 | - | 1346050922 | 13.9 | 771.3 | - | 1346050928 | 36.0 | 236.4 | - | 1346050935 | 22.1 | 480.6 | - | 1346050942 | 14.2 | 771.3 | - | 1346050948 | 38.7 | 243.9 | - | 1346050954 | 24.3 | 459.1 | - | 1346050962 | 15.0 | 699.7 | - +------------+-----------+--------------+ - - - diff --git a/omicron/cli/merge_with_gaps.py b/omicron/cli/merge_with_gaps.py index 3861d74..ed1291f 100644 --- a/omicron/cli/merge_with_gaps.py +++ b/omicron/cli/merge_with_gaps.py @@ -28,7 +28,6 @@ prog_start_time = time.time() import argparse import glob -import gzip import logging from .. 
import __version__ from pathlib import Path @@ -52,14 +51,12 @@ def get_merge_cmd(ext): """ Determine the command used to coalescew individual trigger files - :param str ext: file extension: xml, h5 or root + :param str ext: file extension: h5 or root """ if ext == 'root': ret = 'omicron-root-merge' elif ext == 'h5': ret = 'omicron-hdf5-merge' - elif 'xml' in ext: - ret = 'ligolw_add' else: raise AttributeError(f'Unknown trigger file typr {ext}') ret_path = shutil.which(ret) @@ -68,23 +65,7 @@ def get_merge_cmd(ext): return ret_path -def is_old_ligolw(path): - flag = "ilwd:char" - if 'gz' in path.name: - with gzip.open(str(path.absolute()), 'r') as gz: - for line in gz: - if flag in str(line): - return True - return False - else: - with path.open('r') as fp: - for line in fp: - if flag in line: - return True - return False - - -def do_merge(opath, curfiles, chan, stime, etime, ext, skip_gzip): +def do_merge(opath, curfiles, chan, stime, etime, ext): """ Given the list of trigger files merge them all into a single file :param Path opath: output directory @@ -93,7 +74,6 @@ def do_merge(opath, curfiles, chan, stime, etime, ext, skip_gzip): :param int stime: Start GPS time for file list :param int etime: End GPS time :param str ext: trigger file extension, identifying file type - :param boolean skip_gzip: if type is xml do not compress merged file """ outfile_path = opath / f'{chan}-{stime}-{etime - stime}.{ext}' ret = None @@ -108,44 +88,20 @@ def do_merge(opath, curfiles, chan, stime, etime, ext, skip_gzip): returncode = 0 else: cmd = [get_merge_cmd(ext)] - if 'xml' in ext: # also accept xml.gz - outfile_path = Path(str(outfile_path.absolute()).replace('.xml.gz', '.xml')) - cmd.append(f'--output={outfile_path}') - if is_old_ligolw(curfiles[0]): - cmd.append('--ilwdchar-compat') - logger.debug('Working with old ligolw format') for cur in curfiles: cmd.append(str(cur.absolute())) - if 'xml' not in ext: - cmd.append(str(outfile_path.absolute())) + 
cmd.append(str(outfile_path.absolute())) logger.info(f'Merging {len(curfiles)} {ext} files into {outfile_path}') logger.debug(f'Merge command:\n {" ".join(cmd)}') result = subprocess.run(cmd, capture_output=True) returncode = result.returncode - err_old_fmt = b"invalid type 'ilwd:char'" - if returncode == 1 and 'xml' in ext and err_old_fmt in result.stderr: - # old ligolw format seems to be the problem - cmd = [get_merge_cmd(ext), '--ilwdchar-compat', f'--output={outfile_path}'] - cmd.extend(curfiles) - logger.info(f'Retry merging {len(curfiles)} into {outfile_path} using old xml format') - result = subprocess.run(cmd, capture_output=True) - returncode = result.returncode - if returncode == 0: logger.debug(f'Merge of {ext} files succeeded') else: logger.error(f'Return code:{returncode}, stderr:\n{result.stderr.decode("UTF-8")}') - if 'xml' in ext and returncode == 0 and not skip_gzip and outfile_path.suffix != '.gz': - logger.info(f'Compressing {outfile_path} with gzip') - res2 = subprocess.run(['gzip', '-9', '--force', outfile_path], capture_output=True) - if res2.returncode == 0: - ret = str(outfile_path.absolute()) + '.gz' - else: - logger.error(f'gzip error on {outfile_path}:\n {res2.stderr.decode("UTF-8")}') - else: - ret = str(outfile_path.absolute()) + ret = str(outfile_path.absolute()) return ret @@ -163,11 +119,6 @@ def valid_file(path, uint_bug): if path.exists(): if path.name.endswith('.h5'): table = EventTable.read(path, path='/triggers') - elif path.name.endswith('.xml.gz') or path.name.endswith('.xml'): - if uint_bug: - sed_cmd = ['sed', '-i', '', '-e', 's/uint_8s/int_8u/g', str(path.absolute())] - subprocess.run(sed_cmd) - table = EventTable.read(path, tablename='sngl_burst') elif path.name.endswith('.root'): # reading root files fail if there is a : in the name cwd = Path.cwd() @@ -200,10 +151,6 @@ def main(): parser.add_argument('-o', '--out-dir', help='Path to output directory for merged files') parser.add_argument('-n', '--no-merge', 
action='store_true', default=False, help='Do not merge files, only copy to output indir') - parser.add_argument('--no-gzip', action='store_true', default=False, - help='Do not compress the ligolw xml files') - parser.add_argument('--uint-bug', default=False, action='store_true', - help='Fix problem XML files created by old version of Omicron beforew merging.') parser.add_argument('--file-list', help='File with list of input file paths, one per line') parser.add_argument('infiles', nargs='*', help='List of paths to files to merge or copy') @@ -309,7 +256,7 @@ def main(): curfiles.append(inpath) else: # break in continuity or start of a new metric day - outfile = do_merge(out_dir, curfiles, name, start_time, end_time, ext, args.no_gzip) + outfile = do_merge(out_dir, curfiles, name, start_time, end_time, ext) if outfile: outfiles.append(outfile) else: @@ -320,7 +267,7 @@ def main(): end_time = etime curfiles = [inpath] if curfiles: - outfile = do_merge(out_dir, curfiles, name, start_time, end_time, ext, args.no_gzip) + outfile = do_merge(out_dir, curfiles, name, start_time, end_time, ext) if outfile: outfiles.append(outfile) else: diff --git a/omicron/cli/process.py b/omicron/cli/process.py index 0ca1735..4cf55ef 100644 --- a/omicron/cli/process.py +++ b/omicron/cli/process.py @@ -422,25 +422,12 @@ def create_parser(): default=False, help='skip running omicron-hdf5-merge (default: %(default)s)', ) - pipeg.add_argument( - '--skip-ligolw_add', - action='store_true', - default=False, - help='skip running ligolw_add (default: %(default)s)', - ) - pipeg.add_argument( - '--skip-gzip', - action='store_true', - default=False, - help='skip running gzip (default: %(default)s)', - ) pipeg.add_argument( '--skip-postprocessing', action='store_true', default=False, help='skip all post-processing, equivalent to ' '--skip-root-merge --skip-hdf5-merge ' - '--skip-ligolw_add --skip-gzip ' '(default: %(default)s)', ) pipeg.add_argument( @@ -484,13 +471,14 @@ def main(args=None): 
"--executable on the command line") # validate processing options - if all((args.skip_root_merge, args.skip_hdf5_merge, args.skip_ligolw_add, - args.skip_gzip, not args.archive)): + if all((args.skip_root_merge, args.skip_hdf5_merge, not args.archive)): args.skip_postprocessing = True if args.archive: argsd = vars(args) - for arg in ['skip-root-merge', 'skip-hdf5-merge', - 'skip-ligolw-add', 'skip-gzip']: + for arg in [ + 'skip-root-merge', + 'skip-hdf5-merge', + ]: if argsd[arg.replace('-', '_')]: parser.error(f"Cannot use --{arg} with --archive") @@ -1050,8 +1038,6 @@ def main(args=None): prog_path['omicron-merge'] = find_executable('omicron-merge-with-gaps') prog_path['rootmerge'] = find_executable('omicron-root-merge') prog_path['hdf5merge'] = find_executable('omicron-hdf5-merge') - prog_path['ligolw_add'] = find_executable('ligolw_add') - prog_path['gzip'] = find_executable('gzip') prog_path['omicron_archive'] = find_executable('omicron-archive') goterr = list() @@ -1175,20 +1161,6 @@ def main(args=None): f' --out-dir {mergepath} {hdf5files} ') rmfiles.append(hdf5files) - # add LIGO_LW operations - if 'xml' in fileformats: - xmlfiles = ' '.join(omicronfiles[c]['xml']) - for f in omicronfiles[c]['xml']: - ppnode.add_input_file(f) - - no_merge = '--no-merge' if args.skip_ligolw_add else '' - no_gzip = '--no-gzip' if args.skip_gzip else '' - operations.append( - f' {prog_path["omicron-merge"]} {no_merge} {no_gzip} --uint-bug ' - f' --out-dir {mergepath} {xmlfiles} ') - - rmfiles.append(xmlfiles) - # add ASCII operations if 'txt' in fileformats: txtfiles = ' '.join(omicronfiles[c]['txt']) diff --git a/omicron/cli/show.py b/omicron/cli/show.py index 301acc8..613df6a 100644 --- a/omicron/cli/show.py +++ b/omicron/cli/show.py @@ -116,7 +116,7 @@ def create_parser(): '-t', '--file-type', default='xml.gz', - choices=['root', 'xml.gz', 'h5'], + choices=['root', 'h5'], help='type of files to find', ) @@ -236,25 +236,11 @@ def main(args=None): # -- read events 
---------------------------------------------------------- # set default columns - if not args.column and args.file_type == 'xml.gz': - args.column = ['peak', 'peak_frequency', 'snr'] - elif not args.column: + if not args.column: args.column = ['time', 'frequency', 'snr'] # read events (with simple filter on segments) - if args.file_type == 'xml.gz': - cname = args.channel.split(':', 1)[1] - events = EventTable.read( - cache, - format='ligolw', - tablename='sngl_burst', - selection=[ - ('peak', in_segmentlist, segs), - 'channel == "{0}"'.format(cname), - ], - columns=set(args.column + ['peak', 'channel']), - ) - elif args.file_type == 'root': + if args.file_type == 'root': events = EventTable.read( cache, format='root', diff --git a/omicron/nagios.py b/omicron/nagios.py index cd08358..3d55a40 100644 --- a/omicron/nagios.py +++ b/omicron/nagios.py @@ -168,7 +168,7 @@ def find_archive_latency(channel, padding, frametype=None, state=None, ------- latency : `dict` a `dict` of `(ext, latency)` pairs for each file extension stored in - the archive ('root', 'xml.gz') + the archive ('root', 'h5') """ ifo = channel[:2] obs = ifo[0] @@ -182,7 +182,7 @@ def find_archive_latency(channel, padding, frametype=None, state=None, target -= padding # find latest file latency = {} - for ext in ['root', 'xml.gz']: + for ext in ['root', 'h5']: f = find_latest_omicron_file(channel, base, ext=ext) end = file_segment(f)[1] latency[ext] = (int(target - end), f) diff --git a/omicron/parameters.py b/omicron/parameters.py index b36671f..72be89f 100644 --- a/omicron/parameters.py +++ b/omicron/parameters.py @@ -403,7 +403,7 @@ def distribute_segment(self, start, end, nperjob=1): return out def output_formats(self): - return [fmt for fmt in ('root', 'txt', 'xml', 'hdf5') if + return [fmt for fmt in ('root', 'txt', 'hdf5') if fmt in self.get('OUTPUT', 'FORMAT')] def output_files(self, start, end, flatten=False): @@ -432,7 +432,6 @@ def output_files(self, start, end, flatten=False): extension = { 
'root': 'root', 'txt': 'txt', - 'xml': 'xml', 'hdf5': 'h5', } diff --git a/omicron/tests/test_io.py b/omicron/tests/test_io.py index 229b8be..86880b2 100644 --- a/omicron/tests/test_io.py +++ b/omicron/tests/test_io.py @@ -25,7 +25,7 @@ def test_get_archive_filename(): assert io.get_archive_filename('L1:GDS-CALIB_STRAIN', 0, 100) == ( '%s/L1/GDS_CALIB_STRAIN_OMICRON/00000/' - 'L1-GDS_CALIB_STRAIN_OMICRON-0-100.xml.gz' % const.OMICRON_ARCHIVE) + 'L1-GDS_CALIB_STRAIN_OMICRON-0-100.h5' % const.OMICRON_ARCHIVE) assert io.get_archive_filename( 'L1:GDS-CALIB_STRAIN', 1234567890, 123, archive='/triggers', filetag='TEST-TAg', ext='root') == ( diff --git a/omicron/tests/test_parameters.py b/omicron/tests/test_parameters.py index f9c9b1c..7330a93 100644 --- a/omicron/tests/test_parameters.py +++ b/omicron/tests/test_parameters.py @@ -189,7 +189,7 @@ def test_distribute_segments(pars): def test_output_files(pars): pars.set('PARAMETER', 'TIMING', '64 4') - pars.set('OUTPUT', 'FORMAT', 'root xml') + pars.set('OUTPUT', 'FORMAT', 'root hdf5') pars.set('DATA', 'CHANNELS', 'X1:TEST-CHANNEL') assert pars.output_files(0, 100) == { 'X1:TEST-CHANNEL': @@ -198,9 +198,9 @@ def test_output_files(pars): './X1:TEST-CHANNEL/X1-TEST_CHANNEL_OMICRON-2-60.root', './X1:TEST-CHANNEL/X1-TEST_CHANNEL_OMICRON-62-36.root', ], - 'xml': [ - './X1:TEST-CHANNEL/X1-TEST_CHANNEL_OMICRON-2-60.xml', - './X1:TEST-CHANNEL/X1-TEST_CHANNEL_OMICRON-62-36.xml', + 'hdf5': [ + './X1:TEST-CHANNEL/X1-TEST_CHANNEL_OMICRON-2-60.h5', + './X1:TEST-CHANNEL/X1-TEST_CHANNEL_OMICRON-62-36.h5', ], }, }