Fix error querying Virgo frame type (gwastro#4523)

* Fix error querying Virgo frame type * Typo * Typo * Implement Ian's suggestion * Make it work * Use an actual DeprecationWarning
acorreia61201 · Apr 4, 2024 · 5c55d59 · 5c55d59
1 parent 2899f8f
commit 5c55d59
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 26 deletions.
diff --git a/pycbc/frame/frame.py b/pycbc/frame/frame.py
@@ -17,14 +17,19 @@
 This modules contains functions for reading in data from frame files or caches
 """
 
-import lalframe, logging
-import lal
-import numpy
+import logging
+import warnings
+import os.path
+import glob
+import time
 import math
-import os.path, glob, time
+import re
+from urllib.parse import urlparse
+import numpy
+import lalframe
+import lal
 from gwdatafind import find_urls as find_frame_urls
 import pycbc
-from urllib.parse import urlparse
 from pycbc.types import TimeSeries, zeros
 
 
@@ -256,13 +261,15 @@ def read_frame(location, channels, start_time=None,
     else:
         return _read_channel(channels, stream, start_time, duration)
 
-def frame_paths(frame_type, start_time, end_time, server=None, url_type='file'):
-    """Return the paths to a span of frame files
+def frame_paths(
+    frame_type, start_time, end_time, server=None, url_type='file', site=None
+):
+    """Return the paths to a span of frame files.
 
     Parameters
     ----------
     frame_type : string
-        The string representation of the frame type (ex. 'H1_ER_C00_L1')
+        The string representation of the frame type (ex. 'H1_ER_C00_L1').
     start_time : int
         The start time that we need the frames to span.
     end_time : int
@@ -274,6 +281,11 @@ def frame_paths(frame_type, start_time, end_time, server=None, url_type='file'):
         Returns only frame URLs with a particular scheme or head such
         as "file" or "https". Default is "file", which queries locally
         stored frames. Option can be disabled if set to None.
+    site : string, optional
+        One-letter string specifying which site you want data from (H, L, V,
+        etc).  If not given, the site is assumed to be the first letter of
+        `frame_type`, which is usually (but not always) a safe assumption.
+
     Returns
     -------
     paths : list of paths
@@ -283,22 +295,73 @@ def frame_paths(frame_type, start_time, end_time, server=None, url_type='file'):
     --------
     >>> paths = frame_paths('H1_LDAS_C02_L2', 968995968, 968995968+2048)
     """
-    site = frame_type[0]
+    if site is None:
+        # this case is tolerated for backward compatibility
+        site = frame_type[0]
+        warnings.warn(
+            f'Guessing site {site} from frame type {frame_type}',
+            DeprecationWarning
+        )
     cache = find_frame_urls(site, frame_type, start_time, end_time,
                             urltype=url_type, host=server)
     return [urlparse(entry).path for entry in cache]
 
+
+def get_site_from_type_or_channel(frame_type, channels):
+    """Determine the site for querying gwdatafind (H, L, V, etc) based on
+    substrings of the frame type and channel(s).
+
+    The type should begin with S: or SN:, in which case S is taken as the
+    site.  Otherwise, the same is done with the channel (with the first
+    channel if more than one are given). If that also fails, the site is
+    taken to be the first letter of the frame type, which is usually
+    (but not always) a correct assumption.
+
+    Parameters
+    ----------
+    frame_type : string
+        The frame type, ideally prefixed by the site indicator.
+    channels : string or list/tuple of strings
+        The channel name or names.
+
+    Returns
+    -------
+    site : string
+        The site letter.
+    frame_type : string
+        The frame type with the site prefix (if any) removed.
+    """
+    # One site character, an optional second character, then a colon.
+    site_re = r'^([^:])[^:]?:'
+    m = re.match(site_re, frame_type)
+    if m:
+        return m.group(1), frame_type[m.end():]
+    # Fall back to the (first) channel name, e.g. 'H1:LDAS-STRAIN'.
+    chan = channels[0] if isinstance(channels, (list, tuple)) else channels
+    m = re.match(site_re, chan)
+    if m:
+        return m.group(1), frame_type
+    warnings.warn(
+        f'Guessing site {frame_type[0]} from frame type {frame_type}',
+        DeprecationWarning, stacklevel=2
+    )
+    return frame_type[0], frame_type
+
+
 def query_and_read_frame(frame_type, channels, start_time, end_time,
                          sieve=None, check_integrity=False):
     """Read time series from frame data.
 
-    Query for the locatin of physical frames matching the frame type. Return
+    Query for the location of physical frames matching the frame type. Return
     a time series containing the channel between the given start and end times.
 
     Parameters
     ----------
     frame_type : string
-        The type of frame file that we are looking for.
+        The type of frame file that we are looking for. The string should begin
+        with S: or SN:, in which case S is taken as the site to query. If this
+        is not the case, the site will be guessed from the channel name or from
+        the type in a different way, which may not work.
     channels : string or list of strings
         Either a string that contains the channel name or a list of channel
         name strings.
@@ -324,6 +387,8 @@ def query_and_read_frame(frame_type, channels, start_time, end_time,
     >>> ts = query_and_read_frame('H1_LDAS_C02_L2', 'H1:LDAS-STRAIN',
     >>>                               968995968, 968995968+2048)
     """
+    site, frame_type = get_site_from_type_or_channel(frame_type, channels)
+
     # Allows compatibility with our standard tools
     # We may want to place this into a higher level frame getting tool
     if frame_type in ['LOSC_STRAIN', 'GWOSC_STRAIN']:
@@ -337,17 +402,23 @@ def query_and_read_frame(frame_type, channels, start_time, end_time,
         from pycbc.frame.gwosc import read_frame_gwosc
         return read_frame_gwosc(channels, start_time, end_time)
 
-    logging.info('querying datafind server')
-    paths = frame_paths(frame_type, int(start_time), int(numpy.ceil(end_time)))
-    logging.info('found files: %s' % (' '.join(paths)))
-    return read_frame(paths, channels,
-                      start_time=start_time,
-                      end_time=end_time,
-                      sieve=sieve,
-                      check_integrity=check_integrity)
+    logging.info('Querying datafind server')
+    paths = frame_paths(
+        frame_type,
+        int(start_time),
+        int(numpy.ceil(end_time)),
+        site=site
+    )
+    logging.info('Found frame file paths: %s', ' '.join(paths))
+    return read_frame(
+        paths,
+        channels,
+        start_time=start_time,
+        end_time=end_time,
+        sieve=sieve,
+        check_integrity=check_integrity
+    )
 
-__all__ = ['read_frame', 'frame_paths',
-           'query_and_read_frame']
 
 def write_frame(location, channels, timeseries):
     """Write a list of time series to a single frame file.
@@ -887,3 +958,15 @@ def null_advance(self, blocksize):
         """
         self.idq.null_advance(blocksize)
         self.idq_state.null_advance(blocksize)
+
+
+__all__ = [
+    'locations_to_cache',
+    'read_frame',
+    'query_and_read_frame',
+    'frame_paths',
+    'write_frame',
+    'DataBuffer',
+    'StatusBuffer',
+    'iDQBuffer'
+]
diff --git a/pycbc/strain/strain.py b/pycbc/strain/strain.py
@@ -504,8 +504,10 @@ def insert_strain_option_group(parser, gps_times=True):
     # Use datafind to get frame files
     data_reading_group.add_argument("--frame-type",
                             type=str,
+                            metavar="S:TYPE",
                             help="(optional), replaces frame-files. Use datafind "
-                                 "to get the needed frame file(s) of this type.")
+                                 "to get the needed frame file(s) of this type "
+                                 "from site S.")
     # Filter frame files by URL
     data_reading_group.add_argument("--frame-sieve",
                             type=str,
@@ -704,7 +706,7 @@ def insert_strain_option_group_multi_ifo(parser, gps_times=True):
                             help="Store of time series data in hdf format")
     # Use datafind to get frame files
     data_reading_group_multi.add_argument("--frame-type", type=str, nargs="+",
-                                    action=MultiDetOptionAction,
+                                    action=MultiDetOptionActionSpecial,
                                     metavar='IFO:FRAME_TYPE',
                                     help="(optional) Replaces frame-files. "
                                          "Use datafind to get the needed frame "
@@ -1975,9 +1977,12 @@ def from_cli(cls, ifo, args, maxlen):
             idq_state_channel = ':'.join([ifo, args.idq_state_channel[ifo]])
 
         if args.frame_type:
-            frame_src = pycbc.frame.frame_paths(args.frame_type[ifo],
-                                                args.start_time,
-                                                args.end_time)
+            frame_src = pycbc.frame.frame_paths(
+                args.frame_type[ifo],
+                args.start_time,
+                args.end_time,
+                site=ifo[0]
+            )
         else:
             frame_src = [args.frame_src[ifo]]
         strain_channel = ':'.join([ifo, args.channel_name[ifo]])