From aa862894a5216651c44ae8bf071150f240a9d3e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20S=C3=A1nchez-Gallego?= Date: Thu, 7 Nov 2024 10:14:16 -0800 Subject: [PATCH] Troubleshooter for Overwatcher (#37) * Add some initial files * Merge branch 'main' into troubleshooter * Improve basic structure of the troubleshooter module * Try to fix building docker image for pull request * Attempt with ${{ github.head_ref || github.ref_name }} * Merge branch 'main' into troubleshooter * Add categorisation methods to the ErrorCode enum * Merge remote-tracking branch 'origin/main' into troubleshooter * Very basic but complete implementataion of the troubleshooter * Some tweaks to OverwatcherTask logging * Fix bug in twilight flats recipe for extra flats * Do not require overwatcher to be enabled for calibrations if dome does not change * Merge branch 'main' into troubleshooter * The troubleshooter blocks the observing loop while troubleshooting * Disable overwatcher and run cleanup in safety after closing dome * Add option to disable overwatcher on shutdown * Run a cleanup when disabling the observe loop with immediate=True * Add placeholder for emitting a critical error in the troubleshooter * Add MJD to the notification record in the DB * Merge branch 'main' into troubleshooter * Merge branch 'main' into troubleshooter * Add max_start_time to quick_cals and bias_sequence calibrations --- src/gort/enums.py | 40 +++++ src/gort/etc/calibrations.yaml | 4 +- src/gort/exceptions.py | 12 ++ src/gort/overwatcher/core.py | 4 +- src/gort/overwatcher/helpers/notifier.py | 3 +- src/gort/overwatcher/observer.py | 60 +++---- src/gort/overwatcher/overwatcher.py | 25 ++- src/gort/overwatcher/safety.py | 8 + .../overwatcher/troubleshooter/__init__.py | 12 ++ .../overwatcher/troubleshooter/recipes.py | 150 ++++++++++++++++++ .../troubleshooter/troubleshooter.py | 148 +++++++++++++++++ 11 files changed, 415 insertions(+), 51 deletions(-) create mode 100644 src/gort/overwatcher/troubleshooter/__init__.py create mode 100644 src/gort/overwatcher/troubleshooter/recipes.py create mode 100644 src/gort/overwatcher/troubleshooter/troubleshooter.py diff --git a/src/gort/enums.py b/src/gort/enums.py index ee2c5e0..7803d36 100644 --- a/src/gort/enums.py +++ b/src/gort/enums.py @@ -49,6 +49,46 @@ class ErrorCode(Enum): CALIBRATION_ERROR = 900 UNKNOWN_ERROR = 9999 + def is_telescope_error(self): + """Returns True if the error is related to the telescope.""" + + return self.value >= 100 and self.value < 200 + + def is_ag_error(self): + """Returns True if the error is related to the autoguider.""" + + return self.value >= 200 and self.value < 300 + + def is_spectrograph_error(self): + """Returns True if the error is related to the spectrograph.""" + + return self.value >= 300 and self.value < 400 + + def is_nps_error(self): + """Returns True if the error is related to the NPS.""" + + return self.value >= 400 and self.value < 500 + + def is_enclosure_error(self): + """Returns True if the error is related to the enclosure.""" + + return self.value >= 500 and self.value < 600 + + def is_guiding_error(self): + """Returns True if the error is related to the guider.""" + + return self.value >= 600 and self.value < 700 + + def is_scheduler_error(self): + """Returns True if the error is related to the scheduler.""" + + return self.value >= 700 and self.value < 800 + + def is_observer_error(self): + """Returns True if the error is related to the observer.""" + + return self.value >= 800 and self.value < 900 + class GuiderStatus(Flag): """Maskbits with the guider status.""" diff --git a/src/gort/etc/calibrations.yaml b/src/gort/etc/calibrations.yaml index 9fa1c60..c29eb4f 100644 --- a/src/gort/etc/calibrations.yaml +++ b/src/gort/etc/calibrations.yaml @@ -14,7 +14,7 @@ - name: quick_cals recipe: quick_cals min_start_time: 1800 - max_start_time: null + max_start_time: 3600 time_mode: secs_after_sunset after: null required: true @@ -25,7 +25,7 @@ - name: bias_sequence recipe: bias_sequence min_start_time: null - max_start_time: null + max_start_time: 7200 time_mode: null after: quick_cals required: true diff --git a/src/gort/exceptions.py b/src/gort/exceptions.py index d8e7170..6316953 100644 --- a/src/gort/exceptions.py +++ b/src/gort/exceptions.py @@ -72,6 +72,18 @@ class OverwatcherError(GortError): pass +class TroubleshooterCriticalError(OverwatcherError): + """A critical error in the troubleshooter that will shut down the system.""" + + pass + + +class TroubleshooterTimeoutError(OverwatcherError): + """The troubleshooter timed out while running a recipe.""" + + pass + + class RemoteCommandError(GortError): """An error in a remote command to an actor.""" diff --git a/src/gort/overwatcher/core.py b/src/gort/overwatcher/core.py index 8ed72ea..9ad13f4 100644 --- a/src/gort/overwatcher/core.py +++ b/src/gort/overwatcher/core.py @@ -33,11 +33,11 @@ class OverwatcherBaseTask: keep_alive: ClassVar[bool] = True restart_on_error: ClassVar[bool] = True - def __init__(self): + def __init__(self, log: LogNamespace | None = None): self._task_runner: asyncio.Task | None = None self._heartbeat_task: asyncio.Task | None = None - self._log: Mock | LogNamespace = Mock() + self._log: Mock | LogNamespace = log or Mock() async def run(self): """Runs the task.""" diff --git a/src/gort/overwatcher/helpers/notifier.py b/src/gort/overwatcher/helpers/notifier.py index 8fc17bf..557cbb8 100644 --- a/src/gort/overwatcher/helpers/notifier.py +++ b/src/gort/overwatcher/helpers/notifier.py @@ -17,7 +17,7 @@ import httpx -from sdsstools import Configuration +from sdsstools import Configuration, get_sjd from sdsstools.utils import GatheringTaskGroup from gort.core import LogNamespace @@ -189,6 +189,7 @@ async def notify( [ { "date": datetime.datetime.now(tz=datetime.UTC), + "mjd": get_sjd("LCO"), "level": level, "message": message, "payload": json.dumps(payload), diff --git a/src/gort/overwatcher/observer.py b/src/gort/overwatcher/observer.py index 85dfd9f..2b060ed 100644 --- a/src/gort/overwatcher/observer.py +++ b/src/gort/overwatcher/observer.py @@ -15,13 +15,12 @@ from astropy.time import Time -from gort.enums import ErrorCode -from gort.exceptions import GortError +from gort.exceptions import GortError, TroubleshooterTimeoutError from gort.exposure import Exposure from gort.overwatcher import OverwatcherModule from gort.overwatcher.core import OverwatcherModuleTask from gort.tile import Tile -from gort.tools import cancel_task, run_in_executor, set_tile_status +from gort.tools import cancel_task, run_in_executor if TYPE_CHECKING: @@ -174,6 +173,10 @@ async def stop_observing( ) self.observe_loop = await cancel_task(self.observe_loop) + # The guiders may have been left running or the spectrograph may still + # be exposing. Clean up to avoid issues. + await self.gort.cleanup(readout=False) + else: await self.overwatcher.notify( f"Stopping observations after this tile. Reason: {reason}" @@ -197,6 +200,9 @@ async def observe_loop_task(self): exp: Exposure | list[Exposure] | bool = False try: + # Wait in case the troubleshooter is doing something. + await self.overwatcher.troubleshooter.wait_until_ready(300) + # We want to avoid re-acquiring the tile between dithers. We call # the scheduler here and control the dither position loop ourselves. tile: Tile = await run_in_executor(Tile.from_scheduler) @@ -217,6 +223,8 @@ async def observe_loop_task(self): await self.gort.guiders.focus() for dpos in tile.dither_positions: + await self.overwatcher.troubleshooter.wait_until_ready(300) + # The exposure will complete in 900 seconds + acquisition + readout self.next_exposure_completes = time() + 90 + 900 + 60 @@ -242,46 +250,16 @@ async def observe_loop_task(self): except asyncio.CancelledError: break - except Exception as err: - # TODO: this should be moved to the troubleshooting module, but - # for now handling it here. - - if isinstance(err, GortError): - # If the acquisition failed, disable the tile and try again. - if err.error_code == ErrorCode.ACQUISITION_FAILED: - tile_id: int | None = err.payload.get("tile_id", None) - if tile_id is None: - await notify( - 'Cannot disable tile without a "tile_id. ' - "Continuing observations without disabling tile.", - level="error", - ) - else: - await set_tile_status(tile_id, enabled=False) - await notify( - f"tile_id={tile_id} has been disabled. " - "Continuing observations.", - level="warning", - ) - - # If the scheduler cannot find a tile, wait a minute and try again. - elif err.error_code == ErrorCode.SCHEDULER_CANNOT_FIND_TILE: - await notify( - "The scheduler was not able to find a valid tile to " - "observe. Waiting 60 seconds before trying again.", - level="warning", - ) - await asyncio.sleep(60) - continue - - # No specific troubleshooting available. Report the error, - # do a cleanup and try again. + except TroubleshooterTimeoutError: await notify( - f"An error occurred during the observation: {err} " - "Running the cleanup recipe.", - level="error", + "The troubleshooter timed out after 300 seconds. " + "Cancelling observations.", + level="critical", ) - await self.gort.cleanup(readout=False) + break + + except Exception as err: + await self.overwatcher.troubleshooter.handle(err) finally: if self.is_cancelling: diff --git a/src/gort/overwatcher/overwatcher.py b/src/gort/overwatcher/overwatcher.py index 1d45770..4632700 100644 --- a/src/gort/overwatcher/overwatcher.py +++ b/src/gort/overwatcher/overwatcher.py @@ -26,6 +26,7 @@ from gort.overwatcher.helpers import DomeHelper from gort.overwatcher.helpers.notifier import NotifierMixIn from gort.overwatcher.helpers.tasks import DailyTasks +from gort.overwatcher.troubleshooter.troubleshooter import Troubleshooter @dataclasses.dataclass @@ -49,7 +50,11 @@ def __init__(self, overwatcher: Overwatcher): super().__init__() self.overwatcher = overwatcher - self.log = self.overwatcher.log + + # A bit configuring but _log is used internally, mainly for + # OverwatcherBaseTask.run() and log is for external use. + self._log = self.overwatcher.log + self.log = self._log class OverwatcherMainTask(OverwatcherTask): @@ -228,7 +233,7 @@ async def handle_reenable(self): if self._pending_close_dome: return - self.log.info("Undoing the cancellation of the observing loop.") + self._log.info("Undoing the cancellation of the observing loop.") observer._cancelling = False self.overwatcher.gort.observer.cancelling = False @@ -289,7 +294,7 @@ def __init__( self.state.dry_run = dry_run self.dome = DomeHelper(self) - + self.troubleshooter = Troubleshooter(self) self.tasks: list[OverwatcherTask] = [ OverwatcherMainTask(self), OverwatcherPingTask(self), @@ -339,11 +344,15 @@ async def shutdown( reason: str = "undefined", retry: bool = True, park: bool = True, + disable_overwatcher: bool = False, ): """Shuts down the observatory.""" - # Check if the dome is already closed, then do nothing. - if await self.dome.is_closing(): + dome_closed = await self.dome.is_closing() + enabled = self.state.enabled + observing = self.observer.is_observing + + if dome_closed and not enabled and not observing: return if not reason.endswith("."): @@ -351,6 +360,9 @@ async def shutdown( await self.notify(f"Triggering shutdown. Reason: {reason}", level="warning") + if disable_overwatcher: + await self.notify("The Overwatcher will be disabled.", level="warning") + if not self.state.dry_run: stop = asyncio.create_task(self.observer.stop_observing(immediate=True)) shutdown = asyncio.create_task(self.dome.shutdown(retry=retry, park=park)) @@ -367,6 +379,9 @@ async def shutdown( error=err, ) + if disable_overwatcher: + self.state.enabled = False + async def cancel(self): """Cancels the overwatcher tasks.""" diff --git a/src/gort/overwatcher/safety.py b/src/gort/overwatcher/safety.py index c0fe86a..b49b043 100644 --- a/src/gort/overwatcher/safety.py +++ b/src/gort/overwatcher/safety.py @@ -81,6 +81,14 @@ async def task(self): ) await self.module.close_dome() + # Now run a shutdown. This should not try to close the dome + # since that's already done, but it will stop the observe loop, + # clean-up, etc. + await self.overwatcher.shutdown( + reason="safety alerts detected", + disable_overwatcher=True, + ) + elif self.failed: # We have failed closing the dome as a last resort. We have issued # a critical alert. We don't try closing the dome again. diff --git a/src/gort/overwatcher/troubleshooter/__init__.py b/src/gort/overwatcher/troubleshooter/__init__.py new file mode 100644 index 0000000..546b9d6 --- /dev/null +++ b/src/gort/overwatcher/troubleshooter/__init__.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# @Author: José Sánchez-Gallego (gallegoj@uw.edu) +# @Date: 2024-11-05 +# @Filename: __init__.py +# @License: BSD 3-clause (http://www.opensource.org/licenses/BSD-3-Clause) + +from __future__ import annotations + +from .recipes import TroubleshooterRecipe +from .troubleshooter import Troubleshooter diff --git a/src/gort/overwatcher/troubleshooter/recipes.py b/src/gort/overwatcher/troubleshooter/recipes.py new file mode 100644 index 0000000..d1dfe0a --- /dev/null +++ b/src/gort/overwatcher/troubleshooter/recipes.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# @Author: José Sánchez-Gallego (gallegoj@uw.edu) +# @Date: 2024-11-05 +# @Filename: recipes.py +# @License: BSD 3-clause (http://www.opensource.org/licenses/BSD-3-Clause) + +from __future__ import annotations + +import abc +import asyncio + +from typing import TYPE_CHECKING, ClassVar + +from gort.enums import ErrorCode +from gort.exceptions import TroubleshooterCriticalError +from gort.tools import set_tile_status + + +if TYPE_CHECKING: + from .troubleshooter import TroubleModel, Troubleshooter + + +__all__ = ["TroubleshooterRecipe"] + + +class TroubleshooterRecipe(metaclass=abc.ABCMeta): + """Base class for troubleshooter recipes.""" + + priority: ClassVar[int | None] = None + name: ClassVar[str] + + def __init__(self, troubleshooter: Troubleshooter): + self.troubleshooter = troubleshooter + + self.overwatcher = troubleshooter.overwatcher + self.notify = troubleshooter.overwatcher.notify + + assert hasattr(self, "name"), "Recipe must have a name." + + @abc.abstractmethod + def match(self, error_model: TroubleModel) -> bool: + """Returns True if the recipe can handle the error.""" + + raise NotImplementedError + + async def handle(self, error_model: TroubleModel) -> bool: + """Runs the recipe to handle the error.""" + + try: + return await self._handle_internal(error_model) + except TroubleshooterCriticalError: + # Propagate this error, which will be handled by the Troubleshooter class. + raise + except Exception as err: + await self.notify( + f"Error running recipe {self.name}: {err!r}", + level="error", + ) + return False + + @abc.abstractmethod + async def _handle_internal(self, error_model: TroubleModel) -> bool: + """Internal implementation of the recipe.""" + + raise NotImplementedError + + +class CleanupRecipe(TroubleshooterRecipe): + """A recipe that cleans up after an error.""" + + priority = 500 + name = "cleanup" + + def match(self, error_model: TroubleModel) -> bool: + """Returns True if the recipe can handle the error.""" + + # Always return False. This is a last resort recipe. + return False + + async def _handle_internal(self, error_model: TroubleModel) -> bool: + """Run the cleanup recipe.""" + + await self.overwatcher.gort.cleanup(readout=False) + + return True + + +class AcquisitionFailedRecipe(TroubleshooterRecipe): + """Handle acquisition failures.""" + + priority = 1 + name = "acquisition_failed" + + def match(self, error_model: TroubleModel) -> bool: + """Returns True if the recipe can handle the error.""" + + if error_model.error_code == ErrorCode.ACQUISITION_FAILED: + return True + + return False + + async def _handle_internal(self, error_model: TroubleModel) -> bool: + """Handle the error.""" + + error = error_model.error + + tile_id: int | None = error.payload.get("tile_id", None) + if tile_id is None: + await self.notify( + 'Cannot disable tile without a "tile_id. ' + "Continuing observations without disabling tile.", + level="error", + ) + else: + await set_tile_status(tile_id, enabled=False) + await self.notify( + f"tile_id={tile_id} has been disabled. Continuing observations.", + level="warning", + ) + + return True + + +class SchedulerFailedRecipe(TroubleshooterRecipe): + """Handle acquisition failures.""" + + priority = 1 + name = "scheduler_failed" + + def match(self, error_model: TroubleModel) -> bool: + """Returns True if the recipe can handle the error.""" + + if error_model.error_code.is_scheduler_error(): + return True + + return False + + async def _handle_internal(self, error_model: TroubleModel) -> bool: + """Handle the error.""" + + await self.notify( + "The scheduler was not able to find a valid tile to " + "observe. Waiting 60 seconds before trying again.", + level="warning", + ) + await asyncio.sleep(60) + + return True diff --git a/src/gort/overwatcher/troubleshooter/troubleshooter.py b/src/gort/overwatcher/troubleshooter/troubleshooter.py new file mode 100644 index 0000000..b8a4053 --- /dev/null +++ b/src/gort/overwatcher/troubleshooter/troubleshooter.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# @Author: José Sánchez-Gallego (gallegoj@uw.edu) +# @Date: 2024-11-05 +# @Filename: troubleshooter.py +# @License: BSD 3-clause (http://www.opensource.org/licenses/BSD-3-Clause) + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass + +from typing import TYPE_CHECKING + +from gort.core import LogNamespace +from gort.enums import ErrorCode +from gort.exceptions import ( + GortError, + TroubleshooterCriticalError, + TroubleshooterTimeoutError, +) + +from .recipes import TroubleshooterRecipe + + +if TYPE_CHECKING: + from gort.overwatcher.overwatcher import Overwatcher + + +@dataclass +class TroubleModel: + """Base model for describing a problem to troubleshoot.""" + + error: GortError + error_code: ErrorCode + message: str | None = None + handled: bool = False + + +class RecipeBook(dict[str, TroubleshooterRecipe]): + """A dictionary of recipes.""" + + def __init__(self, ts: Troubleshooter): + recipes_sorted = sorted( + [Recipe(ts) for Recipe in TroubleshooterRecipe.__subclasses__()], + key=lambda x: x.priority if x.priority is not None else 1000, + ) + + for recipe in recipes_sorted: + self[recipe.name] = recipe + + +class Troubleshooter: + """Handles troubleshooting for the Overwatcher.""" + + def __init__(self, overwatcher: Overwatcher): + self.overwatcher = overwatcher + self.notify = overwatcher.notify + + self.recipes = RecipeBook(self) + + self.log = LogNamespace( + self.overwatcher.gort.log, + header=f"({self.__class__.__name__}) ", + ) + + self._event = asyncio.Event() + self._event.set() + + def reset(self): + """Resets the troubleshooter to its initial state.""" + + self._event.set() + + async def wait_until_ready(self, timeout: float | None = None): + """Blocks if the troubleshooter is handling an error until done.""" + + try: + await asyncio.wait_for(self._event.wait(), timeout=timeout) + except asyncio.TimeoutError: + raise TroubleshooterTimeoutError("Troubleshooter timed out.") + + async def handle(self, error: Exception | str): + """Handles an error and tries to troubleshoot it. + + Parameters + ---------- + error + The error to troubleshoot. This can be an exception (usually an instance + of :obj:`.GortError`) or a string with the error message. + + """ + + if not isinstance(error, (Exception, str)): + raise RuntimeError("error must be an exception or a string.") + + if isinstance(error, str): + error = GortError(error, error_code=ErrorCode.UNCATEGORISED_ERROR) + elif not isinstance(error, GortError): + error = GortError(str(error), error_code=ErrorCode.UNCATEGORISED_ERROR) + + error_model = TroubleModel( + error=error, + error_code=error.error_code, + message=str(error), + ) + + await self.notify( + f"Troubleshooting error of type {error.error_code.name}: " + f"{error_model.message}", + level="warning", + ) + + try: + self._event.clear() + + for recipe in self.recipes.values(): + # TODO: for now the first recipe that matches is the only one that runs. + if recipe.match(error_model): + await self.notify(f"Running troubleshooting recipe {recipe.name}.") + + error_model.handled = await recipe.handle(error_model) + if error_model.handled: + break + + if error_model.handled: + await self.notify("Error has been handled.") + return True + + await self.notify( + "Error could not be handled. Running clean-up recipe.", + level="warning", + ) + cleanup = self.recipes["cleanup"] + await cleanup.handle(error_model) + + return True + + except TroubleshooterCriticalError as err: + await self.notify( + f"Shutting down due to critical error while troubleshooting: {err!r}", + level="critical", + ) + await self.overwatcher.shutdown(disable_overwatcher=True) + + finally: + self._event.set()