Skip to content

Commit

Permalink
Troubleshooter for Overwatcher (#37)
Browse files Browse the repository at this point in the history
* Add some initial files

* Merge branch 'main' into troubleshooter

* Improve basic structure of the troubleshooter module

* Try to fix building docker image for pull request

* Attempt with ${{ github.head_ref || github.ref_name }}

* Merge branch 'main' into troubleshooter

* Add categorisation methods to the ErrorCode enum

* Merge remote-tracking branch 'origin/main' into troubleshooter

* Very basic but complete implementation of the troubleshooter

* Some tweaks to OverwatcherTask logging

* Fix bug in twilight flats recipe for extra flats

* Do not require overwatcher to be enabled for calibrations if dome does not change

* Merge branch 'main' into troubleshooter

* The troubleshooter blocks the observing loop while troubleshooting

* Disable overwatcher and run cleanup in safety after closing dome

* Add option to disable overwatcher on shutdown

* Run a cleanup when disabling the observe loop with immediate=True

* Add placeholder for emitting a critical error in the troubleshooter

* Add MJD to the notification record in the DB

* Merge branch 'main' into troubleshooter

* Merge branch 'main' into troubleshooter

* Add max_start_time to quick_cals and bias_sequence calibrations
  • Loading branch information
albireox authored Nov 7, 2024
1 parent c99d768 commit aa86289
Show file tree
Hide file tree
Showing 11 changed files with 415 additions and 51 deletions.
40 changes: 40 additions & 0 deletions src/gort/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,46 @@ class ErrorCode(Enum):
CALIBRATION_ERROR = 900
UNKNOWN_ERROR = 9999

def is_telescope_error(self):
    """Tells whether this error code belongs to the telescope category.

    Telescope-related codes have values in the half-open range [100, 200).

    """

    return 100 <= self.value < 200

def is_ag_error(self):
    """Tells whether this error code belongs to the autoguider category.

    Autoguider-related codes have values in the half-open range [200, 300).

    """

    return 200 <= self.value < 300

def is_spectrograph_error(self):
    """Tells whether this error code belongs to the spectrograph category.

    Spectrograph-related codes have values in the half-open range [300, 400).

    """

    return 300 <= self.value < 400

def is_nps_error(self):
    """Tells whether this error code belongs to the NPS category.

    NPS-related codes have values in the half-open range [400, 500).

    """

    return 400 <= self.value < 500

def is_enclosure_error(self):
    """Tells whether this error code belongs to the enclosure category.

    Enclosure-related codes have values in the half-open range [500, 600).

    """

    return 500 <= self.value < 600

def is_guiding_error(self):
    """Tells whether this error code belongs to the guider category.

    Guider-related codes have values in the half-open range [600, 700).

    """

    return 600 <= self.value < 700

def is_scheduler_error(self):
    """Tells whether this error code belongs to the scheduler category.

    Scheduler-related codes have values in the half-open range [700, 800).

    """

    return 700 <= self.value < 800

def is_observer_error(self):
    """Tells whether this error code belongs to the observer category.

    Observer-related codes have values in the half-open range [800, 900).

    """

    return 800 <= self.value < 900


class GuiderStatus(Flag):
"""Maskbits with the guider status."""
Expand Down
4 changes: 2 additions & 2 deletions src/gort/etc/calibrations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
- name: quick_cals
recipe: quick_cals
min_start_time: 1800
max_start_time: null
max_start_time: 3600
time_mode: secs_after_sunset
after: null
required: true
Expand All @@ -25,7 +25,7 @@
- name: bias_sequence
recipe: bias_sequence
min_start_time: null
max_start_time: null
max_start_time: 7200
time_mode: null
after: quick_cals
required: true
Expand Down
12 changes: 12 additions & 0 deletions src/gort/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ class OverwatcherError(GortError):
pass


class TroubleshooterCriticalError(OverwatcherError):
    """Critical troubleshooter error that will cause the system to shut down."""


class TroubleshooterTimeoutError(OverwatcherError):
    """Signals that the troubleshooter timed out while a recipe was running."""


class RemoteCommandError(GortError):
"""An error in a remote command to an actor."""

Expand Down
4 changes: 2 additions & 2 deletions src/gort/overwatcher/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ class OverwatcherBaseTask:
keep_alive: ClassVar[bool] = True
restart_on_error: ClassVar[bool] = True

def __init__(self):
def __init__(self, log: LogNamespace | None = None):
self._task_runner: asyncio.Task | None = None
self._heartbeat_task: asyncio.Task | None = None

self._log: Mock | LogNamespace = Mock()
self._log: Mock | LogNamespace = log or Mock()

async def run(self):
"""Runs the task."""
Expand Down
3 changes: 2 additions & 1 deletion src/gort/overwatcher/helpers/notifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import httpx

from sdsstools import Configuration
from sdsstools import Configuration, get_sjd
from sdsstools.utils import GatheringTaskGroup

from gort.core import LogNamespace
Expand Down Expand Up @@ -189,6 +189,7 @@ async def notify(
[
{
"date": datetime.datetime.now(tz=datetime.UTC),
"mjd": get_sjd("LCO"),
"level": level,
"message": message,
"payload": json.dumps(payload),
Expand Down
60 changes: 19 additions & 41 deletions src/gort/overwatcher/observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@

from astropy.time import Time

from gort.enums import ErrorCode
from gort.exceptions import GortError
from gort.exceptions import GortError, TroubleshooterTimeoutError
from gort.exposure import Exposure
from gort.overwatcher import OverwatcherModule
from gort.overwatcher.core import OverwatcherModuleTask
from gort.tile import Tile
from gort.tools import cancel_task, run_in_executor, set_tile_status
from gort.tools import cancel_task, run_in_executor


if TYPE_CHECKING:
Expand Down Expand Up @@ -174,6 +173,10 @@ async def stop_observing(
)
self.observe_loop = await cancel_task(self.observe_loop)

# The guiders may have been left running or the spectrograph may still
# be exposing. Clean up to avoid issues.
await self.gort.cleanup(readout=False)

else:
await self.overwatcher.notify(
f"Stopping observations after this tile. Reason: {reason}"
Expand All @@ -197,6 +200,9 @@ async def observe_loop_task(self):
exp: Exposure | list[Exposure] | bool = False

try:
# Wait in case the troubleshooter is doing something.
await self.overwatcher.troubleshooter.wait_until_ready(300)

# We want to avoid re-acquiring the tile between dithers. We call
# the scheduler here and control the dither position loop ourselves.
tile: Tile = await run_in_executor(Tile.from_scheduler)
Expand All @@ -217,6 +223,8 @@ async def observe_loop_task(self):
await self.gort.guiders.focus()

for dpos in tile.dither_positions:
await self.overwatcher.troubleshooter.wait_until_ready(300)

# The exposure will complete in 900 seconds + acquisition + readout
self.next_exposure_completes = time() + 90 + 900 + 60

Expand All @@ -242,46 +250,16 @@ async def observe_loop_task(self):
except asyncio.CancelledError:
break

except Exception as err:
# TODO: this should be moved to the troubleshooting module, but
# for now handling it here.

if isinstance(err, GortError):
# If the acquisition failed, disable the tile and try again.
if err.error_code == ErrorCode.ACQUISITION_FAILED:
tile_id: int | None = err.payload.get("tile_id", None)
if tile_id is None:
await notify(
'Cannot disable tile without a "tile_id. '
"Continuing observations without disabling tile.",
level="error",
)
else:
await set_tile_status(tile_id, enabled=False)
await notify(
f"tile_id={tile_id} has been disabled. "
"Continuing observations.",
level="warning",
)

# If the scheduler cannot find a tile, wait a minute and try again.
elif err.error_code == ErrorCode.SCHEDULER_CANNOT_FIND_TILE:
await notify(
"The scheduler was not able to find a valid tile to "
"observe. Waiting 60 seconds before trying again.",
level="warning",
)
await asyncio.sleep(60)
continue

# No specific troubleshooting available. Report the error,
# do a cleanup and try again.
except TroubleshooterTimeoutError:
await notify(
f"An error occurred during the observation: {err} "
"Running the cleanup recipe.",
level="error",
"The troubleshooter timed out after 300 seconds. "
"Cancelling observations.",
level="critical",
)
await self.gort.cleanup(readout=False)
break

except Exception as err:
await self.overwatcher.troubleshooter.handle(err)

finally:
if self.is_cancelling:
Expand Down
25 changes: 20 additions & 5 deletions src/gort/overwatcher/overwatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from gort.overwatcher.helpers import DomeHelper
from gort.overwatcher.helpers.notifier import NotifierMixIn
from gort.overwatcher.helpers.tasks import DailyTasks
from gort.overwatcher.troubleshooter.troubleshooter import Troubleshooter


@dataclasses.dataclass
Expand All @@ -49,7 +50,11 @@ def __init__(self, overwatcher: Overwatcher):
super().__init__()

self.overwatcher = overwatcher
self.log = self.overwatcher.log

# A bit confusing but _log is used internally, mainly for
# OverwatcherBaseTask.run() and log is for external use.
self._log = self.overwatcher.log
self.log = self._log


class OverwatcherMainTask(OverwatcherTask):
Expand Down Expand Up @@ -228,7 +233,7 @@ async def handle_reenable(self):
if self._pending_close_dome:
return

self.log.info("Undoing the cancellation of the observing loop.")
self._log.info("Undoing the cancellation of the observing loop.")
observer._cancelling = False
self.overwatcher.gort.observer.cancelling = False

Expand Down Expand Up @@ -289,7 +294,7 @@ def __init__(
self.state.dry_run = dry_run

self.dome = DomeHelper(self)

self.troubleshooter = Troubleshooter(self)
self.tasks: list[OverwatcherTask] = [
OverwatcherMainTask(self),
OverwatcherPingTask(self),
Expand Down Expand Up @@ -339,18 +344,25 @@ async def shutdown(
reason: str = "undefined",
retry: bool = True,
park: bool = True,
disable_overwatcher: bool = False,
):
"""Shuts down the observatory."""

# Check if the dome is already closed, then do nothing.
if await self.dome.is_closing():
dome_closed = await self.dome.is_closing()
enabled = self.state.enabled
observing = self.observer.is_observing

if dome_closed and not enabled and not observing:
return

if not reason.endswith("."):
reason += "."

await self.notify(f"Triggering shutdown. Reason: {reason}", level="warning")

if disable_overwatcher:
await self.notify("The Overwatcher will be disabled.", level="warning")

if not self.state.dry_run:
stop = asyncio.create_task(self.observer.stop_observing(immediate=True))
shutdown = asyncio.create_task(self.dome.shutdown(retry=retry, park=park))
Expand All @@ -367,6 +379,9 @@ async def shutdown(
error=err,
)

if disable_overwatcher:
self.state.enabled = False

async def cancel(self):
"""Cancels the overwatcher tasks."""

Expand Down
8 changes: 8 additions & 0 deletions src/gort/overwatcher/safety.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ async def task(self):
)
await self.module.close_dome()

# Now run a shutdown. This should not try to close the dome
# since that's already done, but it will stop the observe loop,
# clean-up, etc.
await self.overwatcher.shutdown(
reason="safety alerts detected",
disable_overwatcher=True,
)

elif self.failed:
# We have failed closing the dome as a last resort. We have issued
# a critical alert. We don't try closing the dome again.
Expand Down
12 changes: 12 additions & 0 deletions src/gort/overwatcher/troubleshooter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @Author: José Sánchez-Gallego ([email protected])
# @Date: 2024-11-05
# @Filename: __init__.py
# @License: BSD 3-clause (http://www.opensource.org/licenses/BSD-3-Clause)

from __future__ import annotations

from .recipes import TroubleshooterRecipe
from .troubleshooter import Troubleshooter
Loading

0 comments on commit aa86289

Please sign in to comment.