diff --git a/CHANGELOG.md b/CHANGELOG.md index d21aa83..d084871 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## Next version +### 🚀 New + +* [#38](https://github.com/sdss/lvmgort/pull/38) Add a post-observing daily task that runs 30 minutes after sunrise and will do a few checks (make sure the dome is closed, park the telescopes, etc.) and retry safe calibrations that failed during the normal sequence. + ### 🔧 Fixed * Prevent the Overwatcher observer from opening the dome while calibrations are ongoing. @@ -16,7 +20,7 @@ ### 🚀 New -* * [#37](https://github.com/sdss/lvmgort/pull/37) Basic implementation of the `Troubleshooter` class for the Overwatcher. Currently only very broad troubleshooting checks and recipes are implemented. +* [#37](https://github.com/sdss/lvmgort/pull/37) Basic implementation of the `Troubleshooter` class for the Overwatcher. Currently only very broad troubleshooting checks and recipes are implemented. ### 🏷️ Changed diff --git a/codecov.yml b/codecov.yml index e00ce3d..98286a3 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,2 +1,7 @@ +coverage: + status: + project: off + patch: off + github_checks: annotations: false diff --git a/src/gort/etc/calibrations.yaml b/src/gort/etc/calibrations.yaml index c29eb4f..47fbdf2 100644 --- a/src/gort/etc/calibrations.yaml +++ b/src/gort/etc/calibrations.yaml @@ -10,6 +10,7 @@ close_dome_after: true abort_observing: true priority: 10 + allow_post_observing_recovery: false - name: quick_cals recipe: quick_cals @@ -21,6 +22,7 @@ dome: closed abort_observing: true priority: 8 + allow_post_observing_recovery: true - name: bias_sequence recipe: bias_sequence @@ -32,6 +34,7 @@ dome: closed abort_observing: true priority: 5 + allow_post_observing_recovery: true # # - name: twilight_flats_sunrise # recipe: twilight_flats diff --git a/src/gort/overwatcher/calibration.py b/src/gort/overwatcher/calibration.py index 1c906ba..ee74979 100644 --- a/src/gort/overwatcher/calibration.py +++ 
b/src/gort/overwatcher/calibration.py @@ -96,6 +96,11 @@ class CalibrationModel(BaseModel): title="The maximum time in seconds to attempt the calibration if it fails. " "If max_start_time is reached during this period, the calibrations fails.", ) + allow_post_observing_recovery: bool = Field( + default=True, + title="Whether the calibration can be run after observing has finished " + "if it initially failed.", + ) @model_validator(mode="after") def validate_start_time(self) -> Self: @@ -484,6 +489,9 @@ async def reset(self, cals_file: str | pathlib.Path | None = None): if cals_file is not None: self.cals_file = cals_file + self._failing_cals = {} + self._ignore_cals = set() + try: self.schedule.update_schedule(self.cals_file) except Exception as ee: diff --git a/src/gort/overwatcher/helpers/tasks.py b/src/gort/overwatcher/helpers/tasks.py index d422d9a..20a01a4 100644 --- a/src/gort/overwatcher/helpers/tasks.py +++ b/src/gort/overwatcher/helpers/tasks.py @@ -18,7 +18,8 @@ from sdsstools import get_sjd -from gort.tools import redis_client_sync +from gort.overwatcher.calibration import CalibrationState +from gort.tools import add_night_log_comment, redis_client_sync if TYPE_CHECKING: @@ -103,14 +104,31 @@ async def run(self): if self.done: return + if not await self._should_run(): + return + await self.overwatcher.notify(f"Running daily task {self.name}.") - self.done = await self._run_internal() + try: + self.done = await self._run_internal() + except Exception as err: + await self.overwatcher.notify( + f"Error running daily task {self.name}: {err}", + level="error", + ) + self.done = True + return if self.done: await self.overwatcher.notify(f"Task {self.name} has been completed.") else: await self.overwatcher.notify(f"Task {self.name} has failed.") + @abc.abstractmethod + async def _should_run(self) -> bool: + """Returns True if the task should run.""" + + raise NotImplementedError + @abc.abstractmethod async def _run_internal(self) -> bool: """Runs the internal 
task.""" @@ -145,12 +163,17 @@ def mark_done(self): class PreObservingTask(DailyTaskBase): - """Run the pre-observing tasks.""" + """Run the pre-observing tasks. + + This task is run between 30 and 10 minutes before sunset if no calibration is + ongoing and will take a bias and make sure the telescopes are connected and homed. + + """ name = "pre_observing" - async def _run_internal(self) -> bool: - """Runs the pre-observing tasks.""" + async def _should_run(self) -> bool: + """Returns True if the task should run.""" if self.overwatcher.ephemeris.ephemeris is None: return False @@ -168,6 +191,11 @@ async def _run_internal(self) -> bool: ): return False + return True + + async def _run_internal(self) -> bool: + """Runs the pre-observing tasks.""" + try: await self.overwatcher.gort.execute_recipe("pre-observing") except Exception as err: @@ -178,3 +206,111 @@ async def _run_internal(self) -> bool: # Always mark the task complete, even if it failed. return True + + +class PostObservingTask(DailyTaskBase): + """Run the post-observing tasks. + + This task is run 30 minutes after sunrise. It runs the post-observing recipe + but does not send the email (that is done at 12UT by a cronjob for redundancy). + + The recipe checks that the dome is closed, the telescope is parked, guiders + are off, etc. It also goes over the calibrations and if a calibration is missing + and has ``allow_post_observing_recovery=true`` it will try to obtain it. + + """ + + name = "post_observing" + + async def _should_run(self) -> bool: + """Returns True if the task should run.""" + + if self.overwatcher.ephemeris.ephemeris is None: + return False + + # Run this task 30 minutes after sunrise. 
+ now = time.time() + sunrise = Time(self.overwatcher.ephemeris.ephemeris.sunrise, format="jd").unix + + if ( + now - sunrise < 0 + or now - sunrise < 1800 + or now - sunrise > 2000 + or self.overwatcher.state.calibrating + or self.overwatcher.state.observing + ): + return False + + return True + + async def _run_internal(self) -> bool: + """Runs the post-observing tasks.""" + + notify = self.overwatcher.notify + + try: + await self.overwatcher.gort.execute_recipe( + "post-observing", + send_email=False, + ) + except Exception as err: + await self.overwatcher.notify( + f"Error running post-observing task: {err}", + level="critical", + ) + return True + + calibrations_attempted: bool = False + + for calibration in self.overwatcher.calibrations.schedule.calibrations: + name = calibration.name + + # Calibration must not be done (any other state is valid) + if calibration.state != CalibrationState.DONE: + # Calibration must allow recovery. + allows_recovery = calibration.model.allow_post_observing_recovery + + # Calibration must not require moving the dome (model.dome = None) + # or asks for the dome to be closed and it actually is. + required_dome = calibration.model.dome + needs_dome: bool = False + if required_dome is not None: + current_dome = await self.overwatcher.dome.is_closing() + if required_dome is True or current_dome != required_dome: + needs_dome = True + + # Calibrations must be allowed. + allow_calibrations = self.overwatcher.state.allow_calibrations + + if not needs_dome and allows_recovery and allow_calibrations: + await notify(f"Retrying calibration {calibration.name}.") + + try: + calibrations_attempted = True + await self.overwatcher.calibrations.run_calibration(calibration) + + if calibration.state != CalibrationState.DONE: + await notify(f"Failed to recover calibration {name}.") + else: + await notify(f"Calibration {name} recovered.") + + # Automatically add a comment to the night log. 
+ await add_night_log_comment( + f"Calibration {name} initially failed and was retaken " + "after observations had been completed. Review the " + "data quality since the exposures were taken after " + "sunrise.", + category="other", + ) + + except Exception as err: + await notify(f"Error recovering calibration {name}: {err}") + + # If we have tried a calibration we may have rehomed the telescopes and + # left them not parked. Make sure they are really parked. + if calibrations_attempted: + self.overwatcher.log.info("Parking telescopes after post-observing cals.") + await self.overwatcher.gort.telescopes.park() + + # Always mark the task complete, even if it failed. + return True diff --git a/src/gort/recipes/operations.py b/src/gort/recipes/operations.py index 27fdacb..39bd088 100644 --- a/src/gort/recipes/operations.py +++ b/src/gort/recipes/operations.py @@ -10,7 +10,7 @@ import asyncio -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar from rich.prompt import Confirm @@ -322,7 +322,9 @@ class PostObservingRecipe(BaseRecipe): name = "post-observing" - async def recipe(self): + email_route: ClassVar[str] = "/logs/night-logs/0/email?only_if_not_sent=1" + + async def recipe(self, send_email: bool = True): """Runs the post-observing sequence.""" from gort.overwatcher.helpers.notifier import BasicNotifier @@ -347,10 +349,11 @@ async def recipe(self): except Exception as ee: notifier.log.error(f"Error running post-observing task: {ee}") - notifier.log.info("Sending night log email.") - result = await get_lvmapi_route("/logs/night-logs/0/email?only_if_not_sent=1") - if not result: - notifier.log.warning("Night log had already been sent.") + if send_email: + notifier.log.info("Sending night log email.") + result = await get_lvmapi_route(self.email_route) + if not result: + notifier.log.warning("Night log had already been sent.") # Disable the overwatcher. 
if await overwatcher_is_running(): diff --git a/src/gort/tools.py b/src/gort/tools.py index 39ae534..d6f9fa8 100644 --- a/src/gort/tools.py +++ b/src/gort/tools.py @@ -89,6 +89,7 @@ "kubernetes_restart_deployment", "kubernetes_list_deployments", "get_gort_client", + "add_night_log_comment", ] AnyPath = str | os.PathLike @@ -1003,3 +1004,25 @@ async def get_gort_client(override_overwatcher: bool | None = None): yield gort await gort.stop() + + +async def add_night_log_comment(comment: str, category: str = "other"): + """Adds a comment to the night log.""" + + payload = { + "mjd": get_sjd("LCO"), + "category": category or "other", + "comment": comment, + } + + host, port = config["services"]["lvmapi"].values() + + async with httpx.AsyncClient( + base_url=f"http://{host}:{port}", + follow_redirects=True, + ) as client: + response = await client.post("/night-logs/comments/add", json=payload) + + code = response.status_code + if code != 200: + raise ValueError(f"Failed adding night log comment. Code {code}.")