From 386b902b2333f74500ec948a3cceb81c82668360 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Fri, 5 Jul 2024 15:16:38 -0700 Subject: [PATCH 01/12] chore: Add disable_retries_on_lost option to Kubernetes configuration --- tron/config/config_parse.py | 2 ++ tron/config/schema.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tron/config/config_parse.py b/tron/config/config_parse.py index 6d0c2ea0f..df0c8b80a 100644 --- a/tron/config/config_parse.py +++ b/tron/config/config_parse.py @@ -866,12 +866,14 @@ class ValidateKubernetes(Validator): defaults = { "kubeconfig_path": None, "enabled": False, + "disable_retries_on_lost": False, "default_volumes": (), } validators = { "kubeconfig_path": valid_string, "enabled": valid_bool, + "disable_retries_on_lost": valid_bool, "default_volumes": build_list_of_type_validator(valid_volume, allow_empty=True), } diff --git a/tron/config/schema.py b/tron/config/schema.py index 0e2f425ee..b58323fd0 100644 --- a/tron/config/schema.py +++ b/tron/config/schema.py @@ -118,6 +118,7 @@ def config_object_factory(name, required=None, optional=None): optional=[ "kubeconfig_path", "enabled", + "disable_retries_on_lost", "default_volumes", ], ) From 75ece6b639d39b59ceafcd3bbd6b2d3a2fc985ad Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Mon, 8 Jul 2024 05:31:29 -0700 Subject: [PATCH 02/12] chore: Refactor _exit_unsuccessful method in KubernetesActionRun The _exit_unsuccessful method in the KubernetesActionRun class has been refactored to include the is_lost_task parameter. This parameter is used to determine whether auto-retries should be skipped when the disable_retries_on_lost option is enabled in the Kubernetes configuration. --- tron/core/actionrun.py | 34 ++++++++++++++++++++++++++++++++-- tron/kubernetes.py | 7 +++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index 580cf263c..b452eaa67 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -1314,8 +1314,38 @@ def kill(self, final: bool = True) -> Optional[str]: return "\n".join(msgs) + def _exit_unsuccessful( + self, exit_status=None, retry_original_command=True, is_lost_task=False + ) -> Optional[Union[bool, ActionCommand]]: + + k8s_cluster = KubernetesClusterRepository.get_cluster() + disable_retries_on_lost = False if not k8s_cluster else k8s_cluster.disable_retries_on_lost + + if self.is_done: + log.info( + f"{self} got exit code {exit_status} but already in terminal " f'state "{self.state}", not retrying', + ) + return None + if self.last_attempt is not None: + self.last_attempt.exit(exit_status) + if self.retries_remaining is not None: + if disable_retries_on_lost and is_lost_task: + log.info(f"{self} skipping auto-retries due to disable_retries_on_lost being enabled.") + else: + if self.retries_remaining > 0: + self.retries_remaining -= 1 + return self.restart(original_command=retry_original_command) + else: + log.info( + f"Reached maximum number of retries: {len(self.attempts)}", + ) + if exit_status is None: + return self._done("fail_unknown", exit_status) + else: + return self._done("fail", exit_status) + def handle_action_command_state_change( - self, action_command: ActionCommand, event: str, event_data=None + self, action_command: KubernetesTask, event: str, event_data=None ) -> Optional[Union[bool, ActionCommand]]: """ Observe ActionCommand state changes and transition the ActionCommand state machine to a new state. @@ -1331,7 +1361,7 @@ def handle_action_command_state_change( if event == ActionCommand.EXITING: if action_command.exit_status is None: # This is different from SSHActionRun - allows retries to happen, if configured - return self._exit_unsuccessful(None) + return self._exit_unsuccessful(None, is_lost_task=action_command.is_lost_task) if not action_command.exit_status: return self.success() diff --git a/tron/kubernetes.py b/tron/kubernetes.py index 4c06f31bb..1e0097e8a 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -68,6 +68,8 @@ def __init__(self, action_run_id: str, task_config: KubernetesTaskConfig, serial self.log.info(f"Kubernetes task {self.get_kubernetes_id()} created with config {self.get_config()}") + self.is_lost_task = False + def get_event_logger(self) -> Logger: """ Get or create a logger for a the action run associated with this task. @@ -252,6 +254,7 @@ def handle_event(self, event: Event) -> None: self.log.warning(f" tronctl skip {self.id}") self.log.warning("If you want Tron to NOT run it and consider it as a failure, fail it with:") self.log.warning(f" tronctl fail {self.id}") + self.is_lost_task = True self.exited(None) else: self.log.info( @@ -280,10 +283,12 @@ def __init__( enabled: bool = True, default_volumes: Optional[List[ConfigVolume]] = None, pod_launch_timeout: Optional[int] = None, + disable_retries_on_lost: bool = False, ): # general k8s config self.kubeconfig_path = kubeconfig_path self.enabled = enabled + self.disable_retries_on_lost = disable_retries_on_lost self.default_volumes: Optional[List[ConfigVolume]] = default_volumes or [] self.pod_launch_timeout = pod_launch_timeout or DEFAULT_POD_LAUNCH_TIMEOUT_S # creating a task_proc executor has a couple steps: @@ -618,6 +623,7 @@ def recover(self, task: KubernetesTask) -> None: class KubernetesClusterRepository: # Kubernetes config kubernetes_enabled: bool = False + kubernetes_disable_retries_on_lost: bool = False kubeconfig_path: Optional[str] = None pod_launch_timeout: Optional[int] = None default_volumes: Optional[List[ConfigVolume]] = None @@ -658,6 +664,7 @@ def shutdown(cls) -> None: def configure(cls, kubernetes_options: ConfigKubernetes) -> None: cls.kubeconfig_path = kubernetes_options.kubeconfig_path cls.kubernetes_enabled = kubernetes_options.enabled + cls.kubernetes_disable_retries_on_lost = kubernetes_options.disable_retries_on_lost cls.default_volumes = kubernetes_options.default_volumes for cluster in cls.clusters.values(): From e3ecf1b5da5b9adb99b6003f6b6ba19820b9fdd4 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Tue, 9 Jul 2024 14:38:08 -0700 Subject: [PATCH 03/12] chore: Refactor _exit_unsuccessful method in KubernetesActionRun --- tron/core/actionrun.py | 10 ++++------ tron/kubernetes.py | 5 +---- tron/utils/exitcode.py | 2 ++ 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index b452eaa67..9c907ee99 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -1314,9 +1314,7 @@ def kill(self, final: bool = True) -> Optional[str]: return "\n".join(msgs) - def _exit_unsuccessful( - self, exit_status=None, retry_original_command=True, is_lost_task=False - ) -> Optional[Union[bool, ActionCommand]]: + def _exit_unsuccessful(self, exit_status=None, retry_original_command=True) -> Optional[Union[bool, ActionCommand]]: k8s_cluster = KubernetesClusterRepository.get_cluster() disable_retries_on_lost = False if not k8s_cluster else k8s_cluster.disable_retries_on_lost @@ -1329,7 +1327,7 @@ def _exit_unsuccessful( if self.last_attempt is not None: self.last_attempt.exit(exit_status) if self.retries_remaining is not None: - if disable_retries_on_lost and is_lost_task: + if disable_retries_on_lost and exit_status == exitcode.EXIT_KUBERNETES_TASK_LOST: log.info(f"{self} skipping auto-retries due to disable_retries_on_lost being enabled.") else: if self.retries_remaining > 0: @@ -1345,7 +1343,7 @@ def _exit_unsuccessful( return self._done("fail", exit_status) def handle_action_command_state_change( - self, action_command: KubernetesTask, event: str, event_data=None + self, action_command: ActionCommand, event: str, event_data=None ) -> Optional[Union[bool, ActionCommand]]: """ Observe ActionCommand state changes and transition the ActionCommand state machine to a new state. @@ -1361,7 +1359,7 @@ def handle_action_command_state_change( if event == ActionCommand.EXITING: if action_command.exit_status is None: # This is different from SSHActionRun - allows retries to happen, if configured - return self._exit_unsuccessful(None, is_lost_task=action_command.is_lost_task) + return self._exit_unsuccessful(None) if not action_command.exit_status: return self.success() diff --git a/tron/kubernetes.py b/tron/kubernetes.py index 1e0097e8a..2872f80e6 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -68,8 +68,6 @@ def __init__(self, action_run_id: str, task_config: KubernetesTaskConfig, serial self.log.info(f"Kubernetes task {self.get_kubernetes_id()} created with config {self.get_config()}") - self.is_lost_task = False - def get_event_logger(self) -> Logger: """ Get or create a logger for a the action run associated with this task. @@ -254,8 +252,7 @@ def handle_event(self, event: Event) -> None: self.log.warning(f" tronctl skip {self.id}") self.log.warning("If you want Tron to NOT run it and consider it as a failure, fail it with:") self.log.warning(f" tronctl fail {self.id}") - self.is_lost_task = True - self.exited(None) + self.exited(exitcode.EXIT_KUBERNETES_TASK_LOST) else: self.log.info( f"Did not handle unknown kubernetes event type: {event}", diff --git a/tron/utils/exitcode.py b/tron/utils/exitcode.py index df7819385..b5a95d509 100644 --- a/tron/utils/exitcode.py +++ b/tron/utils/exitcode.py @@ -10,6 +10,7 @@ EXIT_KUBERNETES_ABNORMAL = -9 EXIT_KUBERNETES_SPOT_INTERRUPTION = -10 EXIT_KUBERNETES_NODE_SCALEDOWN = -11 +EXIT_KUBERNETES_TASK_LOST = -12 EXIT_REASONS = { EXIT_INVALID_COMMAND: "Invalid command", @@ -23,4 +24,5 @@ EXIT_KUBERNETES_ABNORMAL: "Kubernetes task failed in an unexpected manner", EXIT_KUBERNETES_SPOT_INTERRUPTION: "Kubernetes task failed due to spot interruption", EXIT_KUBERNETES_NODE_SCALEDOWN: "Kubernetes task failed due to the autoscaler scaling down a node", + EXIT_KUBERNETES_TASK_LOST: "Tron lost track of a pod it already thought it had started for a job.", } From 1ece1c63944d9dc46f19586a501acf03efdfe73b Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Wed, 31 Jul 2024 15:12:48 -0700 Subject: [PATCH 04/12] chore: Update Kubernetes configuration to use non_retryable_exit_codes The Kubernetes configuration has been updated to use the `non_retryable_exit_codes` option instead of `disable_retries_on_lost`. This change allows for more flexibility in specifying the exit codes that should not trigger auto-retries for Kubernetes tasks. --- tests/config/config_parse_test.py | 2 +- tests/kubernetes_test.py | 2 +- tron/config/config_parse.py | 4 +-- tron/config/schema.py | 2 +- tron/core/actionrun.py | 50 ++++++++++++------------------- tron/kubernetes.py | 9 +++--- 6 files changed, 29 insertions(+), 40 deletions(-) diff --git a/tests/config/config_parse_test.py b/tests/config/config_parse_test.py index 1de3986c9..67f87fb5a 100644 --- a/tests/config/config_parse_test.py +++ b/tests/config/config_parse_test.py @@ -116,7 +116,7 @@ def make_mesos_options(): def make_k8s_options(): - return schema.ConfigKubernetes(enabled=False, default_volumes=()) + return schema.ConfigKubernetes(enabled=False, non_retryable_exit_codes=(), default_volumes=()) def make_action(**kwargs): diff --git a/tests/kubernetes_test.py b/tests/kubernetes_test.py index 62be2b254..ad2089b01 100644 --- a/tests/kubernetes_test.py +++ b/tests/kubernetes_test.py @@ -447,7 +447,7 @@ def test_handle_event_lost(mock_kubernetes_task): ) ) - assert mock_kubernetes_task.is_unknown + assert mock_kubernetes_task.exit_status == exitcode.EXIT_KUBERNETES_TASK_LOST def test_create_task_disabled(): diff --git a/tron/config/config_parse.py b/tron/config/config_parse.py index 0f2396643..feb05513a 100644 --- a/tron/config/config_parse.py +++ b/tron/config/config_parse.py @@ -867,14 +867,14 @@ class ValidateKubernetes(Validator): defaults = { "kubeconfig_path": None, "enabled": False, - "disable_retries_on_lost": False, + "non_retryable_exit_codes": (), "default_volumes": (), } validators = { "kubeconfig_path": valid_string, "enabled": valid_bool, - "disable_retries_on_lost": valid_bool, + "non_retryable_exit_codes": build_list_of_type_validator(valid_int, allow_empty=True), "default_volumes": build_list_of_type_validator(valid_volume, allow_empty=True), } diff --git a/tron/config/schema.py b/tron/config/schema.py index 31819a38e..601f73687 100644 --- a/tron/config/schema.py +++ b/tron/config/schema.py @@ -119,7 +119,7 @@ def config_object_factory(name, required=None, optional=None): optional=[ "kubeconfig_path", "enabled", - "disable_retries_on_lost", + "non_retryable_exit_codes", "default_volumes", ], ) diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index 9c907ee99..3976834c1 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -590,7 +590,9 @@ def fail(self, exit_status=None): return self._done("fail", exit_status) - def _exit_unsuccessful(self, exit_status=None, retry_original_command=True) -> Optional[Union[bool, ActionCommand]]: + def _exit_unsuccessful( + self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=[] + ) -> Optional[Union[bool, ActionCommand]]: if self.is_done: log.info( f"{self} got exit code {exit_status} but already in terminal " f'state "{self.state}", not retrying', @@ -599,13 +601,16 @@ def _exit_unsuccessful(self, exit_status=None, retry_original_command=True) -> O if self.last_attempt is not None: self.last_attempt.exit(exit_status) if self.retries_remaining is not None: - if self.retries_remaining > 0: - self.retries_remaining -= 1 - return self.restart(original_command=retry_original_command) + if exit_status in non_retryable_exit_codes: + log.info(f"{self} skipping auto-retries due to non-retryable exit code was recieved.") else: - log.info( - f"Reached maximum number of retries: {len(self.attempts)}", - ) + if self.retries_remaining > 0: + self.retries_remaining -= 1 + return self.restart(original_command=retry_original_command) + else: + log.info( + f"Reached maximum number of retries: {len(self.attempts)}", + ) if exit_status is None: return self._done("fail_unknown", exit_status) else: @@ -1314,33 +1319,16 @@ def kill(self, final: bool = True) -> Optional[str]: return "\n".join(msgs) - def _exit_unsuccessful(self, exit_status=None, retry_original_command=True) -> Optional[Union[bool, ActionCommand]]: + def _exit_unsuccessful( + self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=[] + ) -> Optional[Union[bool, ActionCommand]]: k8s_cluster = KubernetesClusterRepository.get_cluster() - disable_retries_on_lost = False if not k8s_cluster else k8s_cluster.disable_retries_on_lost + non_retryable_exit_codes = () if not k8s_cluster else k8s_cluster.non_retryable_exit_codes - if self.is_done: - log.info( - f"{self} got exit code {exit_status} but already in terminal " f'state "{self.state}", not retrying', - ) - return None - if self.last_attempt is not None: - self.last_attempt.exit(exit_status) - if self.retries_remaining is not None: - if disable_retries_on_lost and exit_status == exitcode.EXIT_KUBERNETES_TASK_LOST: - log.info(f"{self} skipping auto-retries due to disable_retries_on_lost being enabled.") - else: - if self.retries_remaining > 0: - self.retries_remaining -= 1 - return self.restart(original_command=retry_original_command) - else: - log.info( - f"Reached maximum number of retries: {len(self.attempts)}", - ) - if exit_status is None: - return self._done("fail_unknown", exit_status) - else: - return self._done("fail", exit_status) + return super()._exit_unsuccessful( + exit_status=None, retry_original_command=True, non_retryable_exit_codes=non_retryable_exit_codes + ) def handle_action_command_state_change( self, action_command: ActionCommand, event: str, event_data=None diff --git a/tron/kubernetes.py b/tron/kubernetes.py index 2872f80e6..04fa1ee87 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -5,6 +5,7 @@ from typing import Dict from typing import List from typing import Optional +from typing import Set from typing import TYPE_CHECKING from task_processing.interfaces.event import Event @@ -280,12 +281,12 @@ def __init__( enabled: bool = True, default_volumes: Optional[List[ConfigVolume]] = None, pod_launch_timeout: Optional[int] = None, - disable_retries_on_lost: bool = False, + non_retryable_exit_codes: Optional[Set[int]] = (), ): # general k8s config self.kubeconfig_path = kubeconfig_path self.enabled = enabled - self.disable_retries_on_lost = disable_retries_on_lost + self.non_retryable_exit_codes = non_retryable_exit_codes self.default_volumes: Optional[List[ConfigVolume]] = default_volumes or [] self.pod_launch_timeout = pod_launch_timeout or DEFAULT_POD_LAUNCH_TIMEOUT_S # creating a task_proc executor has a couple steps: @@ -620,7 +621,7 @@ def recover(self, task: KubernetesTask) -> None: class KubernetesClusterRepository: # Kubernetes config kubernetes_enabled: bool = False - kubernetes_disable_retries_on_lost: bool = False + kubernetes_non_retryable_exit_codes: Set[int] = () kubeconfig_path: Optional[str] = None pod_launch_timeout: Optional[int] = None default_volumes: Optional[List[ConfigVolume]] = None @@ -661,7 +662,7 @@ def shutdown(cls) -> None: def configure(cls, kubernetes_options: ConfigKubernetes) -> None: cls.kubeconfig_path = kubernetes_options.kubeconfig_path cls.kubernetes_enabled = kubernetes_options.enabled - cls.kubernetes_disable_retries_on_lost = kubernetes_options.disable_retries_on_lost + cls.kubernetes_non_retryable_exit_codes = kubernetes_options.non_retryable_exit_codes cls.default_volumes = kubernetes_options.default_volumes for cluster in cls.clusters.values(): From 1ed7dbed2723153e836cadaad84b662626de9633 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Fri, 2 Aug 2024 09:49:16 -0700 Subject: [PATCH 05/12] chore: Refactor _exit_unsuccessful method in KubernetesActionRun --- tron/core/actionrun.py | 8 +++++--- tron/kubernetes.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index 3976834c1..ca9d0c0fc 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -591,7 +591,7 @@ def fail(self, exit_status=None): return self._done("fail", exit_status) def _exit_unsuccessful( - self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=[] + self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=() ) -> Optional[Union[bool, ActionCommand]]: if self.is_done: log.info( @@ -1320,14 +1320,16 @@ def kill(self, final: bool = True) -> Optional[str]: return "\n".join(msgs) def _exit_unsuccessful( - self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=[] + self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=() ) -> Optional[Union[bool, ActionCommand]]: k8s_cluster = KubernetesClusterRepository.get_cluster() non_retryable_exit_codes = () if not k8s_cluster else k8s_cluster.non_retryable_exit_codes return super()._exit_unsuccessful( - exit_status=None, retry_original_command=True, non_retryable_exit_codes=non_retryable_exit_codes + exit_status=exit_status, + retry_original_command=retry_original_command, + non_retryable_exit_codes=non_retryable_exit_codes, ) def handle_action_command_state_change( diff --git a/tron/kubernetes.py b/tron/kubernetes.py index 04fa1ee87..b12227982 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -5,7 +5,7 @@ from typing import Dict from typing import List from typing import Optional -from typing import Set +from typing import Tuple from typing import TYPE_CHECKING from task_processing.interfaces.event import Event @@ -281,7 +281,7 @@ def __init__( enabled: bool = True, default_volumes: Optional[List[ConfigVolume]] = None, pod_launch_timeout: Optional[int] = None, - non_retryable_exit_codes: Optional[Set[int]] = (), + non_retryable_exit_codes: Optional[Tuple[int]] = (), ): # general k8s config self.kubeconfig_path = kubeconfig_path @@ -621,7 +621,7 @@ def recover(self, task: KubernetesTask) -> None: class KubernetesClusterRepository: # Kubernetes config kubernetes_enabled: bool = False - kubernetes_non_retryable_exit_codes: Set[int] = () + kubernetes_non_retryable_exit_codes: Tuple[int] = () kubeconfig_path: Optional[str] = None pod_launch_timeout: Optional[int] = None default_volumes: Optional[List[ConfigVolume]] = None From 6e2163e7a2c646dce676a0842238f447bb53de1e Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Fri, 2 Aug 2024 09:52:30 -0700 Subject: [PATCH 06/12] chore: Refactor non_retryable_exit_codes in KubernetesCluster and KubernetesClusterRepository The `non_retryable_exit_codes` attribute in the `KubernetesCluster` and `KubernetesClusterRepository` classes has been refactored to use a tuple of integers instead of a single integer. This change allows for more flexibility in specifying multiple exit codes that should not trigger auto-retries for Kubernetes tasks. --- tron/kubernetes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tron/kubernetes.py b/tron/kubernetes.py index b12227982..676974f3f 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -281,7 +281,7 @@ def __init__( enabled: bool = True, default_volumes: Optional[List[ConfigVolume]] = None, pod_launch_timeout: Optional[int] = None, - non_retryable_exit_codes: Optional[Tuple[int]] = (), + non_retryable_exit_codes: Optional[Tuple[int, ...]] = (), ): # general k8s config self.kubeconfig_path = kubeconfig_path @@ -621,7 +621,7 @@ def recover(self, task: KubernetesTask) -> None: class KubernetesClusterRepository: # Kubernetes config kubernetes_enabled: bool = False - kubernetes_non_retryable_exit_codes: Tuple[int] = () + kubernetes_non_retryable_exit_codes: Tuple[int, ...] = () kubeconfig_path: Optional[str] = None pod_launch_timeout: Optional[int] = None default_volumes: Optional[List[ConfigVolume]] = None From d58cb93f4d273cd11e4038433fa7dff0b45264e5 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Fri, 2 Aug 2024 10:04:09 -0700 Subject: [PATCH 07/12] chore: Update debugpy dependency to version 1.8.1 The `debugpy` dependency in the `requirements-dev-minimal.txt` and `requirements-dev.txt` files has been updated to version 1.8.1. This update ensures compatibility and brings in the latest features and bug fixes for debugging purposes. --- requirements-dev-minimal.txt | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements-dev-minimal.txt b/requirements-dev-minimal.txt index eb9d38808..eddbadb2b 100644 --- a/requirements-dev-minimal.txt +++ b/requirements-dev-minimal.txt @@ -1,4 +1,5 @@ asynctest +debugpy flake8 mock mypy diff --git a/requirements-dev.txt b/requirements-dev.txt index 41f7131fc..2860357cd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,7 @@ astroid==2.13.3 asynctest==0.12.0 cfgv==2.0.1 +debugpy==1.8.1 dill==0.3.6 distlib==0.3.6 filelock==3.4.1 From 0cda7e523d89684e6b5199b42e7522034863a57c Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Fri, 2 Aug 2024 10:04:43 -0700 Subject: [PATCH 08/12] chore: Update .gitignore to include .fleet The .gitignore file has been updated to include the .fleet directory. This ensures that the .fleet directory is ignored by Git and not tracked in the repository. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2354cd11e..372c00c9d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ tron.egg-info docs/_build/ .idea .vscode +.fleet tron.iml docs/images/ *.dot From b8e91c32c8f78e7b6788da98cd7f79e1d3b098c7 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Thu, 15 Aug 2024 18:10:02 -0400 Subject: [PATCH 09/12] Update tron/core/actionrun.py Co-authored-by: Jen Patague --- tron/core/actionrun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index ca9d0c0fc..065dbd3af 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -602,7 +602,7 @@ def _exit_unsuccessful( self.last_attempt.exit(exit_status) if self.retries_remaining is not None: if exit_status in non_retryable_exit_codes: - log.info(f"{self} skipping auto-retries due to non-retryable exit code was recieved.") + log.info(f"{self} skipping auto-retries, received non-retryable exit code ({exit_status}).") else: if self.retries_remaining > 0: self.retries_remaining -= 1 From fa00a349cdf9aa0e73a8381d7a3bd7e0eab0837d Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Mon, 19 Aug 2024 08:04:17 -0700 Subject: [PATCH 10/12] chore: Refactor non_retryable_exit_codes to use list instead of tuple --- tests/kubernetes_test.py | 1 + tron/core/actionrun.py | 6 +++--- tron/kubernetes.py | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/kubernetes_test.py b/tests/kubernetes_test.py index ad2089b01..838f4734c 100644 --- a/tests/kubernetes_test.py +++ b/tests/kubernetes_test.py @@ -447,6 +447,7 @@ def test_handle_event_lost(mock_kubernetes_task): ) ) + assert mock_kubernetes_task.is_unknown assert mock_kubernetes_task.exit_status == exitcode.EXIT_KUBERNETES_TASK_LOST diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index 065dbd3af..565d5ed7b 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -591,7 +591,7 @@ def fail(self, exit_status=None): return self._done("fail", exit_status) def _exit_unsuccessful( - self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=() + self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=[] ) -> Optional[Union[bool, ActionCommand]]: if self.is_done: log.info( @@ -1320,11 +1320,11 @@ def kill(self, final: bool = True) -> Optional[str]: return "\n".join(msgs) def _exit_unsuccessful( - self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=() + self, exit_status=None, retry_original_command=True, non_retryable_exit_codes=[] ) -> Optional[Union[bool, ActionCommand]]: k8s_cluster = KubernetesClusterRepository.get_cluster() - non_retryable_exit_codes = () if not k8s_cluster else k8s_cluster.non_retryable_exit_codes + non_retryable_exit_codes = [] if not k8s_cluster else k8s_cluster.non_retryable_exit_codes return super()._exit_unsuccessful( exit_status=exit_status, diff --git a/tron/kubernetes.py b/tron/kubernetes.py index cd74cbe3c..5c9d55eaa 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -5,7 +5,6 @@ from typing import Dict from typing import List from typing import Optional -from typing import Tuple from typing import TYPE_CHECKING from task_processing.interfaces.event import Event @@ -282,7 +281,7 @@ def __init__( default_volumes: Optional[List[ConfigVolume]] = None, pod_launch_timeout: Optional[int] = None, watcher_kubeconfig_paths: Optional[List[str]] = None, - non_retryable_exit_codes: Optional[Tuple[int, ...]] = (), + non_retryable_exit_codes: Optional[List[int, ...]] = [], ): # general k8s config self.kubeconfig_path = kubeconfig_path @@ -624,7 +623,7 @@ def recover(self, task: KubernetesTask) -> None: class KubernetesClusterRepository: # Kubernetes config kubernetes_enabled: bool = False - kubernetes_non_retryable_exit_codes: Tuple[int, ...] = () + kubernetes_non_retryable_exit_codes: Optional[List[int, ...]] = [] kubeconfig_path: Optional[str] = None pod_launch_timeout: Optional[int] = None default_volumes: Optional[List[ConfigVolume]] = None From f16b828b498cac48891b1d8c6a3f6bc4a9dc7671 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Mon, 19 Aug 2024 08:35:54 -0700 Subject: [PATCH 11/12] Fixing tests --- tests/kubernetes_test.py | 1 - tron/kubernetes.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/kubernetes_test.py b/tests/kubernetes_test.py index 838f4734c..ad2089b01 100644 --- a/tests/kubernetes_test.py +++ b/tests/kubernetes_test.py @@ -447,7 +447,6 @@ def test_handle_event_lost(mock_kubernetes_task): ) ) - assert mock_kubernetes_task.is_unknown assert mock_kubernetes_task.exit_status == exitcode.EXIT_KUBERNETES_TASK_LOST diff --git a/tron/kubernetes.py b/tron/kubernetes.py index 5c9d55eaa..e9316064e 100644 --- a/tron/kubernetes.py +++ b/tron/kubernetes.py @@ -281,7 +281,7 @@ def __init__( default_volumes: Optional[List[ConfigVolume]] = None, pod_launch_timeout: Optional[int] = None, watcher_kubeconfig_paths: Optional[List[str]] = None, - non_retryable_exit_codes: Optional[List[int, ...]] = [], + non_retryable_exit_codes: Optional[List[int]] = [], ): # general k8s config self.kubeconfig_path = kubeconfig_path @@ -623,7 +623,7 @@ def recover(self, task: KubernetesTask) -> None: class KubernetesClusterRepository: # Kubernetes config kubernetes_enabled: bool = False - kubernetes_non_retryable_exit_codes: Optional[List[int, ...]] = [] + kubernetes_non_retryable_exit_codes: Optional[List[int]] = [] kubeconfig_path: Optional[str] = None pod_launch_timeout: Optional[int] = None default_volumes: Optional[List[ConfigVolume]] = None From b9883a32559c61a79e817bbf6dd9438d801e0712 Mon Sep 17 00:00:00 2001 From: Dave Cuza Date: Mon, 26 Aug 2024 18:01:51 -0700 Subject: [PATCH 12/12] Adding self.retries_remaining = 0 when we skip a non retryable job --- tron/core/actionrun.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tron/core/actionrun.py b/tron/core/actionrun.py index 565d5ed7b..98eddaabd 100644 --- a/tron/core/actionrun.py +++ b/tron/core/actionrun.py @@ -602,6 +602,7 @@ def _exit_unsuccessful( self.last_attempt.exit(exit_status) if self.retries_remaining is not None: if exit_status in non_retryable_exit_codes: + self.retries_remaining = 0 log.info(f"{self} skipping auto-retries, received non-retryable exit code ({exit_status}).") else: if self.retries_remaining > 0: