From d2178b0522b38e75d7d8d7bb60d09c5aa7f67d80 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 11:13:19 +0100 Subject: [PATCH 01/13] try idea for building instance id --- .../manager/cloud_runner_manager.py | 13 +- .../github_runner_manager/manager/models.py | 99 ++++++ .../manager/runner_manager.py | 16 +- .../manager/runner_scaler.py | 21 +- .../openstack_cloud/openstack_cloud.py | 3 +- .../openstack_runner_manager.py | 26 +- .../tests/unit/mock_runner_managers.py | 22 +- .../tests/unit/test_runner_scaler.py | 4 +- .../tests/unit/types_/__init__.py | 2 - tests/unit/mock_runner_managers.py | 285 ------------------ 10 files changed, 145 insertions(+), 346 deletions(-) create mode 100644 github-runner-manager/src/github_runner_manager/manager/models.py delete mode 100644 github-runner-manager/tests/unit/types_/__init__.py delete mode 100644 tests/unit/mock_runner_managers.py diff --git a/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py index 51b7daf80..e1a2d30d1 100644 --- a/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py @@ -9,12 +9,11 @@ from enum import Enum, auto from typing import Iterator, Sequence, Tuple +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics.runner import RunnerMetrics logger = logging.getLogger(__name__) -InstanceId = str - class HealthState(Enum): """Health state of the runners. @@ -141,7 +140,7 @@ class CloudRunnerInstance: """ name: str - instance_id: InstanceId + instance_id: InstanceID health: HealthState state: CloudRunnerState @@ -159,11 +158,7 @@ def name_prefix(self) -> str: """Get the name prefix of the self-hosted runners.""" @abc.abstractmethod - def generate_instance_id(self) -> InstanceId: - """Generate an intance_id to name a runner.""" - - @abc.abstractmethod - def create_runner(self, instance_id: InstanceId, registration_jittoken: str) -> None: + def create_runner(self, instance_id: InstanceID, registration_jittoken: str) -> None: """Create a self-hosted runner. Args: @@ -181,7 +176,7 @@ def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerIn """ @abc.abstractmethod - def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: + def delete_runner(self, instance_id: InstanceID, remove_token: str) -> RunnerMetrics | None: """Delete self-hosted runner. Args: diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py new file mode 100644 index 000000000..49edbe127 --- /dev/null +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -0,0 +1,99 @@ +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +"""TODO.""" + +import secrets +from dataclasses import dataclass + + +@dataclass +class InstanceID: + """TODO. + + Attributes: + name: TODO + prefix: TODO + reactive: TODO + suffix: TODO + """ + + prefix: str + reactive: bool + suffix: str + + @property + def name(self) -> str: + """TODO. + + Returns: + TODO + """ + return f"{self.prefix}-{self.suffix}" + + @classmethod + def build_from_name(cls, prefix: str, name: str) -> "InstanceID": + """TODO. + + Args: + prefix: TODO + name: TODO + + Raises: + ValueError: TODO + + Returns: + TODO + """ + if name.startswith(prefix): + suffix = name.removeprefix(f"{prefix}-") + else: + # TODO should we raise if invalid name? + raise ValueError(f"Invalid runner name {name} for prefix {prefix}") + + return cls( + prefix=prefix, + reactive=False, + suffix=suffix, + ) + + @classmethod + def build(cls, prefix: str, reactive: bool = False) -> "InstanceID": + r"""Generate an intance_id to name a runner. + + It should be valid for all the CloudProviders and GitHub. + + The GitHub runner name convention is as following: + A valid runner name is 64 characters or less in length and does not include '"', '/', ':', + '<', '>', '\', '|', '*' and '?'. + + The collision rate calculation: + alphanumeric 12 chars long (26 alphabet + 10 digits = 36) + 36^12 is big enough for our use-case. + + Args: + prefix: TODO + reactive: TODO + + Returns: + Instance ID of the runner. + """ + suffix = secrets.token_hex(6) + + return cls(prefix=prefix, reactive=reactive, suffix=suffix) + + def __str__(self) -> str: + """TODO. + + Returns: + TODO. + """ + return self.name + + def __repr__(self) -> str: + """TODO. + + Returns: + TODO. + """ + return f"InstanceID({self.name!r})" diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py index 1adde0f32..855bbb15a 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py @@ -1,7 +1,7 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. -"""Class for managing the GitHub self-hosted runners hosted on cloud instances.""" +"""Module for managing the GitHub self-hosted runners hosted on cloud instances.""" import logging from dataclasses import dataclass @@ -17,12 +17,12 @@ CloudRunnerManager, CloudRunnerState, HealthState, - InstanceId, ) from github_runner_manager.manager.github_runner_manager import ( GitHubRunnerManager, GitHubRunnerState, ) +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics import events as metric_events from github_runner_manager.metrics import github as github_metrics from github_runner_manager.metrics import runner as runner_metrics @@ -59,7 +59,7 @@ class RunnerInstance: """ name: str - instance_id: InstanceId + instance_id: InstanceID health: HealthState github_state: GitHubRunnerState | None cloud_state: CloudRunnerState @@ -112,7 +112,7 @@ def __init__( ) self._labels = labels - def create_runners(self, num: int) -> tuple[InstanceId, ...]: + def create_runners(self, num: int) -> tuple[InstanceID, ...]: """Create runners. Args: @@ -248,7 +248,7 @@ def cleanup(self) -> IssuedMetricEventsStats: @staticmethod def _spawn_runners( create_runner_args_sequence: Sequence["RunnerManager._CreateRunnerArgs"], - ) -> tuple[InstanceId, ...]: + ) -> tuple[InstanceID, ...]: """Spawn runners in parallel using multiprocessing. Multiprocessing is only used if there are more than one runner to spawn. Otherwise, @@ -278,7 +278,7 @@ def _spawn_runners( @staticmethod def _spawn_runners_using_multiprocessing( create_runner_args_sequence: Sequence["RunnerManager._CreateRunnerArgs"], num: int - ) -> tuple[InstanceId, ...]: + ) -> tuple[InstanceID, ...]: """Parallel spawn of runners. The length of the create_runner_args is number _create_runner invocation, and therefore the @@ -389,7 +389,7 @@ class _CreateRunnerArgs: labels: list[str] @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> InstanceId: + def _create_runner(args: _CreateRunnerArgs) -> InstanceID: """Create a single runner. This is a staticmethod for usage with multiprocess.Pool. @@ -400,7 +400,7 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceId: Returns: The instance ID of the runner created. """ - instance_id = args.cloud_runner_manager.generate_instance_id() + instance_id = InstanceID.build(args.cloud_runner_manager.name_prefix) registration_jittoken = args.github_runner_manager.get_registration_jittoken( instance_id, args.labels ) diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py index 5852452f0..9632df516 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py @@ -261,8 +261,27 @@ def reconcile(self) -> int: metric_stats = {} start_timestamp = time.time() - expected_runner_quantity = self._base_quantity if self._reactive_config is None else None + expected_runner_quantity = self._base_quantity + # TODO THE NEW STEPS MAY BE AS FOLLOWS: + + # 0. Initial Cleanup: + # 0.1 Be careful, offline runners could be in the process of being created, because of JIT + # 0.2 Clean Finished things, that is, get all runners cloud + github + # 0.2.1 Get all runners cloud + github, and create a state for the runner + # 0.2.2 If it is finished or there is no way it will advance, delete it from both places. + # 0.2.3 Issue metrics for that. + # 0.2.4 TODO offline in github and nothing in cloud. Check how long it's been in github? + + # 1. Non-reactive + # 1.1. TODO + # 2. reactive + # 2.1. TODO + + # 3. Final Cleanup + # 3.1. TODO + + # TODO What should this function return if any??? try: if self._reactive_config is not None: logger.info( diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py index f3855e01f..5e0f13f20 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py @@ -24,6 +24,7 @@ from paramiko.ssh_exception import NoValidConnectionsError from github_runner_manager.errors import KeyfileError, OpenStackError, SSHError +from github_runner_manager.manager.models import InstanceID from github_runner_manager.openstack_cloud.configuration import OpenStackCredentials from github_runner_manager.openstack_cloud.constants import CREATE_SERVER_TIMEOUT @@ -52,7 +53,7 @@ class OpenstackInstance: addresses: list[str] created_at: datetime - instance_id: str + instance_id: InstanceID server_id: str server_name: str status: str diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index 12ed7e4ba..bf3216d1f 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -35,8 +35,8 @@ CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, - InstanceId, ) +from github_runner_manager.manager.models import InstanceID from github_runner_manager.manager.runner_manager import HealthState from github_runner_manager.metrics import runner as runner_metrics from github_runner_manager.metrics import storage as metrics_storage @@ -169,23 +169,7 @@ def name_prefix(self) -> str: """ return self._config.prefix - def generate_instance_id(self) -> InstanceId: - r"""Generate an intance_id to name a runner. - - The GitHub runner name convention is as following: - A valid runner name is 64 characters or less in length and does not include '"', '/', ':', - '<', '>', '\', '|', '*' and '?'. - - The collision rate calculation: - alphanumeric 12 chars long (26 alphabet + 10 digits = 36) - 36^12 is big enough for our use-case. - - Returns: - Instance ID of the runner. - """ - return f"{self.name_prefix}-{secrets.token_hex(6)}" - - def create_runner(self, instance_id: InstanceId, registration_jittoken: str) -> None: + def create_runner(self, instance_id: InstanceID, registration_jittoken: str) -> None: """Create a self-hosted runner. Args: @@ -195,9 +179,6 @@ def create_runner(self, instance_id: InstanceId, registration_jittoken: str) -> Raises: MissingServerConfigError: Unable to create runner due to missing configuration. RunnerCreateError: Unable to create runner due to OpenStack issues. - - Returns: - Instance ID of the runner. """ if (server_config := self._config.server_config) is None: raise MissingServerConfigError("Missing server configuration to create runners") @@ -223,7 +204,6 @@ def create_runner(self, instance_id: InstanceId, registration_jittoken: str) -> self._wait_runner_running(instance) logger.info("Runner %s created successfully", instance.server_name) - return instance_id def get_runners( self, states: Sequence[CloudRunnerState] | None = None @@ -262,7 +242,7 @@ def get_runners( return tuple(runner for runner in runners if runner.state in state_set) def delete_runner( - self, instance_id: InstanceId, remove_token: str + self, instance_id: InstanceID, remove_token: str ) -> runner_metrics.RunnerMetrics | None: """Delete self-hosted runners. diff --git a/github-runner-manager/tests/unit/mock_runner_managers.py b/github-runner-manager/tests/unit/mock_runner_managers.py index 3a7c21eb1..75137ed89 100644 --- a/github-runner-manager/tests/unit/mock_runner_managers.py +++ b/github-runner-manager/tests/unit/mock_runner_managers.py @@ -13,9 +13,9 @@ CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, - InstanceId, ) from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics.runner import RunnerMetrics from github_runner_manager.types_.github import ( GitHubRunnerStatus, @@ -228,7 +228,7 @@ class MockRunner: """ name: str - instance_id: InstanceId + instance_id: InstanceID cloud_state: CloudRunnerState github_state: GitHubRunnerState health: bool @@ -269,7 +269,7 @@ class SharedMockRunnerManagerState: runners: The runners. """ - runners: dict[InstanceId, MockRunner] + runners: dict[InstanceID, MockRunner] def __init__(self): """Construct the object.""" @@ -301,15 +301,7 @@ def name_prefix(self) -> str: """Get the name prefix of the self-hosted runners.""" return self.prefix - def generate_instance_id(self) -> InstanceId: - """Generate an intance_id to name a runner. - - Returns: - Instance ID of the runner. - """ - return secrets.token_hex(6) - - def create_runner(self, instance_id: InstanceId, registration_jittoken: str) -> None: + def create_runner(self, instance_id: InstanceID, registration_jittoken: str) -> None: """Create a self-hosted runner. Args: @@ -318,7 +310,7 @@ def create_runner(self, instance_id: InstanceId, registration_jittoken: str) -> """ name = f"{self.name_prefix}-{instance_id}" runner = MockRunner(name) - self.state.runners[instance_id] = runner + self.state.runners[str(instance_id)] = runner def get_runners( self, states: Sequence[CloudRunnerState] | None = None @@ -342,7 +334,7 @@ def get_runners( if runner.cloud_state in state_set ) - def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: + def delete_runner(self, instance_id: InstanceID, remove_token: str) -> RunnerMetrics | None: """Delete self-hosted runner. Args: @@ -352,7 +344,7 @@ def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMet Returns: Any runner metrics produced during deletion. """ - runner = self.state.runners.pop(instance_id, None) + runner = self.state.runners.pop(str(instance_id), None) if runner is not None: return iter([MagicMock()]) return iter([]) diff --git a/github-runner-manager/tests/unit/test_runner_scaler.py b/github-runner-manager/tests/unit/test_runner_scaler.py index d83acdc82..a824573c8 100644 --- a/github-runner-manager/tests/unit/test_runner_scaler.py +++ b/github-runner-manager/tests/unit/test_runner_scaler.py @@ -33,9 +33,9 @@ from github_runner_manager.errors import CloudError, ReconcileError from github_runner_manager.manager.cloud_runner_manager import ( CloudRunnerState, - InstanceId, ) from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.models import InstanceID from github_runner_manager.manager.runner_manager import ( FlushMode, RunnerManager, @@ -60,7 +60,7 @@ def mock_runner_manager_spawn_runners( create_runner_args: Iterable[RunnerManager._CreateRunnerArgs], -) -> tuple[InstanceId, ...]: +) -> tuple[InstanceID, ...]: """Mock _spawn_runners method of RunnerManager. The _spawn_runners method uses multi-process, which copies the object, e.g., the mocks. diff --git a/github-runner-manager/tests/unit/types_/__init__.py b/github-runner-manager/tests/unit/types_/__init__.py deleted file mode 100644 index 97bca57b0..000000000 --- a/github-runner-manager/tests/unit/types_/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2025 Canonical Ltd. -# See LICENSE file for licensing details. diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py deleted file mode 100644 index 141f259c7..000000000 --- a/tests/unit/mock_runner_managers.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright 2025 Canonical Ltd. -# See LICENSE file for licensing details. - -import random -import secrets -from dataclasses import dataclass -from typing import Iterable, Iterator, Sequence -from unittest.mock import MagicMock - -from github_runner_manager.github_client import GithubClient -from github_runner_manager.manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - InstanceId, -) -from github_runner_manager.manager.github_runner_manager import GitHubRunnerState -from github_runner_manager.metrics.runner import RunnerMetrics -from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner - -from charm_state import GitHubPath -from tests.unit.mock import MockGhapiClient - - -@dataclass -class MockRunner: - """Mock of a runner. - - Attributes: - name: The name of the runner. - instance_id: The instance id of the runner. - cloud_state: The cloud state of the runner. - github_state: The github state of the runner. - health: The health state of the runner. - """ - - name: str - instance_id: InstanceId - cloud_state: CloudRunnerState - github_state: GitHubRunnerState - health: bool - - def __init__(self, name: str): - """Construct the object. - - Args: - name: The name of the runner. - """ - self.name = name - self.instance_id = secrets.token_hex(6) - self.cloud_state = CloudRunnerState.ACTIVE - self.github_state = GitHubRunnerState.IDLE - self.health = True - - def to_cloud_runner(self) -> CloudRunnerInstance: - """Construct CloudRunnerInstance from this object. - - Returns: - The CloudRunnerInstance instance. - """ - return CloudRunnerInstance( - name=self.name, - instance_id=self.instance_id, - health=self.health, - state=self.cloud_state, - ) - - -@dataclass -class SharedMockRunnerManagerState: - """State shared by mock runner managers. - - For sharing the mock runner states between MockCloudRunnerManager and MockGitHubRunnerManager. - - Attributes: - runners: The runners. - """ - - runners: dict[InstanceId, MockRunner] - - def __init__(self): - """Construct the object.""" - self.runners = {} - - -class MockCloudRunnerManager(CloudRunnerManager): - """Mock of CloudRunnerManager. - - Metrics is not supported in this mock. - - Attributes: - name_prefix: The naming prefix for runners managed. - prefix: The naming prefix for runners managed. - state: The shared state between mocks runner managers. - """ - - def __init__(self, state: SharedMockRunnerManagerState): - """Construct the object. - - Args: - state: The shared state between cloud and github runner managers. - """ - self.prefix = f"mock_{secrets.token_hex(4)}" - self.state = state - - @property - def name_prefix(self) -> str: - """Get the name prefix of the self-hosted runners.""" - return self.prefix - - def create_runner(self, registration_jittoken: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_jittoken: The GitHub registration token for registering runners. - - Returns: - The instance id of the runner created. - """ - name = f"{self.name_prefix}-{secrets.token_hex(6)}" - runner = MockRunner(name) - self.state.runners[runner.instance_id] = runner - return runner.instance_id - - def get_runners( - self, states: Sequence[CloudRunnerState] | None = None - ) -> tuple[CloudRunnerInstance, ...]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. - - Returns: - The list of runner instances. - """ - if states is None: - states = [member.value for member in CloudRunnerState] - - state_set = set(states) - return tuple( - runner.to_cloud_runner() - for runner in self.state.runners.values() - if runner.cloud_state in state_set - ) - - def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: - """Delete self-hosted runner. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - - Returns: - Any runner metrics produced during deletion. - """ - runner = self.state.runners.pop(instance_id, None) - if runner is not None: - return iter([MagicMock()]) - return iter([]) - - def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: - """Stop all runners. - - Args: - remove_token: The GitHub remove token for removing runners. - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - - Returns: - Any runner metrics produced during flushing. - """ - if busy: - self.state.runners = {} - else: - self.state.runners = { - instance_id: runner - for instance_id, runner in self.state.runners.items() - if runner.github_state == GitHubRunnerState.BUSY - } - return iter([MagicMock()]) - - def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Perform health check on runner and delete the runner if it fails. - - Args: - remove_token: The GitHub remove token for removing runners. - - Returns: - Any runner metrics produced during cleanup. - """ - # Do nothing in mocks. - return iter([MagicMock()]) - - -class MockGitHubRunnerManager: - """Mock of GitHubRunnerManager. - - Attributes: - github: The GitHub client. - name_prefix: The naming prefix for runner managed. - state: The shared state between mock runner managers. - path: The GitHub path to register the runners under. - """ - - def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerManagerState): - """Construct the object. - - Args: - name_prefix: The naming prefix for runner managed. - path: The GitHub path to register the runners under. - state: The shared state between mock runner managers. - """ - self.github = GithubClient("mock_token") - self.github._client = MockGhapiClient("mock_token") - self.name_prefix = name_prefix - self.state = state - self.path = path - - def get_registration_jittoken(self, instance_id: str, labels: list[str]) -> str: - """Get the registration JIT token for registering runners on GitHub. - - Args: - instance_id: Instance ID of the runner. - labels: Labels for the runner. - - Returns: - The registration token. - """ - return "mock_registration_token" - - def get_removal_token(self) -> str: - """Get the remove token for removing runners on GitHub. - - Returns: - The remove token. - """ - return "mock_remove_token" - - def get_runners( - self, github_states: Iterable[GitHubRunnerState] | None = None - ) -> tuple[SelfHostedRunner, ...]: - """Get the runners. - - Args: - github_states: The states to filter for. - - Returns: - List of runners. - """ - if github_states is None: - github_states = [member.value for member in GitHubRunnerState] - - github_state_set = set(github_states) - return tuple( - SelfHostedRunner( - busy=runner.github_state == GitHubRunnerState.BUSY, - id=random.randint(1, 1000000), - labels=[], - os="linux", - name=runner.name, - status=( - GitHubRunnerStatus.OFFLINE - if runner.github_state == GitHubRunnerState.OFFLINE - else GitHubRunnerStatus.ONLINE - ), - ) - for runner in self.state.runners.values() - if runner.github_state in github_state_set - ) - - def delete_runners(self, states: Iterable[GitHubRunnerState]) -> None: - """Delete the runners. - - Args: - states: The states to filter the runners to delete. - """ - github_states = set(states) - self.state.runners = { - instance_id: runner - for instance_id, runner in self.state.runners.items() - if runner.github_state not in github_states - } From 47294ea851062806c40cf80f56678021d2d5bd96 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 13:29:44 +0100 Subject: [PATCH 02/13] a bit more playing with it --- .../github_runner_manager/github_client.py | 7 +- .../manager/github_runner_manager.py | 3 +- .../openstack_cloud/health_checks.py | 22 ++--- .../openstack_cloud/openstack_cloud.py | 25 +++--- .../openstack_runner_manager.py | 82 +++++++++---------- .../test_openstack_runner_manager.py | 4 +- .../tests/unit/test_github_client.py | 8 +- 7 files changed, 77 insertions(+), 74 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/github_client.py b/github-runner-manager/src/github_runner_manager/github_client.py index 98da12e93..451eed916 100644 --- a/github-runner-manager/src/github_runner_manager/github_client.py +++ b/github-runner-manager/src/github_runner_manager/github_client.py @@ -23,6 +23,7 @@ GitHubRepo, ) from github_runner_manager.errors import GithubApiError, JobNotFoundError, TokenError +from github_runner_manager.manager.models import InstanceID from github_runner_manager.types_.github import ( JobInfo, RegistrationToken, @@ -161,7 +162,7 @@ def get_runner_remove_token(self, path: GitHubPath) -> str: @catch_http_errors def get_runner_registration_jittoken( - self, path: GitHubPath, instance_id: str, labels: list[str] + self, path: GitHubPath, instance_id: InstanceID, labels: list[str] ) -> str: """Get token from GitHub used for registering runners. @@ -181,7 +182,7 @@ def get_runner_registration_jittoken( token = self._client.actions.generate_runner_jitconfig_for_repo( owner=path.owner, repo=path.repo, - name=instance_id, + name=instance_id.name, runner_group_id=1, labels=labels, ) @@ -191,7 +192,7 @@ def get_runner_registration_jittoken( runner_group_id = self._get_runner_group_id(path) token = self._client.actions.generate_runner_jitconfig_for_org( org=path.org, - name=instance_id, + name=instance_id.name, runner_group_id=runner_group_id, labels=labels, ) diff --git a/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py index 1989932da..b7b0fc0c1 100644 --- a/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py @@ -8,6 +8,7 @@ from github_runner_manager.configuration.github import GitHubConfiguration from github_runner_manager.github_client import GithubClient +from github_runner_manager.manager.models import InstanceID from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner @@ -94,7 +95,7 @@ def delete_runners(self, states: Iterable[GitHubRunnerState] | None = None) -> N for runner in runner_list: self.github.delete_runner(self._path, runner.id) - def get_registration_jittoken(self, instance_id: str, labels: list[str]) -> str: + def get_registration_jittoken(self, instance_id: InstanceID, labels: list[str]) -> str: """Get registration JIT token from GitHub. This token is used for registering self-hosted runners. diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py index 75796b14d..9d90c77d9 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py @@ -49,7 +49,7 @@ def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) - ssh_conn = _get_ssh_connection(openstack_cloud=openstack_cloud, instance=instance) except KeyfileError: logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name + "Health check failed due to unable to find keyfile for %s", instance.instance_id.name ) # KeyfileError indicates that we'll never be able to ssh into the unit, # so we mark it as unhealthy. @@ -57,7 +57,7 @@ def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) - except _SSHError: logger.exception( "Unable to get SSH connection for instance %s, marking as unhealthy.", - instance.server_name, + instance.instance_id.name, ) # We assume that the failure to get the SSH connection is not transient, and mark # the runner as unhealthy. @@ -93,13 +93,15 @@ def check_active_runner( if ( check_ok := _run_health_check_cloud_init( - ssh_conn, instance.server_name, accept_finished_job + ssh_conn, instance.instance_id.name, accept_finished_job ) ) is not None: return check_ok if ( - check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name) + check_ok := _run_health_check_runner_processes_running( + ssh_conn, instance.instance_id.name + ) ) is not None: return check_ok except _SSHError as exc: @@ -130,7 +132,7 @@ def _get_ssh_connection( ssh_conn = openstack_cloud.get_ssh_connection(instance) except SSHError as exc: - raise _SSHError(f"Unable to get SSH connection to {instance.server_name}") from exc + raise _SSHError(f"Unable to get SSH connection to {instance.instance_id}") from exc return ssh_conn @@ -144,7 +146,7 @@ def _health_check_cloud_state(instance: OpenstackInstance) -> _HealthCheckResult Whether the runner should be considered healthy or None. """ cloud_state = CloudRunnerState.from_openstack_server_status(instance.status) - logger.debug("Cloud state of %s: %s", instance.server_name, cloud_state) + logger.debug("Cloud state of %s: %s", instance.instance_id, cloud_state) if cloud_state in ( CloudRunnerState.DELETED, CloudRunnerState.STOPPED, @@ -155,7 +157,7 @@ def _health_check_cloud_state(instance: OpenstackInstance) -> _HealthCheckResult logger.error( "Instance in unexpected status, failing health check. %s: %s (%s)", cloud_state, - instance.server_name, + instance.instance_id, instance.server_id, ) return False @@ -166,7 +168,7 @@ def _health_check_cloud_state(instance: OpenstackInstance) -> _HealthCheckResult logger.error( "Instance in created status for too long, failing health check. %s: %s (%s)", cloud_state, - instance.server_name, + instance.instance_id, instance.server_id, ) return False @@ -229,14 +231,14 @@ def _run_health_check_runner_installed( if not result.ok: logger.info( "Runner installed timestamp file not found on %s, cloud-init may still run", - instance.server_name, + instance.instance_id, ) if datetime.now() - instance.created_at >= timedelta( hours=INSTANCE_IN_BUILD_MODE_TIMEOUT_IN_HOURS ): logger.error( "Instance in building status for too long, failing health check. %s (%s)", - instance.server_name, + instance.instance_id, instance.server_id, ) return False diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py index 5e0f13f20..9d8ca1429 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py @@ -47,7 +47,6 @@ class OpenstackInstance: instance_id: ID used by OpenstackCloud class to manage the instances. See docs on the OpenstackCloud. server_id: ID of server assigned by OpenStack. - server_name: Name of the server on OpenStack. status: Status of the server. """ @@ -55,7 +54,6 @@ class OpenstackInstance: created_at: datetime instance_id: InstanceID server_id: str - server_name: str status: str def __init__(self, server: OpenstackServer, prefix: str): @@ -79,9 +77,8 @@ def __init__(self, server: OpenstackServer, prefix: str): for address in network_addresses ] self.created_at = datetime.strptime(server.created_at, "%Y-%m-%dT%H:%M:%SZ") - self.instance_id = server.name + self.instance_id = InstanceID.build_from_name(prefix, server.name) self.server_id = server.id - self.server_name = server.name self.status = server.status @@ -177,7 +174,7 @@ def __init__(self, credentials: OpenStackCredentials, prefix: str, system_user: # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass if new args are # added. def launch_instance( # pylint: disable=too-many-arguments, too-many-positional-arguments - self, instance_id: str, image: str, flavor: str, network: str, cloud_init: str + self, instance_id: InstanceID, image: str, flavor: str, network: str, cloud_init: str ) -> OpenstackInstance: """Create an OpenStack instance. @@ -202,7 +199,7 @@ def launch_instance( # pylint: disable=too-many-arguments, too-many-positional- try: server = conn.create_server( - name=instance_id, + name=instance_id.name, image=image, key_name=keypair.name, flavor=flavor, @@ -229,7 +226,7 @@ def launch_instance( # pylint: disable=too-many-arguments, too-many-positional- return OpenstackInstance(server, self.prefix) @_catch_openstack_errors - def get_instance(self, instance_id: str) -> OpenstackInstance | None: + def get_instance(self, instance_id: InstanceID) -> OpenstackInstance | None: """Get OpenStack instance by instance ID. Args: @@ -247,7 +244,7 @@ def get_instance(self, instance_id: str) -> OpenstackInstance | None: return None @_catch_openstack_errors - def delete_instance(self, instance_id: str) -> None: + def delete_instance(self, instance_id: InstanceID) -> None: """Delete a openstack instance. Args: @@ -293,14 +290,14 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: Returns: SSH connection object. """ - key_path = self._get_key_path(instance.server_name) + key_path = self._get_key_path(instance.instance_id.name) if not key_path.exists(): raise KeyfileError( - f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" + f"Missing keyfile for server: {instance.instance_id.name}, key path: {key_path}" ) if not instance.addresses: - raise SSHError(f"No addresses found for OpenStack server {instance.server_name}") + raise SSHError(f"No addresses found for OpenStack server {instance.instance_id.name}") for ip in instance.addresses: try: @@ -314,7 +311,7 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: if not result.ok: logger.warning( "SSH test connection failed, server: %s, address: %s", - instance.server_name, + instance.instance_id.name, ip, ) continue @@ -323,13 +320,13 @@ def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): logger.warning( "Unable to SSH into %s with address %s", - instance.server_name, + instance.instance_id.name, connection.host, exc_info=True, ) continue raise SSHError( - f"No connectable SSH addresses found, server: {instance.server_name}, " + f"No connectable SSH addresses found, server: {instance.instance_id.name}, " f"addresses: {instance.addresses}" ) diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index bf3216d1f..c9a4bd7f6 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -198,12 +198,12 @@ def create_runner(self, instance_id: InstanceID, registration_jittoken: str) -> except OpenStackError as err: raise RunnerCreateError(f"Failed to create {instance_id} openstack runner") from err - logger.debug("Waiting for runner process to startup: %s", instance.server_name) + logger.debug("Waiting for runner process to startup: %s", instance.instance_id) self._wait_runner_startup(instance) - logger.debug("Waiting for runner process to be running: %s", instance.server_name) + logger.debug("Waiting for runner process to be running: %s", instance.instance_id) self._wait_runner_running(instance) - logger.info("Runner %s created successfully", instance.server_name) + logger.info("Runner %s created successfully", instance.instance_id) def get_runners( self, states: Sequence[CloudRunnerState] | None = None @@ -225,11 +225,11 @@ def get_runners( openstack_cloud=self._openstack_cloud, instance=instance ) except OpenstackHealthCheckError: - logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, instance.server_name) + logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, instance.instance_id.name) healthy = None runners.append( CloudRunnerInstance( - name=instance.server_name, + name=instance.instance_id.name, instance_id=instance.instance_id, health=HealthState.from_value(healthy), state=CloudRunnerState.from_openstack_server_status(instance.status), @@ -263,14 +263,14 @@ def delete_runner( return None logger.debug( - "Metrics extracted, deleting instance %s %s", instance_id, instance.server_name + "Metrics extracted, deleting instance %s %s", instance_id, instance.instance_id ) self._delete_runner(instance, remove_token) - logger.debug("Instance deleted successfully %s %s", instance_id, instance.server_name) - logger.debug("Extract metrics for runner %s %s", instance_id, instance.server_name) + logger.debug("Instance deleted successfully %s %s", instance_id, instance.instance_id) + logger.debug("Extract metrics for runner %s %s", instance_id, instance.instance_id) extracted_metrics = runner_metrics.extract( metrics_storage_manager=self._metrics_storage_manager, - runners={instance.server_name}, + runners={instance.instance_id.name}, include=True, ) return next(extracted_metrics, None) @@ -294,13 +294,13 @@ def flush_runners( logger.debug( "Checking runner state and flushing %s %s", instance.server_id, - instance.server_name, + instance.instance_id, ) self._check_state_and_flush(instance, busy) except SSHError: logger.warning( "Unable to determine state of %s and kill runner process due to SSH issues", - instance.server_name, + instance.instance_id, ) continue logger.debug("Runners successfully flushed, cleaning up.") @@ -318,9 +318,9 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: logger.debug("Getting runner healths for cleanup.") runners = self._get_runners_health() - healthy_runner_names = {runner.server_name for runner in runners.healthy} - unhealthy_runner_names = {runner.server_name for runner in runners.unhealthy} - unknown_runner_names = {runner.server_name for runner in runners.unknown} + healthy_runner_names = {runner.instance_id.name for runner in runners.healthy} + unhealthy_runner_names = {runner.instance_id.name for runner in runners.unhealthy} + unknown_runner_names = {runner.instance_id.name for runner in runners.unknown} logger.debug("Healthy runners: %s", healthy_runner_names) logger.debug("Unhealthy runners: %s", unhealthy_runner_names) logger.debug("Unknown health runners: %s", unknown_runner_names) @@ -386,31 +386,31 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - self._pull_runner_metrics(instance.server_name, ssh_conn) + self._pull_runner_metrics(instance.instance_id.name, ssh_conn) try: OpenStackRunnerManager._run_runner_removal_script( - instance.server_name, ssh_conn, remove_token + instance.instance_id.name, ssh_conn, remove_token ) except _GithubRunnerRemoveError: logger.warning( "Unable to run github runner removal script for %s", - instance.server_name, + instance.instance_id, stack_info=True, ) except SSHError: logger.exception( - "Failed to get SSH connection while removing %s", instance.server_name + "Failed to get SSH connection while removing %s", instance.instance_id ) logger.warning( - "Skipping runner remove script for %s due to SSH issues", instance.server_name + "Skipping runner remove script for %s due to SSH issues", instance.instance_id ) try: self._openstack_cloud.delete_instance(instance.instance_id) except OpenStackError: logger.exception( - "Unable to delete openstack instance for runner %s", instance.server_name + "Unable to delete openstack instance for runner %s", instance.instance_id ) def _get_runners_health(self) -> _RunnerHealth: @@ -431,7 +431,7 @@ def _get_runners_health(self) -> _RunnerHealth: else: unhealthy.append(runner) except OpenstackHealthCheckError: - logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, runner.server_name) + logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, runner.instance_id) unknown.append(runner) return _RunnerHealth( healthy=tuple(healthy), unhealthy=tuple(unhealthy), unknown=tuple(unknown) @@ -528,19 +528,19 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except KeyfileError: logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name + "Health check failed due to unable to find keyfile for %s", instance.instance_id ) return except SSHError: logger.exception( - "SSH connection failure with %s during flushing", instance.server_name + "SSH connection failure with %s during flushing", instance.instance_id ) raise # Using a single command to determine the state and kill the process if needed. # This makes it more robust when network is unstable. if busy: - logger.info("Attempting to kill all runner process on %s", instance.server_name) + logger.info("Attempting to kill all runner process on %s", instance.instance_id) # kill both Runner.Listener and Runner.Worker processes. # This kills pre-job.sh, a child process of Runner.Worker. kill_command = ( @@ -550,7 +550,7 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non ) else: logger.info( - "Attempting to kill runner process on %s if not busy", instance.server_name + "Attempting to kill runner process on %s if not busy", instance.instance_id ) # Only kill Runner.Listener if Runner.Worker does not exist. kill_command = ( @@ -582,32 +582,32 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SSHError as err: raise RunnerStartError( - f"Failed to SSH to {instance.server_name} during creation possible due to setup " + f"Failed to SSH to {instance.instance_id} during creation possible due to setup " "not completed" ) from err - logger.debug("Running `cloud-init status` on instance %s.", instance.server_name) + logger.debug("Running `cloud-init status` on instance %s.", instance.instance_id) result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=60) if not result.ok: logger.warning( - "cloud-init status command failed on %s: %s.", instance.server_name, result.stderr + "cloud-init status command failed on %s: %s.", instance.instance_id, result.stderr ) - raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") + raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}") # A short running job may have already completed and exited the runner, hence check the # condition via cloud-init status check. if CloudInitStatus.DONE in result.stdout: return - logger.debug("Running `ps aux` on instance %s.", instance.server_name) + logger.debug("Running `ps aux` on instance %s.", instance.instance_id) result = ssh_conn.run("ps aux", warn=True, timeout=60) if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s", instance.server_name) - raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}") + logger.warning("SSH run of `ps aux` failed on %s", instance.instance_id) + raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.instance_id}") # Runner startup process is the parent process of runner.Listener and runner.Worker which # starts up much faster. if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("Runner startup process not found on %s", instance.server_name) - raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") - logger.info("Runner startup process found to be healthy on %s", instance.server_name) + logger.warning("Runner startup process not found on %s", instance.instance_id) + raise RunnerStartError(f"Runner startup process not found on {instance.instance_sid}") + logger.info("Runner startup process found to be healthy on %s", instance.instance_id) @retry(tries=5, delay=60, local_logger=logger) def _wait_runner_running(self, instance: OpenstackInstance) -> None: @@ -623,7 +623,7 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) except SSHError as err: raise RunnerStartError( - f"Failed to SSH connect to {instance.server_name} openstack runner" + f"Failed to SSH connect to {instance.instance_id} openstack runner" ) from err try: @@ -632,15 +632,15 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None: ) except OpenstackHealthCheckError as exc: raise RunnerStartError( - f"Failed to check health of runner process on {instance.server_name}" + f"Failed to check health of runner process on {instance.instance_id}" ) from exc if not healthy: - logger.info("Runner %s not considered healthy", instance.server_name) + logger.info("Runner %s not considered healthy", instance.instance_id) raise RunnerStartError( - f"Runner {instance.server_name} failed to initialize after starting" + f"Runner {instance.instance_id} failed to initialize after starting" ) - logger.info("Runner %s found to be healthy", instance.server_name) + logger.info("Runner %s found to be healthy", instance.instance_id) def _init_metrics_storage(self, name: str, install_start_timestamp: float) -> None: """Create metrics storage for runner. @@ -777,7 +777,7 @@ def _ssh_pull_file( @staticmethod def _run_runner_removal_script( - instance_id: str, ssh_conn: SSHConnection, remove_token: str + instance_id: InstanceID, ssh_conn: SSHConnection, remove_token: str ) -> None: """Run Github runner removal script. diff --git a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py index 73d292ca7..2abd78326 100644 --- a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py +++ b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py @@ -266,9 +266,9 @@ def _health_checks_side_effect(openstack_cloud, instance): This implements the logic mentioned in the docstring above. """ - if instance.server_name.startswith("test-healthy"): + if instance.instance_id.name.startswith("test-healthy"): return True - if instance.server_name.startswith("test-unhealthy"): + if instance.instance_id.name.startswith("test-unhealthy"): return False raise OpenstackHealthCheckError("Health check failed") diff --git a/github-runner-manager/tests/unit/test_github_client.py b/github-runner-manager/tests/unit/test_github_client.py index 62f522bba..bddbf533e 100644 --- a/github-runner-manager/tests/unit/test_github_client.py +++ b/github-runner-manager/tests/unit/test_github_client.py @@ -15,6 +15,7 @@ from github_runner_manager.configuration.github import GitHubOrg, GitHubRepo from github_runner_manager.errors import GithubApiError, JobNotFoundError, TokenError from github_runner_manager.github_client import GithubClient +from github_runner_manager.manager.models import InstanceID from github_runner_manager.types_.github import JobConclusion, JobInfo, JobStatus JobStatsRawData = namedtuple( @@ -313,7 +314,7 @@ def test_get_registration_jittoken_repo(github_client: GithubClient): "encoded_jit_config": "hugestringinhere", } - instance_id = "test-runner-99999999" + instance_id = InstanceID.build("test-runner") labels = ["label1", "label2"] jittoken = github_client.get_runner_registration_jittoken( path=github_repo, instance_id=instance_id, labels=labels @@ -393,10 +394,12 @@ def raise_for_status(self): monkeypatch.setattr(requests, "get", _mock_get) + instance_id = InstanceID.build("test-runner") + def _mock_generate_runner_jitconfig_for_org(org, name, runner_group_id, labels): """Mocked generate_runner_jitconfig_for_org.""" assert org == "theorg" - assert name == "test-runner-99999999" + assert name == instance_id.name assert runner_group_id == 3 assert labels == ["label1", "label2"] return { @@ -420,7 +423,6 @@ def _mock_generate_runner_jitconfig_for_org(org, name, runner_group_id, labels): _mock_generate_runner_jitconfig_for_org ) - instance_id = "test-runner-99999999" labels = ["label1", "label2"] jittoken = github_client.get_runner_registration_jittoken( path=github_repo, instance_id=instance_id, labels=labels From be79be1762e58d9036cac7c640eafdb82ccb4ffd Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 15:18:24 +0100 Subject: [PATCH 03/13] more moving around --- .../github_runner_manager/manager/models.py | 2 +- .../github_runner_manager/metrics/runner.py | 67 +++++------ .../github_runner_manager/metrics/storage.py | 66 ++++++----- .../openstack_runner_manager.py | 36 +++--- .../tests/unit/metrics/test_runner.py | 109 +++++++++--------- .../tests/unit/metrics/test_storage.py | 46 ++++---- .../test_openstack_runner_manager.py | 26 +++-- 7 files changed, 185 insertions(+), 167 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py index 49edbe127..101957b78 100644 --- a/github-runner-manager/src/github_runner_manager/manager/models.py +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -7,7 +7,7 @@ from dataclasses import dataclass -@dataclass +@dataclass(eq=True, frozen=True) class InstanceID: """TODO. diff --git a/github-runner-manager/src/github_runner_manager/metrics/runner.py b/github-runner-manager/src/github_runner_manager/metrics/runner.py index 52a1891ee..f69716d33 100644 --- a/github-runner-manager/src/github_runner_manager/metrics/runner.py +++ b/github-runner-manager/src/github_runner_manager/metrics/runner.py @@ -18,6 +18,7 @@ IssueMetricEventError, RunnerMetricsError, ) +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics import events as metric_events from github_runner_manager.metrics.storage import MetricsStorage from github_runner_manager.metrics.storage import StorageManagerProtocol as MetricsStorageManager @@ -97,14 +98,14 @@ class RunnerMetrics(BaseModel): installed_timestamp: The UNIX time stamp of the time at which the runner was installed. pre_job: The metrics for the pre-job phase. post_job: The metrics for the post-job phase. - runner_name: The name of the runner. + instance_id: The name of the runner. """ installation_start_timestamp: NonNegativeFloat | None installed_timestamp: NonNegativeFloat pre_job: PreJobMetrics | None post_job: PostJobMetrics | None - runner_name: str + instance_id: InstanceID def extract( @@ -131,14 +132,14 @@ def extract( Extracted runner metrics of a particular runner. """ for ms in metrics_storage_manager.list_all(): - if (include and ms.runner_name in runners) or ( - not include and ms.runner_name not in runners + if (include and ms.instance_id in runners) or ( + not include and ms.instance_id not in runners ): runner_metrics = _extract_storage( metrics_storage_manager=metrics_storage_manager, metrics_storage=ms ) if not runner_metrics: - logger.warning("Not able to issue metrics for runner %s", ms.runner_name) + logger.warning("Not able to issue metrics for runner %s", ms.instance_id) else: yield runner_metrics @@ -180,7 +181,7 @@ def issue_events( else: logger.debug( "Pre-job metrics not found for runner %s. Will not issue RunnerStop metric.", - runner_metrics.runner_name, + runner_metrics.instance_id, ) except (ValidationError, IssueMetricEventError): if runner_metrics.installation_start_timestamp and not issued_events: @@ -188,7 +189,7 @@ def issue_events( "Not able to issue RunnerInstalled metric for runner %s with" " installation_start_timestamp %s." "Will not issue RunnerStart and RunnerStop metric.", - runner_metrics.runner_name, + runner_metrics.instance_id, runner_metrics.installation_start_timestamp, ) elif metric_events.RunnerStart not in issued_events: @@ -196,7 +197,7 @@ def issue_events( "Not able to issue RunnerStart metric for " "runner %s with pre-job metrics %s and job_metrics %s." "Will not issue RunnerStop metric.", - runner_metrics.runner_name, + runner_metrics.instance_id, runner_metrics.pre_job, job_metrics, ) @@ -204,7 +205,7 @@ def issue_events( logger.exception( "Not able to issue RunnerStop metric for " "runner %s with pre-job metrics %s, post-job metrics %s and job_metrics %s.", - runner_metrics.runner_name, + runner_metrics.instance_id, runner_metrics.pre_job, runner_metrics.post_job, job_metrics, @@ -234,7 +235,7 @@ def _issue_runner_installed( duration=runner_metrics.installed_timestamp # type: ignore - runner_metrics.installation_start_timestamp, # type: ignore ) - logger.debug("Issuing RunnerInstalled metric for runner %s", runner_metrics.runner_name) + logger.debug("Issuing RunnerInstalled metric for runner %s", runner_metrics.instance_id) metric_events.issue_event(runner_installed) return metric_events.RunnerInstalled @@ -255,7 +256,7 @@ def _issue_runner_start( """ runner_start_event = _create_runner_start(runner_metrics, flavor, job_metrics) - logger.debug("Issuing RunnerStart metric for runner %s", runner_metrics.runner_name) + logger.debug("Issuing RunnerStart metric for runner %s", runner_metrics.instance_id) metric_events.issue_event(runner_start_event) return metric_events.RunnerStart @@ -276,7 +277,7 @@ def _issue_runner_stop( """ runner_stop_event = _create_runner_stop(runner_metrics, flavor, job_metrics) - logger.debug("Issuing RunnerStop metric for runner %s", runner_metrics.runner_name) + logger.debug("Issuing RunnerStop metric for runner %s", runner_metrics.instance_id) metric_events.issue_event(runner_stop_event) return metric_events.RunnerStop @@ -314,7 +315,7 @@ def _create_runner_start( " Setting idle_duration to zero", pre_job_metrics.timestamp, runner_metrics.installed_timestamp, - runner_metrics.runner_name, + runner_metrics.instance_id, ) idle_duration = max(pre_job_metrics.timestamp - runner_metrics.installed_timestamp, 0) @@ -322,7 +323,7 @@ def _create_runner_start( if job_metrics and job_metrics.queue_duration < 0: logger.warning( "Queue duration for runner %s is negative: %f. Setting it to zero.", - runner_metrics.runner_name, + runner_metrics.instance_id, job_metrics.queue_duration, ) queue_duration = max(job_metrics.queue_duration, 0) if job_metrics else None @@ -378,7 +379,7 @@ def _create_runner_stop( " Setting job_duration to zero", post_job_metrics.timestamp, pre_job_metrics.timestamp, - runner_metrics.runner_name, + runner_metrics.instance_id, ) job_duration = max(post_job_metrics.timestamp - pre_job_metrics.timestamp, 0) @@ -408,16 +409,16 @@ def _extract_storage( Returns: The extracted metrics if at least the runner installed timestamp is present. """ - runner_name = metrics_storage.runner_name + instance_id = metrics_storage.instance_id try: - logger.debug("Extracting metrics from metrics storage for runner %s", runner_name) + logger.debug("Extracting metrics from metrics storage for runner %s", instance_id) metrics_from_fs = _extract_metrics_from_storage(metrics_storage) except CorruptMetricDataError: - logger.exception("Corrupt metric data found for runner %s", runner_name) - metrics_storage_manager.move_to_quarantine(runner_name) + logger.exception("Corrupt metric data found for runner %s", instance_id) + metrics_storage_manager.move_to_quarantine(instance_id) return None - logger.debug("Cleaning metrics storage for runner %s", runner_name) + logger.debug("Cleaning metrics storage for runner %s", instance_id) _clean_up_storage( metrics_storage_manager=metrics_storage_manager, metrics_storage=metrics_storage ) @@ -442,12 +443,12 @@ def _extract_metrics_from_storage(metrics_storage: MetricsStorage) -> Optional[R f"The limit is {FILE_SIZE_BYTES_LIMIT} bytes." ) - runner_name = metrics_storage.runner_name + instance_id = metrics_storage.instance_id installation_start_timestamp = _extract_file_from_storage( metrics_storage=metrics_storage, filename=RUNNER_INSTALLATION_START_TS_FILE_NAME ) - logger.debug("Runner %s installation started at %s", runner_name, installation_start_timestamp) + logger.debug("Runner %s installation started at %s", instance_id, installation_start_timestamp) installed_timestamp = _extract_file_from_storage( metrics_storage=metrics_storage, filename=RUNNER_INSTALLED_TS_FILE_NAME @@ -455,26 +456,26 @@ def _extract_metrics_from_storage(metrics_storage: MetricsStorage) -> Optional[R if not installed_timestamp: logger.error( "installed timestamp not found for runner %s, will not extract any metrics.", - runner_name, + instance_id, ) return None - logger.debug("Runner %s installed at %s", runner_name, installed_timestamp) + logger.debug("Runner %s installed at %s", instance_id, installed_timestamp) pre_job_metrics = _extract_json_file_from_storage( metrics_storage=metrics_storage, filename=PRE_JOB_METRICS_FILE_NAME ) if pre_job_metrics: - logger.debug("Pre-job metrics for runner %s: %s", runner_name, pre_job_metrics) + logger.debug("Pre-job metrics for runner %s: %s", instance_id, pre_job_metrics) post_job_metrics = _extract_json_file_from_storage( metrics_storage=metrics_storage, filename=POST_JOB_METRICS_FILE_NAME ) - logger.debug("Post-job metrics for runner %s: %s", runner_name, post_job_metrics) + logger.debug("Post-job metrics for runner %s: %s", instance_id, post_job_metrics) else: logger.error( "Pre-job metrics for runner %s not found, stop extracting post-jobs metrics.", - runner_name, + instance_id, ) post_job_metrics = None @@ -486,7 +487,7 @@ def _extract_metrics_from_storage(metrics_storage: MetricsStorage) -> Optional[R installed_timestamp=float(installed_timestamp), pre_job=PreJobMetrics(**pre_job_metrics) if pre_job_metrics else None, post_job=PostJobMetrics(**post_job_metrics) if post_job_metrics else None, - runner_name=runner_name, + instance_id=instance_id, ) except ValueError as exc: raise CorruptMetricDataError(str(exc)) from exc @@ -538,7 +539,7 @@ def _extract_json_file_from_storage(metrics_storage: MetricsStorage, filename: s raise CorruptMetricDataError(str(exc)) from exc if not isinstance(job_metrics, dict): raise CorruptMetricDataError( - f"{filename} metrics for runner {metrics_storage.runner_name} is not a JSON object." + f"{filename} metrics for runner {metrics_storage.instance_id} is not a JSON object." ) return job_metrics @@ -556,7 +557,7 @@ def _extract_file_from_storage(metrics_storage: MetricsStorage, filename: str) - try: file_content = metrics_storage.path.joinpath(filename).read_text(encoding="utf-8") except FileNotFoundError: - logger.exception("%s not found for runner %s", filename, metrics_storage.runner_name) + logger.exception("%s not found for runner %s", filename, metrics_storage.instance_id) file_content = None return file_content @@ -580,12 +581,12 @@ def _clean_up_storage( logger.exception( "Could not remove metric files for runner %s, " "this may lead to duplicate metrics issued", - metrics_storage.runner_name, + metrics_storage.instance_id, ) try: - metrics_storage_manager.delete(metrics_storage.runner_name) + metrics_storage_manager.delete(metrics_storage.instance_id) except DeleteMetricsStorageError: logger.exception( - "Could not delete metrics storage for runner %s.", metrics_storage.runner_name + "Could not delete metrics storage for runner %s.", metrics_storage.instance_id ) diff --git a/github-runner-manager/src/github_runner_manager/metrics/storage.py b/github-runner-manager/src/github_runner_manager/metrics/storage.py index 131aa8634..b43a8025f 100644 --- a/github-runner-manager/src/github_runner_manager/metrics/storage.py +++ b/github-runner-manager/src/github_runner_manager/metrics/storage.py @@ -19,6 +19,7 @@ GetMetricsStorageError, QuarantineMetricsStorageError, ) +from github_runner_manager.manager.models import InstanceID _FILESYSTEM_BASE_DIR_NAME = "runner-fs" _FILESYSTEM_QUARANTINE_DIR_NAME = "runner-fs-quarantine" @@ -32,11 +33,11 @@ class MetricsStorage: Attributes: path: The path to the directory holding the metrics inside the charm. - runner_name: The name of the associated runner. + instance_id: The name of the associated runner. """ path: Path - runner_name: str + instance_id: InstanceID class StorageManagerProtocol(Protocol): # pylint: disable=too-few-public-methods @@ -61,8 +62,12 @@ class StorageManagerProtocol(Protocol): # pylint: disable=too-few-public-method class StorageManager(StorageManagerProtocol): """Manager for the metrics storage.""" - def __init__(self) -> None: - """Initialize the storage manager.""" + def __init__(self, prefix: str) -> None: + """Initialize the storage manager. + + Args: + prefix: TODO + """ self._base_dir = ( Path(f"~{constants.RUNNER_MANAGER_USER}").expanduser() / _FILESYSTEM_BASE_DIR_NAME ) @@ -70,15 +75,16 @@ def __init__(self) -> None: Path(f"~{constants.RUNNER_MANAGER_USER}").expanduser() / _FILESYSTEM_QUARANTINE_DIR_NAME ) + self._prefix = prefix - def create(self, runner_name: str) -> MetricsStorage: + def create(self, instance_id: InstanceID) -> MetricsStorage: """Create metrics storage for the runner. The method is not idempotent and will raise an exception if the storage already exists. Args: - runner_name: The name of the runner. + instance_id: The name of the runner. Returns: The metrics storage object. @@ -119,16 +125,16 @@ def create(self, runner_name: str) -> MetricsStorage: "Failed to create metrics storage directories" ) from exc - runner_fs_path = self._get_runner_fs_path(runner_name=runner_name) + runner_fs_path = self._get_runner_fs_path(instance_id=instance_id) try: runner_fs_path.mkdir() except FileExistsError as exc: raise CreateMetricsStorageError( - f"Metrics storage for runner {runner_name} already exists." + f"Metrics storage for runner {instance_id} already exists." ) from exc - return MetricsStorage(runner_fs_path, runner_name) + return MetricsStorage(runner_fs_path, instance_id) def list_all(self) -> Iterator[MetricsStorage]: """List all the metric storages. @@ -142,17 +148,17 @@ def list_all(self) -> Iterator[MetricsStorage]: directories = (entry for entry in self._base_dir.iterdir() if entry.is_dir()) for directory in directories: try: - fs = self.get(runner_name=directory.name) + fs = self.get(instance_id=InstanceID.build_from_name(self._prefix, directory.name)) except GetMetricsStorageError: logger.error("Failed to get metrics storage for runner %s", directory.name) else: yield fs - def get(self, runner_name: str) -> MetricsStorage: + def get(self, instance_id: InstanceID) -> MetricsStorage: """Get the metrics storage for the runner. Args: - runner_name: The name of the runner. + instance_id: The name of the runner. Returns: The metrics storage object. @@ -161,73 +167,73 @@ def get(self, runner_name: str) -> MetricsStorage: GetMetricsStorageError: If the storage does not exist. """ runner_fs_path = self._get_runner_fs_path( - runner_name=runner_name, + instance_id=instance_id, ) if not runner_fs_path.exists(): - raise GetMetricsStorageError(f"Metrics storage for runner {runner_name} not found.") + raise GetMetricsStorageError(f"Metrics storage for runner {instance_id} not found.") - return MetricsStorage(runner_fs_path, runner_name) + return MetricsStorage(runner_fs_path, instance_id) - def delete(self, runner_name: str) -> None: + def delete(self, instance_id: InstanceID) -> None: """Delete the metrics storage for the runner. Args: - runner_name: The name of the runner. + instance_id: The name of the runner. Raises: DeleteMetricsStorageError: If the storage could not be deleted. """ - runner_fs_path = self._get_runner_fs_path(runner_name=runner_name) + runner_fs_path = self._get_runner_fs_path(instance_id=instance_id) try: shutil.rmtree(runner_fs_path) except OSError as exc: raise DeleteMetricsStorageError( - f"Failed to remove metrics storage for runner {runner_name}" + f"Failed to remove metrics storage for runner {instance_id}" ) from exc def move_to_quarantine( self, - runner_name: str, + instance_id: InstanceID, ) -> None: """Archive the metrics storage for the runner and delete it. Args: - runner_name: The name of the runner. + instance_id: The name of the runner. Raises: QuarantineMetricsStorageError: If the metrics storage could not be quarantined. """ try: - runner_fs = self.get(runner_name) + runner_fs = self.get(instance_id) except GetMetricsStorageError as exc: raise QuarantineMetricsStorageError( - f"Failed to get metrics storage for runner {runner_name}" + f"Failed to get metrics storage for runner {instance_id}" ) from exc - tarfile_path = self._quarantine_dir.joinpath(runner_name).with_suffix(".tar.gz") + tarfile_path = self._quarantine_dir.joinpath(str(instance_id)).with_suffix(".tar.gz") try: with tarfile.open(tarfile_path, "w:gz") as tar: tar.add(runner_fs.path, arcname=runner_fs.path.name) except OSError as exc: raise QuarantineMetricsStorageError( - f"Failed to archive metrics storage for runner {runner_name}" + f"Failed to archive metrics storage for runner {instance_id}" ) from exc try: - self.delete(runner_name) + self.delete(instance_id) except DeleteMetricsStorageError as exc: raise QuarantineMetricsStorageError( - f"Failed to delete metrics storage for runner {runner_name}" + f"Failed to delete metrics storage for runner {instance_id}" ) from exc - def _get_runner_fs_path(self, runner_name: str) -> Path: + def _get_runner_fs_path(self, instance_id: InstanceID) -> Path: """Get the path of the runner metrics storage. Args: - runner_name: The name of the runner. + instance_id: The name of the runner. Returns: The path of the runner shared filesystem. """ - return self._base_dir / runner_name + return self._base_dir / str(instance_id) diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index c9a4bd7f6..ff7d22233 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -149,7 +149,7 @@ def __init__( prefix=self.name_prefix, system_user=constants.RUNNER_MANAGER_USER, ) - self._metrics_storage_manager = metrics_storage.StorageManager() + self._metrics_storage_manager = metrics_storage.StorageManager(self._config.prefix) # Setting the env var to this process and any child process spawned. proxies = config.service_config.proxy_config @@ -184,7 +184,9 @@ def create_runner(self, instance_id: InstanceID, registration_jittoken: str) -> raise MissingServerConfigError("Missing server configuration to create runners") start_timestamp = time.time() - self._init_metrics_storage(name=instance_id, install_start_timestamp=start_timestamp) + self._init_metrics_storage( + instance_id=instance_id, install_start_timestamp=start_timestamp + ) cloud_init = self._generate_cloud_init(registration_jittoken=registration_jittoken) try: @@ -363,11 +365,11 @@ def _cleanup_extract_metrics( unmatched_metrics_storage = ( ms for ms in metrics_storage_manager.list_all() - if ms.runner_name not in all_runner_names + if ms.instance_id not in all_runner_names ) # We assume that storage is dangling if it has not been updated for a long time. dangling_storage_runner_names = { - ms.runner_name + ms.instance_id for ms in unmatched_metrics_storage if ms.path.stat().st_mtime < time.time() - OUTDATED_METRICS_STORAGE_IN_SECONDS } @@ -386,7 +388,7 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None """ try: ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - self._pull_runner_metrics(instance.instance_id.name, ssh_conn) + self._pull_runner_metrics(instance.instance_id, ssh_conn) try: OpenStackRunnerManager._run_runner_removal_script( @@ -642,7 +644,9 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None: logger.info("Runner %s found to be healthy", instance.instance_id) - def _init_metrics_storage(self, name: str, install_start_timestamp: float) -> None: + def _init_metrics_storage( + self, instance_id: InstanceID, install_start_timestamp: float + ) -> None: """Create metrics storage for runner. An error will be logged if the storage cannot be created. @@ -650,16 +654,16 @@ def _init_metrics_storage(self, name: str, install_start_timestamp: float) -> No and not fail for other operations. Args: - name: The name of the runner. + instance_id: The name of the runner. install_start_timestamp: The timestamp of installation start. """ try: - storage = self._metrics_storage_manager.create(runner_name=name) + storage = self._metrics_storage_manager.create(instance_id=instance_id) except CreateMetricsStorageError: logger.exception( "Failed to create metrics storage for runner %s, " "will not be able to issue all metrics.", - name, + instance_id, ) else: try: @@ -670,24 +674,24 @@ def _init_metrics_storage(self, name: str, install_start_timestamp: float) -> No logger.exception( f"Failed to write {runner_metrics.RUNNER_INSTALLATION_START_TS_FILE_NAME}" f" into metrics storage for runner %s, will not be able to issue all metrics.", - name, + instance_id, ) - def _pull_runner_metrics(self, name: str, ssh_conn: SSHConnection) -> None: + def _pull_runner_metrics(self, instance_id: InstanceID, ssh_conn: SSHConnection) -> None: """Pull metrics from runner. Args: - name: The name of the runner. + instance_id: The name of the runner. ssh_conn: The SSH connection to the runner. """ - logger.debug("Pulling metrics for %s", name) + logger.debug("Pulling metrics for %s", instance_id) try: - storage = self._metrics_storage_manager.get(runner_name=name) + storage = self._metrics_storage_manager.get(instance_id=instance_id) except GetMetricsStorageError: logger.exception( "Failed to get shared metrics storage for runner %s, " "will not be able to issue all metrics.", - name, + instance_id, ) return @@ -713,7 +717,7 @@ def _pull_runner_metrics(self, name: str, ssh_conn: SSHConnection) -> None: except _PullFileError as exc: logger.warning( "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - name, + instance_id, exc, ) diff --git a/github-runner-manager/tests/unit/metrics/test_runner.py b/github-runner-manager/tests/unit/metrics/test_runner.py index 023212c8d..fb73c8538 100644 --- a/github-runner-manager/tests/unit/metrics/test_runner.py +++ b/github-runner-manager/tests/unit/metrics/test_runner.py @@ -8,6 +8,7 @@ import pytest from github_runner_manager.errors import DeleteMetricsStorageError, IssueMetricEventError +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics import events as metric_events from github_runner_manager.metrics import runner as runner_metrics from github_runner_manager.metrics import type as metrics_type @@ -38,11 +39,11 @@ def runner_fs_base_fixture(tmp_path: Path) -> Path: return runner_fs_base -def _create_metrics_data(runner_name: str) -> RunnerMetrics: +def _create_metrics_data(instance_id: InstanceID) -> RunnerMetrics: """Create a RunnerMetrics object that is suitable for most tests. Args: - runner_name: The test runner name. + instance_id: The test runner name. Returns: Test metrics data. @@ -58,7 +59,7 @@ def _create_metrics_data(runner_name: str) -> RunnerMetrics: event="push", ), post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, + instance_id=instance_id, ) @@ -78,7 +79,7 @@ def _create_runner_fs_base(tmp_path: Path): def _create_runner_files( runner_fs_base: Path, - runner_name: str, + runner_name: InstanceID, pre_job_data: str | bytes | None, post_job_data: str | bytes | None, installed_timestamp: str | bytes | None, @@ -100,7 +101,7 @@ def _create_runner_files( Returns: A SharedFilesystem instance. """ - runner_fs = runner_fs_base / runner_name + runner_fs = runner_fs_base / str(runner_name) runner_fs.mkdir() if pre_job_data: if isinstance(pre_job_data, bytes): @@ -137,7 +138,7 @@ def _create_runner_files( runner_fs.joinpath(runner_metrics.RUNNER_INSTALLATION_START_TS_FILE_NAME).write_text( installation_start_timestamp, encoding="utf-8" ) - return MetricsStorage(path=runner_fs, runner_name=runner_name) + return MetricsStorage(path=runner_fs, instance_id=runner_name) def test_extract(runner_fs_base: Path): @@ -153,22 +154,22 @@ def test_extract(runner_fs_base: Path): 1. - 4. metrics are extracted 5. no metrics are extracted """ - runner_all_metrics_name = secrets.token_hex(16) + runner_all_metrics_name = InstanceID.build("prefix") runner_all_metrics = _create_metrics_data(runner_all_metrics_name) - runner_without_install_start_ts_name = secrets.token_hex(16) + runner_without_install_start_ts_name = InstanceID.build("prefix") runner_without_install_start_ts_metrics = runner_all_metrics.copy( update={"installation_start_timestamp": None} ) - runner_without_install_start_ts_metrics.runner_name = runner_without_install_start_ts_name - runner_wihout_post_job_name = secrets.token_hex(16) + runner_without_install_start_ts_metrics.instance_id = runner_without_install_start_ts_name + runner_wihout_post_job_name = InstanceID.build("prefix") runner_without_post_job_metrics = runner_all_metrics.copy() runner_without_post_job_metrics.post_job = None - runner_without_post_job_metrics.runner_name = runner_wihout_post_job_name - runner_with_only_install_timestamps_name = secrets.token_hex(16) + runner_without_post_job_metrics.instance_id = runner_wihout_post_job_name + runner_with_only_install_timestamps_name = InstanceID.build("prefix") runner_with_only_install_timestamps_metrics = runner_without_post_job_metrics.copy( update={"pre_job": None} ) - runner_with_only_install_timestamps_metrics.runner_name = ( + runner_with_only_install_timestamps_metrics.instance_id = ( runner_with_only_install_timestamps_name ) @@ -236,11 +237,11 @@ def test_extract(runner_fs_base: Path): ] metrics_storage_manager.delete.assert_has_calls( [ - ((runner1_fs.runner_name,),), - ((runner2_fs.runner_name,),), - ((runner3_fs.runner_name,),), - ((runner4_fs.runner_name,),), - ((runner5_fs.runner_name,),), + ((runner1_fs.instance_id,),), + ((runner2_fs.instance_id,),), + ((runner3_fs.instance_id,),), + ((runner4_fs.instance_id,),), + ((runner5_fs.instance_id,),), ] ) @@ -255,7 +256,7 @@ def test_extract_ignores_runners(runner_fs_base: Path): runner_filesystems = [] for i in range(5): - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") data = _create_metrics_data(runner_name) data.pre_job.workflow = f"workflow{i}" runner_metrics_data.append(data) @@ -272,7 +273,7 @@ def test_extract_ignores_runners(runner_fs_base: Path): metrics_storage_manager = MagicMock() metrics_storage_manager.list_all.return_value = runner_filesystems - ignore_runners = {runner_filesystems[0].runner_name, runner_filesystems[2].runner_name} + ignore_runners = {runner_filesystems[0].instance_id, runner_filesystems[2].instance_id} extracted_metrics = list( runner_metrics.extract( @@ -295,8 +296,8 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa act: Call extract. assert: No metrics are extracted is issued and shared filesystems are quarantined in all cases. """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) + runner_name = InstanceID.build("prefix") + runner_metrics_data = _create_metrics_data(instance_id=runner_name) # 1. Runner has noncompliant pre-job metrics inside metrics storage invalid_pre_job_data = runner_metrics_data.pre_job.copy(update={"timestamp": -1}) @@ -317,11 +318,11 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 2. Runner has non-json post-job metrics inside metrics storage. - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) + runner_name = InstanceID.build("prefix") + runner_metrics_data = _create_metrics_data(instance_id=runner_name) runner_fs = _create_runner_files( runner_fs_base, @@ -336,11 +337,11 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 3. Runner has json post-job metrics but a json array (not object) inside metrics storage. - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) + runner_name = InstanceID.build("prefix") + runner_metrics_data = _create_metrics_data(instance_id=runner_name) runner_fs = _create_runner_files( runner_fs_base, @@ -355,11 +356,11 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 4. Runner has not a timestamp in installed_timestamp file inside metrics storage. - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) + runner_name = InstanceID.build("prefix") + runner_metrics_data = _create_metrics_data(instance_id=runner_name) runner_fs = _create_runner_files( runner_fs_base, @@ -375,11 +376,11 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 5. Runner has not a timestamp in installation_start_timestamp file inside metrics storage. - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) + runner_name = InstanceID.build("prefix") + runner_metrics_data = _create_metrics_data(instance_id=runner_name) runner_fs = _create_runner_files( runner_fs_base, @@ -396,7 +397,7 @@ def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPa ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) def test_extract_raises_error_for_too_large_files(runner_fs_base: Path): @@ -405,7 +406,7 @@ def test_extract_raises_error_for_too_large_files(runner_fs_base: Path): act: Call extract. assert: No metrics are extracted and shared filesystems is quarantined. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) # 1. Runner has a pre-job metrics file that is too large @@ -432,10 +433,10 @@ def test_extract_raises_error_for_too_large_files(runner_fs_base: Path): ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 2. Runner has a post-job metrics file that is too large - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) invalid_post_job_data = runner_metrics_data.post_job.copy( update={"status": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"} @@ -455,10 +456,10 @@ def test_extract_raises_error_for_too_large_files(runner_fs_base: Path): assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 3. Runner has an installed_timestamp file that is too large - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) invalid_ts = "1" * (runner_metrics.FILE_SIZE_BYTES_LIMIT + 1) @@ -477,10 +478,10 @@ def test_extract_raises_error_for_too_large_files(runner_fs_base: Path): ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) # 4. Runner has an installation_start_timestamp file that is too large - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) invalid_ts = "1" * (runner_metrics.FILE_SIZE_BYTES_LIMIT + 1) @@ -500,7 +501,7 @@ def test_extract_raises_error_for_too_large_files(runner_fs_base: Path): ) assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(runner_fs.runner_name) + move_to_quarantine_mock.assert_any_call(runner_fs.instance_id) def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): @@ -509,7 +510,7 @@ def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): act: Call extract. assert: No metrics are extracted and shared filesystem is removed. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = RunnerMetrics.construct( installed_timestamp=1, pre_job=PreJobMetrics( @@ -520,7 +521,7 @@ def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): event="push", ), post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, + instance_id=runner_name, ) runner_fs = _create_runner_files( @@ -537,7 +538,7 @@ def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) ) assert not extracted_metrics - metrics_storage_manager.delete.assert_called_once_with(runner_fs.runner_name) + metrics_storage_manager.delete.assert_called_once_with(runner_fs.instance_id) def test_extract_ignores_failure_on_metrics_storage_cleanup( @@ -549,11 +550,11 @@ def test_extract_ignores_failure_on_metrics_storage_cleanup( act: Call extract. assert: The metric is extracted and the exception is caught and logged. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) runner_fs = _create_runner_files( runner_fs_base, - runner_metrics_data.runner_name, + runner_metrics_data.instance_id, runner_metrics_data.pre_job.json(), runner_metrics_data.post_job.json(), str(runner_metrics_data.installed_timestamp), @@ -581,7 +582,7 @@ def test_issue_events(issue_event_mock: MagicMock): act: Call issue_events. assert: RunnerInstalled, RunnerStart and RunnerStop metrics are issued. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) flavor = secrets.token_hex(16) @@ -641,7 +642,7 @@ def test_issue_events_pre_job_before_runner_installed(issue_event_mock: MagicMoc act: Call issue_events. assert: RunnerStart metric is issued with idle set to 0. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) runner_metrics_data.pre_job.timestamp = 0 @@ -676,7 +677,7 @@ def test_issue_events_post_job_before_pre_job(issue_event_mock: MagicMock): act: Call issue_events. assert: job_duration is set to zero. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) runner_metrics_data.post_job = PostJobMetrics( timestamp=0, status=runner_metrics.PostJobStatus.NORMAL @@ -734,7 +735,7 @@ def test_issue_events_partial_metrics( act: Call issue_events. assert: Only the expected metrics are issued. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) if not with_installation_start: runner_metrics_data.installation_start_timestamp = None @@ -803,7 +804,7 @@ def test_issue_events_returns_empty_set_on_issue_event_failure( act: Call issue_events. assert: No metrics at all are issued. The exception is caught and logged. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) issue_event_mock.side_effect = [IssueMetricEventError("Failed to issue metric"), None] @@ -828,7 +829,7 @@ def test_issue_events_post_job_but_no_pre_job( act: Call issue_events. assert: Only RunnerInstalled is issued. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") runner_metrics_data = _create_metrics_data(runner_name) runner_metrics_data.pre_job = None diff --git a/github-runner-manager/tests/unit/metrics/test_storage.py b/github-runner-manager/tests/unit/metrics/test_storage.py index 125281e36..b29ee6d01 100644 --- a/github-runner-manager/tests/unit/metrics/test_storage.py +++ b/github-runner-manager/tests/unit/metrics/test_storage.py @@ -1,7 +1,6 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. import os -import secrets import tarfile from grp import getgrgid from pathlib import Path @@ -15,6 +14,7 @@ GetMetricsStorageError, QuarantineMetricsStorageError, ) +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics import storage from github_runner_manager.metrics.storage import ( _FILESYSTEM_BASE_DIR_NAME, @@ -54,9 +54,9 @@ def test_create_creates_directory(): act: Call create. assert: The directory is created. """ - runner_name = secrets.token_hex(16) + runner_name = InstanceID.build("prefix") - fs = StorageManager().create(runner_name) + fs = StorageManager("prefix").create(runner_name) assert fs.path.exists() assert fs.path.is_dir() @@ -68,8 +68,8 @@ def test_create_raises_exception_if_already_exists(): act: Call create. assert: The expected exception is raised. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") storage_manager.create(runner_name) with pytest.raises(CreateMetricsStorageError): @@ -82,8 +82,8 @@ def test_list_all(): act: Call list_all. assert: A generator listing all the shared filesystems is returned. """ - runner_names = [secrets.token_hex(16) for _ in range(3)] - storage_manager = StorageManager() + runner_names = [InstanceID.build("prefix") for _ in range(3)] + storage_manager = StorageManager("prefix") for runner_name in runner_names: storage_manager.create(runner_name) @@ -92,7 +92,7 @@ def test_list_all(): assert len(fs_list) == 3 for fs in fs_list: assert isinstance(fs, storage.MetricsStorage) - assert fs.runner_name in runner_names + assert fs.instance_id in runner_names def test_list_all_empty(): @@ -101,7 +101,7 @@ def test_list_all_empty(): act: Call list_all. assert: An empty iterator is returned. """ - fs_list = list(StorageManager().list_all()) + fs_list = list(StorageManager("prefix").list_all()) assert len(fs_list) == 0 @@ -112,8 +112,8 @@ def test_delete(): act: Call delete assert: The storage is deleted. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") storage_manager.create(runner_name) storage_manager.delete(runner_name) @@ -128,8 +128,8 @@ def test_delete_raises_error(): act: Call delete. assert: A DeleteMetricsStorageError is raised. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") with pytest.raises(DeleteMetricsStorageError): storage_manager.delete(runner_name) @@ -141,14 +141,14 @@ def test_get(): act: Call create and get. assert: A metrics storage object for this runner is returned. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") storage_manager.create(runner_name) ms = storage_manager.get(runner_name) assert isinstance(ms, MetricsStorage) - assert ms.runner_name == runner_name + assert ms.instance_id == runner_name def test_get_raises_error_if_not_found(): @@ -157,8 +157,8 @@ def test_get_raises_error_if_not_found(): act: Call get. assert: A GetMetricsStorageError is raised. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") with pytest.raises(GetMetricsStorageError): storage_manager.get(runner_name) @@ -170,14 +170,14 @@ def test_quarantine(filesystem_paths: dict[str, Path], tmp_path: Path): act: Call quarantine. assert: The storage is moved to the quarantine. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") ms = storage_manager.create(runner_name) ms.path.joinpath("test.txt").write_text("foo bar") storage_manager.move_to_quarantine(runner_name) - tarfile_path = filesystem_paths["quarantine"].joinpath(runner_name).with_suffix(".tar.gz") + tarfile_path = filesystem_paths["quarantine"].joinpath(str(runner_name)).with_suffix(".tar.gz") assert tarfile_path.exists() tarfile.open(tarfile_path).extractall(path=tmp_path) assert tmp_path.joinpath(f"{runner_name}/test.txt").exists() @@ -191,8 +191,8 @@ def test_quarantine_raises_error(): act: Call quarantine. assert: A QuarantineMetricsStorageError is raised. """ - runner_name = secrets.token_hex(16) - storage_manager = StorageManager() + runner_name = InstanceID.build("prefix") + storage_manager = StorageManager("prefix") with pytest.raises(QuarantineMetricsStorageError): storage_manager.move_to_quarantine(runner_name) diff --git a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py index 2abd78326..3a688957d 100644 --- a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py +++ b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py @@ -11,6 +11,7 @@ from github_runner_manager.configuration import SupportServiceConfig from github_runner_manager.errors import OpenstackHealthCheckError +from github_runner_manager.manager.models import InstanceID from github_runner_manager.metrics import runner from github_runner_manager.metrics.storage import MetricsStorage, StorageManager from github_runner_manager.openstack_cloud import ( @@ -47,7 +48,6 @@ def openstack_runner_manager_fixture(monkeypatch: pytest.MonkeyPatch) -> OpenSta prefix="test", credentials=MagicMock(), server_config=MagicMock(), - runner_config=MagicMock(), service_config=service_config_mock, ) @@ -67,16 +67,22 @@ def runner_metrics_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: "expected_storage_to_be_extracted", [ pytest.param( - default_healthy := {"healthy1", "healthy2"}, - default_unhealthy := {"unhealthy1", "unhealthy2"}, + default_healthy := { + InstanceID.build(prefix := "prefix", "healthy1"), + InstanceID.build(prefix, "healthy2"), + }, + default_unhealthy := { + InstanceID.build(prefix, "unhealthy1"), + InstanceID.build(prefix, "unhealthy2"), + }, default_undecided := { ("in_construction", datetime.now()), ( - "dangling", + dangling := InstanceID.build(prefix, "dangling"), datetime.now() - timedelta(seconds=OUTDATED_METRICS_STORAGE_IN_SECONDS + 1), ), }, - default_result := default_unhealthy | {"dangling"}, + default_result := default_unhealthy | {dangling}, id="one dangling", ), pytest.param( @@ -85,15 +91,15 @@ def runner_metrics_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: { ("in_construction", datetime.now()), ( - "dangling", + dangling, datetime.now() - timedelta(seconds=OUTDATED_METRICS_STORAGE_IN_SECONDS + 1), ), ( - "dangling2", + dangling2 := InstanceID.build(prefix, "dangling2"), datetime.now() - timedelta(seconds=OUTDATED_METRICS_STORAGE_IN_SECONDS + 1), ), }, - default_unhealthy | {"dangling", "dangling2"}, + default_unhealthy | {dangling, dangling2}, id="two dangling", ), pytest.param( @@ -107,7 +113,7 @@ def runner_metrics_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: default_healthy, set(), default_undecided, - {"dangling"}, + {dangling}, id="no unhealthy", ), pytest.param( @@ -229,7 +235,7 @@ def _create_metrics_storage(runner_name: str, mtime: datetime) -> MetricsStorage Returns: A metric storage mock object. """ - metrics_storage = MetricsStorage(runner_name=runner_name, path=MagicMock(spec=Path)) + metrics_storage = MetricsStorage(instance_id=runner_name, path=MagicMock(spec=Path)) stat = MagicMock() stat_mock = MagicMock(return_value=stat) stat.st_mtime = mtime.timestamp() From b461e60e47e514d5f25d05439c841207299721a5 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 15:22:21 +0100 Subject: [PATCH 04/13] fix key str --- .../openstack_cloud/openstack_cloud.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py index 9d8ca1429..bfca8cbf1 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py @@ -466,34 +466,36 @@ def _get_and_ensure_unique_server( return latest_server - def _get_key_path(self, name: str) -> Path: + def _get_key_path(self, instance_id: InstanceID) -> Path: """Get the filepath for storing private SSH of a runner. Args: - name: The name of the runner. + instance_id: The name of the runner. Returns: Path to reserved for the key file of the runner. """ - return self._ssh_key_dir / f"{name}.key" + return self._ssh_key_dir / f"{instance_id}.key" - def _setup_keypair(self, conn: OpenstackConnection, name: str) -> OpenstackKeypair: + def _setup_keypair( + self, conn: OpenstackConnection, instance_id: InstanceID + ) -> OpenstackKeypair: """Create OpenStack keypair. Args: conn: The connection object to access OpenStack cloud. - name: The name of the keypair. + instance_id: The name of the keypair. Returns: The OpenStack keypair. """ - key_path = self._get_key_path(name) + key_path = self._get_key_path(instance_id) if key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) + logger.warning("Existing private key file for %s found, removing it.", instance_id) key_path.unlink(missing_ok=True) - keypair = conn.create_keypair(name=name) + keypair = conn.create_keypair(name=str(instance_id)) key_path.write_text(keypair.private_key) # the charm executes this as root, so we need to change the ownership of the key file shutil.chown(key_path, user=self._system_user) From 0df1b3e491f1c9ec5f1c464776a205983fe48d21 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 15:49:16 +0100 Subject: [PATCH 05/13] add new exception for invalid runner names --- .../src/github_runner_manager/metrics/storage.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/github-runner-manager/src/github_runner_manager/metrics/storage.py b/github-runner-manager/src/github_runner_manager/metrics/storage.py index b43a8025f..dfeb7e750 100644 --- a/github-runner-manager/src/github_runner_manager/metrics/storage.py +++ b/github-runner-manager/src/github_runner_manager/metrics/storage.py @@ -151,6 +151,12 @@ def list_all(self) -> Iterator[MetricsStorage]: fs = self.get(instance_id=InstanceID.build_from_name(self._prefix, directory.name)) except GetMetricsStorageError: logger.error("Failed to get metrics storage for runner %s", directory.name) + except ValueError: + logger.exception( + "Failed to get metrics storage for runner %s and prefix %s", + directory.name, + self._prefix, + ) else: yield fs From 102e5a49d7c7289f17c2def59fb59839f07421bd Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 16:41:54 +0100 Subject: [PATCH 06/13] Let's see if we can put a capital R in the name --- .../github_runner_manager/manager/models.py | 14 ++-- .../manager/runner_manager.py | 10 ++- .../reactive/consumer.py | 2 +- .../tests/unit/manager/test_models.py | 64 +++++++++++++++++++ .../tests/unit/reactive/test_consumer.py | 2 +- 5 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 github-runner-manager/tests/unit/manager/test_models.py diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py index 101957b78..b7e92e31f 100644 --- a/github-runner-manager/src/github_runner_manager/manager/models.py +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -6,6 +6,8 @@ import secrets from dataclasses import dataclass +INSTANCE_SUFFIX_LENGTH = 12 + @dataclass(eq=True, frozen=True) class InstanceID: @@ -29,7 +31,7 @@ def name(self) -> str: Returns: TODO """ - return f"{self.prefix}-{self.suffix}" + return f"{self.prefix}-{self.suffix}{'R' if self.reactive else ''}" @classmethod def build_from_name(cls, prefix: str, name: str) -> "InstanceID": @@ -51,9 +53,14 @@ def build_from_name(cls, prefix: str, name: str) -> "InstanceID": # TODO should we raise if invalid name? raise ValueError(f"Invalid runner name {name} for prefix {prefix}") + reactive = False + if len(suffix) > INSTANCE_SUFFIX_LENGTH: + suffix = suffix[:INSTANCE_SUFFIX_LENGTH] + reactive = True + return cls( prefix=prefix, - reactive=False, + reactive=reactive, suffix=suffix, ) @@ -78,8 +85,7 @@ def build(cls, prefix: str, reactive: bool = False) -> "InstanceID": Returns: Instance ID of the runner. """ - suffix = secrets.token_hex(6) - + suffix = secrets.token_hex(INSTANCE_SUFFIX_LENGTH // 2) return cls(prefix=prefix, reactive=reactive, suffix=suffix) def __str__(self) -> str: diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py index 855bbb15a..253142873 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py @@ -112,11 +112,12 @@ def __init__( ) self._labels = labels - def create_runners(self, num: int) -> tuple[InstanceID, ...]: + def create_runners(self, num: int, reactive: bool = False) -> tuple[InstanceID, ...]: """Create runners. Args: num: Number of runners to create. + reactive: TODO Returns: List of instance ID of the runners. @@ -128,7 +129,8 @@ def create_runners(self, num: int) -> tuple[InstanceID, ...]: # we have to add them manually. labels += constants.GITHUB_DEFAULT_LABELS create_runner_args = [ - RunnerManager._CreateRunnerArgs(self._cloud, self._github, labels) for _ in range(num) + RunnerManager._CreateRunnerArgs(self._cloud, self._github, labels, reactive) + for _ in range(num) ] return RunnerManager._spawn_runners(create_runner_args) @@ -382,11 +384,13 @@ class _CreateRunnerArgs: cloud_runner_manager: For managing the cloud instance of the runner. github_runner_manager: To manage self-hosted runner on the GitHub side. labels: List of labels to add to the runners. + reactive: TODO. """ cloud_runner_manager: CloudRunnerManager github_runner_manager: GitHubRunnerManager labels: list[str] + reactive: bool @staticmethod def _create_runner(args: _CreateRunnerArgs) -> InstanceID: @@ -400,7 +404,7 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID: Returns: The instance ID of the runner created. """ - instance_id = InstanceID.build(args.cloud_runner_manager.name_prefix) + instance_id = InstanceID.build(args.cloud_runner_manager.name_prefix, args.reactive) registration_jittoken = args.github_runner_manager.get_registration_jittoken( instance_id, args.labels ) diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py index af48c18f0..1bcbd8f7d 100644 --- a/github-runner-manager/src/github_runner_manager/reactive/consumer.py +++ b/github-runner-manager/src/github_runner_manager/reactive/consumer.py @@ -192,7 +192,7 @@ def _spawn_runner( if _check_job_been_picked_up(job_url=job_url, github_client=github_client): msg.ack() return - instance_ids = runner_manager.create_runners(1) + instance_ids = runner_manager.create_runners(1, reactive=True) if not instance_ids: logger.error("Failed to spawn a runner. Will reject the message.") msg.reject(requeue=True) diff --git a/github-runner-manager/tests/unit/manager/test_models.py b/github-runner-manager/tests/unit/manager/test_models.py new file mode 100644 index 000000000..a62c802a8 --- /dev/null +++ b/github-runner-manager/tests/unit/manager/test_models.py @@ -0,0 +1,64 @@ +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Unit test for the manager models.""" + +import pytest + +from github_runner_manager.manager.models import InstanceID + + +def test_new_instance_id(): + """ + arrange: TODO. + act: TODO + assert: TODO + """ + prefix = "theprefix" + instance_id = InstanceID.build("theprefix") + + assert instance_id.name == str(instance_id) + assert instance_id.prefix == prefix + assert not instance_id.reactive + assert instance_id.name.startswith(prefix) + assert instance_id.name.startswith(prefix) + + +@pytest.mark.parametrize( + "reactive", + [ + pytest.param(True, id="reactive job name"), + pytest.param(False, id="non reactive job name"), + ], +) +def test_build_instance_id_from_name(reactive): + """ + arrange: TODO. + act: TODO + assert: TODO + """ + prefix = "theprefix" + instance_id = InstanceID.build(prefix, reactive) + + name = instance_id.name + new_instance_id = InstanceID.build_from_name(prefix, name) + + assert new_instance_id == instance_id + assert new_instance_id.name == instance_id.name + assert new_instance_id.suffix == instance_id.suffix + assert new_instance_id.reactive == reactive + + +def test_build_instance_id_from_name_fails_with_wrong_prefix(): + """ + arrange: TODO. + act: TODO + assert: TODO + """ + prefix = "theprefix" + instance_id = InstanceID.build(prefix) + + name = "wrong" + instance_id.name + + with pytest.raises(ValueError): + _ = InstanceID.build_from_name(prefix, name) diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index d4d18c42f..491c62f67 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -70,7 +70,7 @@ def test_consume(labels: Labels, supported_labels: Labels, queue_config: QueueCo supported_labels=supported_labels, ) - runner_manager_mock.create_runners.assert_called_once_with(1) + runner_manager_mock.create_runners.assert_called_once_with(1, reactive=True) _assert_queue_is_empty(queue_config.queue_name) From bb48d573a0a3936016da44bf9113c2d85a33e2b9 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 17:28:01 +0100 Subject: [PATCH 07/13] missing bits --- .../src/github_runner_manager/manager/models.py | 2 ++ .../src/github_runner_manager/manager/runner_manager.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py index b7e92e31f..7c4f28fda 100644 --- a/github-runner-manager/src/github_runner_manager/manager/models.py +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -13,6 +13,8 @@ class InstanceID: """TODO. + This class needs to add compatibility for all cloud providers and GitHub. + Attributes: name: TODO prefix: TODO diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py index 253142873..3fb7d494d 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py @@ -351,16 +351,16 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri job_metrics = github_metrics.job( github_client=self._github.github, pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, + runner_name=extracted_metrics.instance_id.name, ) except GithubMetricsError: logger.exception( - "Failed to calculate job metrics for %s", extracted_metrics.runner_name + "Failed to calculate job metrics for %s", extracted_metrics.instance_id, ) else: logger.debug( "No pre-job metrics found for %s, will not calculate job metrics.", - extracted_metrics.runner_name, + extracted_metrics.instance_id, ) issued_events = runner_metrics.issue_events( From fdd9b71ade8b87e4e66305f284b89f1ed35633d7 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 17:33:19 +0100 Subject: [PATCH 08/13] fix linting --- .../src/github_runner_manager/manager/runner_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py index 3fb7d494d..9a4ee5163 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_manager.py @@ -355,7 +355,8 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri ) except GithubMetricsError: logger.exception( - "Failed to calculate job metrics for %s", extracted_metrics.instance_id, + "Failed to calculate job metrics for %s", + extracted_metrics.instance_id, ) else: logger.debug( From cbde5c90515b5fe666d6907408a264efed18d3a4 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 17:58:55 +0100 Subject: [PATCH 09/13] capital letters not valid --- .../src/github_runner_manager/manager/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py index 7c4f28fda..e9a06b118 100644 --- a/github-runner-manager/src/github_runner_manager/manager/models.py +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -33,7 +33,7 @@ def name(self) -> str: Returns: TODO """ - return f"{self.prefix}-{self.suffix}{'R' if self.reactive else ''}" + return f"{self.prefix}-{self.suffix}{'r' if self.reactive else ''}" @classmethod def build_from_name(cls, prefix: str, name: str) -> "InstanceID": From 5b235aa2574b5918475bcc03df34001ca2e957c2 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Fri, 28 Feb 2025 19:37:20 +0100 Subject: [PATCH 10/13] fix tests --- .../openstack_cloud/openstack_runner_manager.py | 10 +++++----- .../openstack_cloud/test_openstack_runner_manager.py | 11 ++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index ff7d22233..cc14b5484 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -320,9 +320,9 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: logger.debug("Getting runner healths for cleanup.") runners = self._get_runners_health() - healthy_runner_names = {runner.instance_id.name for runner in runners.healthy} - unhealthy_runner_names = {runner.instance_id.name for runner in runners.unhealthy} - unknown_runner_names = {runner.instance_id.name for runner in runners.unknown} + healthy_runner_names = {runner.instance_id for runner in runners.healthy} + unhealthy_runner_names = {runner.instance_id for runner in runners.unhealthy} + unknown_runner_names = {runner.instance_id for runner in runners.unknown} logger.debug("Healthy runners: %s", healthy_runner_names) logger.debug("Unhealthy runners: %s", unhealthy_runner_names) logger.debug("Unknown health runners: %s", unknown_runner_names) @@ -344,8 +344,8 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: @staticmethod def _cleanup_extract_metrics( metrics_storage_manager: StorageManager, - ignore_runner_names: set[str], - include_runner_names: set[str], + ignore_runner_names: set[InstanceID], + include_runner_names: set[InstanceID], ) -> Iterator[runner_metrics.RunnerMetrics]: """Extract metrics for certain runners and dangling metrics storage. diff --git a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py index 3a688957d..5b2b6af4e 100644 --- a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py +++ b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py @@ -133,10 +133,10 @@ def runner_metrics_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: ], ) def test__cleanup_extract_metrics( - healthy_runner_names: set[str], - unhealthy_runner_names: set[str], - undecided_runner_storage: set[tuple[str, datetime]], - expected_storage_to_be_extracted: set[str], + healthy_runner_names: set[InstanceID], + unhealthy_runner_names: set[InstanceID], + undecided_runner_storage: set[tuple[InstanceID, datetime]], + expected_storage_to_be_extracted: set[InstanceID], monkeypatch: pytest.MonkeyPatch, runner_metrics_mock: MagicMock, ): @@ -219,7 +219,8 @@ def test_cleanup_ignores_runners_with_health_check_errors( if instance_id.startswith("unhealthy"): openstack_cloud_mock.delete_instance.assert_any_call(instance_id) assert runner_metrics_mock.extract.call_count == 1 - assert runner_metrics_mock.extract.call_args[1]["runners"] == { + + assert {r.name for r in runner_metrics_mock.extract.call_args[1]["runners"]} == { names for names in names if names.startswith(f"{OPENSTACK_INSTANCE_PREFIX}-unhealthy") } From b67c4c5542d633546c38d486b08d01cb970b0fd6 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Mon, 3 Mar 2025 08:12:25 +0100 Subject: [PATCH 11/13] make test pass --- .../manager/runner_scaler.py | 2 +- .../openstack_cloud/openstack_cloud.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py index 9632df516..7e984471c 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py @@ -261,7 +261,7 @@ def reconcile(self) -> int: metric_stats = {} start_timestamp = time.time() - expected_runner_quantity = self._base_quantity + expected_runner_quantity = None if self._reactive_config else self._base_quantity # TODO THE NEW STEPS MAY BE AS FOLLOWS: diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py index bfca8cbf1..2ab239c32 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py @@ -255,7 +255,7 @@ def delete_instance(self, instance_id: InstanceID) -> None: with _get_openstack_connection(credentials=self._credentials) as conn: self._delete_instance(conn, instance_id) - def _delete_instance(self, conn: OpenstackConnection, instance_id: str) -> None: + def _delete_instance(self, conn: OpenstackConnection, instance_id: InstanceID) -> None: """Delete a openstack instance. Raises: @@ -400,7 +400,7 @@ def _cleanup_openstack_keypairs( if str(key.name) in exclude_instance_set: continue try: - self._delete_keypair(conn, key.name) + self._delete_keypair(conn, InstanceID.build_from_name(self.prefix, key.name)) except openstack.exceptions.SDKException: logger.warning( "Unable to delete OpenStack keypair associated with deleted key file %s ", @@ -502,22 +502,22 @@ def _setup_keypair( key_path.chmod(0o400) return keypair - def _delete_keypair(self, conn: OpenstackConnection, name: str) -> None: + def _delete_keypair(self, conn: OpenstackConnection, instance_id: InstanceID) -> None: """Delete OpenStack keypair. Args: conn: The connection object to access OpenStack cloud. - name: The name of the keypair. + instance_id: The name of the keypair. """ - logger.debug("Deleting keypair for %s", name) + logger.debug("Deleting keypair for %s", instance_id) try: # Keypair have unique names, access by ID is not needed. - if not conn.delete_keypair(name): - logger.warning("Unable to delete keypair for %s", name) + if not conn.delete_keypair(instance_id.name): + logger.warning("Unable to delete keypair for %s", instance_id) except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning("Unable to delete keypair for %s", name, stack_info=True) + logger.warning("Unable to delete keypair for %s", instance_id, stack_info=True) - key_path = self._get_key_path(name) + key_path = self._get_key_path(instance_id.name) key_path.unlink(missing_ok=True) @staticmethod From 462482a7428fc48ff8eb90fbc541fdc9b5ef1c16 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Mon, 3 Mar 2025 11:49:11 +0100 Subject: [PATCH 12/13] do not use extra character for reactive --- .../src/github_runner_manager/manager/models.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py index e9a06b118..ca6cd5000 100644 --- a/github-runner-manager/src/github_runner_manager/manager/models.py +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -33,7 +33,9 @@ def name(self) -> str: Returns: TODO """ - return f"{self.prefix}-{self.suffix}{'r' if self.reactive else ''}" + # Having a not number as a separator is good, as the prefix should end + # with a number (it is the unit number). + return f"{self.prefix}{'r' if self.reactive else '-'}{self.suffix}" @classmethod def build_from_name(cls, prefix: str, name: str) -> "InstanceID": @@ -50,15 +52,20 @@ def build_from_name(cls, prefix: str, name: str) -> "InstanceID": TODO """ if name.startswith(prefix): - suffix = name.removeprefix(f"{prefix}-") + suffix = name.removeprefix(f"{prefix}") else: # TODO should we raise if invalid name? raise ValueError(f"Invalid runner name {name} for prefix {prefix}") - reactive = False - if len(suffix) > INSTANCE_SUFFIX_LENGTH: - suffix = suffix[:INSTANCE_SUFFIX_LENGTH] + # The separator from prefix and suffix may indicate if the runner is reactive. + separator = suffix[:1] + if separator == "r": reactive = True + elif separator == "-": + reactive = False + else: + raise ValueError(f"Invalid runner name {name} for prefix {prefix}") + suffix = suffix[1:] return cls( prefix=prefix, From 298633f517bc865f6c235e0d0456a30538943686 Mon Sep 17 00:00:00 2001 From: Javier de la Puente Date: Mon, 3 Mar 2025 13:54:28 +0100 Subject: [PATCH 13/13] a few bugs about that may include unit-11 in unit-1. --- .../manager/github_runner_manager.py | 12 +++---- .../github_runner_manager/manager/models.py | 32 ++++++++++++++++--- .../manager/runner_scaler.py | 4 +-- .../openstack_cloud/openstack_cloud.py | 18 ++++------- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py index b7b0fc0c1..e0930a6b7 100644 --- a/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/github_runner_manager.py @@ -73,7 +73,11 @@ def get_runners( Information on the runners. """ runner_list = self.github.get_runner_github_info(self._path) - runner_list = [runner for runner in runner_list if runner.name.startswith(self._prefix)] + runner_list = [ + runner + for runner in runner_list + if InstanceID.name_has_prefix(self._prefix, runner.name) + ] if states is None: return tuple(runner_list) @@ -120,9 +124,7 @@ def get_removal_token(self) -> str: return self.github.get_runner_remove_token(self._path) @staticmethod - def _is_runner_in_state( - runner: SelfHostedRunner, states: set[GitHubRunnerState] | None - ) -> bool: + def _is_runner_in_state(runner: SelfHostedRunner, states: set[GitHubRunnerState]) -> bool: """Check that the runner is in one of the states provided. Args: @@ -132,6 +134,4 @@ def _is_runner_in_state( Returns: True if the runner is in one of the state, else false. """ - if states is None: - return True return GitHubRunnerState.from_runner(runner) in states diff --git a/github-runner-manager/src/github_runner_manager/manager/models.py b/github-runner-manager/src/github_runner_manager/manager/models.py index ca6cd5000..5f8779c8d 100644 --- a/github-runner-manager/src/github_runner_manager/manager/models.py +++ b/github-runner-manager/src/github_runner_manager/manager/models.py @@ -51,20 +51,17 @@ def build_from_name(cls, prefix: str, name: str) -> "InstanceID": Returns: TODO """ - if name.startswith(prefix): + if InstanceID.name_has_prefix(prefix, name): suffix = name.removeprefix(f"{prefix}") else: # TODO should we raise if invalid name? raise ValueError(f"Invalid runner name {name} for prefix {prefix}") # The separator from prefix and suffix may indicate if the runner is reactive. + reactive = False separator = suffix[:1] if separator == "r": reactive = True - elif separator == "-": - reactive = False - else: - raise ValueError(f"Invalid runner name {name} for prefix {prefix}") suffix = suffix[1:] return cls( @@ -97,6 +94,31 @@ def build(cls, prefix: str, reactive: bool = False) -> "InstanceID": suffix = secrets.token_hex(INSTANCE_SUFFIX_LENGTH // 2) return cls(prefix=prefix, reactive=reactive, suffix=suffix) + @staticmethod + def name_has_prefix(prefix: str, name: str) -> bool: + """TODO. + + TODO THIS CHECKS THE DIFFERENCE BETWEEN + name-1-suffix + and + namd-11-suffix + that is a bug in many places now. + + name-11 is not a name in the prefix name-11 + + Args: + prefix: TODO + name: TODO + + Returns: + TODO + """ + if name.startswith(prefix): + suffix = name.removeprefix(f"{prefix}") + if suffix[:1] in ("-", "r"): + return True + return False + def __str__(self) -> str: """TODO. diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py index 7e984471c..f7790c9bb 100644 --- a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py +++ b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py @@ -95,7 +95,7 @@ class _ReconcileMetricData: metric_stats: IssuedMetricEventsStats runner_list: tuple[RunnerInstance] flavor: str - expected_runner_quantity: int | None = None + expected_runner_quantity: int class RunnerScaler: @@ -261,7 +261,7 @@ def reconcile(self) -> int: metric_stats = {} start_timestamp = time.time() - expected_runner_quantity = None if self._reactive_config else self._base_quantity + expected_runner_quantity = self._base_quantity # TODO THE NEW STEPS MAY BE AS FOLLOWS: diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py index 2ab239c32..0a4debf6e 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py @@ -62,15 +62,7 @@ def __init__(self, server: OpenstackServer, prefix: str): Args: server: The OpenStack server. prefix: The name prefix for the servers. - - Raises: - ValueError: Provided server should not be managed under this prefix. """ - if not server.name.startswith(f"{prefix}-"): - # Should never happen. - raise ValueError( - f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" - ) self.addresses = [ address["addr"] for network_addresses in server.addresses.values() @@ -374,7 +366,11 @@ def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: deleted = 0 for path in self._ssh_key_dir.iterdir(): # Find key file from this application. - if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): + if ( + path.is_file() + and InstanceID.name_has_prefix(self.prefix, path.name) + and path.name.endswith(".key") + ): total += 1 if path in exclude_filename: continue @@ -396,7 +392,7 @@ def _cleanup_openstack_keypairs( keypairs = conn.list_keypairs() for key in keypairs: # The `name` attribute is of resource.Body type. - if key.name and str(key.name).startswith(self.prefix): + if key.name and InstanceID.name_has_prefix(self.prefix, key.name): if str(key.name) in exclude_instance_set: continue try: @@ -419,7 +415,7 @@ def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[Openstack return tuple( server for server in cast(list[OpenstackServer], conn.list_servers()) - if server.name.startswith(f"{self.prefix}-") + if InstanceID.name_has_prefix(self.prefix, server.name) ) @staticmethod