From 7709befb618a93709d06f5d57199ac3a6bf4c3ba Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 10 Jun 2024 09:12:09 +0200 Subject: [PATCH 1/2] issue reconciliation metrics on error --- src-docs/openstack_manager.md | 8 +- src/openstack_cloud/openstack_manager.py | 244 ++++++++++++++--------- 2 files changed, 148 insertions(+), 104 deletions(-) diff --git a/src-docs/openstack_manager.md b/src-docs/openstack_manager.md index 42c1cda39..0555ee8df 100644 --- a/src-docs/openstack_manager.md +++ b/src-docs/openstack_manager.md @@ -214,7 +214,7 @@ Construct OpenstackRunnerManager object. --- - + ### method `flush` @@ -266,12 +266,6 @@ Reconcile the quantity of runners. -**Raises:** - - - `OpenstackInstanceLaunchError`: Unable to launch OpenStack instance. - - - **Returns:** The change in number of runners. diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index d8cd2d966..4af360095 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -1219,14 +1219,63 @@ def reconcile(self, quantity: int) -> int: Args: quantity: The number of intended runners. - Raises: - OpenstackInstanceLaunchError: Unable to launch OpenStack instance. - Returns: The change in number of runners. """ start_ts = time.time() + try: + delta = self._reconcile_runners(quantity) + finally: + end_ts = time.time() + self._issue_reconciliation_metrics( + reconciliation_start_ts=start_ts, reconciliation_end_ts=end_ts + ) + + return delta + + def _reconcile_runners(self, quantity: int) -> int: + """Reconcile the number of runners. + + Args: + quantity: The number of intended runners. + + Returns: + The change in number of runners. + """ + with _create_connection(self._cloud_config) as conn: + runner_by_health = self._get_openstack_runner_status(conn) + logger.info( + "Found %s healthy runner and %s unhealthy runner", + len(runner_by_health.healthy), + len(runner_by_health.unhealthy), + ) + logger.debug("Healthy runner: %s", runner_by_health.healthy) + logger.debug("Unhealthy runner: %s", runner_by_health.unhealthy) + remove_token = self._github.get_runner_remove_token(path=self._config.path) + + self._clean_up_runners( + conn=conn, runner_by_health=runner_by_health, remove_token=remove_token + ) + + delta = self._scale( + quantity=quantity, + conn=conn, + runner_by_health=runner_by_health, + remove_token=remove_token, + ) + return delta + def _clean_up_runners( + self, conn: OpenstackConnection, runner_by_health: RunnerByHealth, remove_token: str + ) -> None: + """Clean up offline or unhealthy runners. + + Args: + conn: The openstack connection instance. + runner_by_health: The runner status grouped by health. + remove_token: The GitHub runner remove token. + + """ github_info = self.get_github_runner_info() online_runners = [runner.runner_name for runner in github_info if runner.online] offline_runners = [runner.runner_name for runner in github_info if not runner.online] @@ -1241,96 +1290,98 @@ def reconcile(self, quantity: int) -> int: logger.debug("Offline runner: %s", offline_runners) logger.debug("Busy runner: %s", busy_runners) - with _create_connection(self._cloud_config) as conn: - runner_by_health = self._get_openstack_runner_status(conn) - logger.info( - "Found %s healthy runner and %s unhealthy runner", - len(runner_by_health.healthy), - len(runner_by_health.unhealthy), - ) - logger.debug("Healthy runner: %s", runner_by_health.healthy) - logger.debug("Unhealthy runner: %s", runner_by_health.unhealthy) + healthy_runners_set = set(runner_by_health.healthy) + busy_runners_set = set(busy_runners) + busy_unhealthy_runners = set(runner_by_health.unhealthy).intersection(busy_runners_set) + if busy_unhealthy_runners: + logger.warning("Found unhealthy busy runners %s", busy_unhealthy_runners) + + # Clean up offline (SHUTOFF) runners or unhealthy (no connection/cloud-init script) + # runners. + # Possible for a healthy runner to be appear as offline for sometime as GitHub can be + # slow to update the status. + # For busy runners let GitHub decide whether the runner should be removed. + instance_to_remove = tuple( + runner + for runner in (*runner_by_health.unhealthy, *offline_runners) + if runner not in healthy_runners_set and runner not in busy_runners_set + ) + logger.debug("Removing following runners with issues %s", instance_to_remove) + self._remove_runners( + conn=conn, instance_names=instance_to_remove, remove_token=remove_token + ) + # Clean up orphan keys, e.g., If openstack instance is removed externally the key + # would not be deleted. + self._clean_up_keys_files(conn, runner_by_health.healthy) + self._clean_up_openstack_keypairs(conn, runner_by_health.healthy) - healthy_runners_set = set(runner_by_health.healthy) - busy_runners_set = set(busy_runners) - busy_unhealthy_runners = set(runner_by_health.unhealthy).intersection(busy_runners_set) - if busy_unhealthy_runners: - logger.warning("Found unhealthy busy runners %s", busy_unhealthy_runners) + def _scale( + self, + quantity: int, + conn: OpenstackConnection, + runner_by_health: RunnerByHealth, + remove_token: str, + ) -> int: + """Scale the number of runners. - # Clean up offline (SHUTOFF) runners or unhealthy (no connection/cloud-init script) - # runners. - remove_token = self._github.get_runner_remove_token(path=self._config.path) - # Possible for a healthy runner to be appear as offline for sometime as GitHub can be - # slow to update the status. - # For busy runners let GitHub decide whether the runner should be removed. - instance_to_remove = tuple( - runner - for runner in (*runner_by_health.unhealthy, *offline_runners) - if runner not in healthy_runners_set and runner not in busy_runners_set - ) - logger.debug("Removing following runners with issues %s", instance_to_remove) - self._remove_runners( - conn=conn, instance_names=instance_to_remove, remove_token=remove_token - ) - # Clean up orphan keys, e.g., If openstack instance is removed externally the key - # would not be deleted. - self._clean_up_keys_files(conn, runner_by_health.healthy) - self._clean_up_openstack_keypairs(conn, runner_by_health.healthy) - - # Get the number of OpenStack servers. - # This is not calculated due to there might be removal failures. - servers = self._get_openstack_instances(conn) - delta = quantity - len(servers) - - # Spawn new runners - if delta > 0: - # Skip this reconcile if image not present. - try: - if conn.get_image(name_or_id=IMAGE_NAME) is None: - logger.warning( - "No OpenStack runner was spawned due to image needed not found" - ) - return 0 - except openstack.exceptions.SDKException as exc: - # Will be resolved by charm integration with image build charm. - logger.exception("Multiple image named %s found", IMAGE_NAME) - raise OpenstackInstanceLaunchError( - "Multiple image found, unable to determine the image to use" - ) from exc - - logger.info("Creating %s OpenStack runners", delta) - args = [ - OpenstackRunnerManager._CreateRunnerArgs( - cloud_config=self._cloud_config, - app_name=self.app_name, - unit_num=self.unit_num, - config=self._config, - ) - for _ in range(delta) - ] - with Pool(processes=min(delta, 10)) as pool: - pool.map( - func=OpenstackRunnerManager._create_runner, - iterable=args, - ) + Args: + quantity: The number of intended runners. + conn: The openstack connection instance. + runner_by_health: The runner status grouped by health. + remove_token: The GitHub runner remove token. + + Raises: + OpenstackInstanceLaunchError: Unable to launch OpenStack instance. - elif delta < 0: - logger.info("Removing %s OpenStack runners", delta) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.healthy, - remove_token=remove_token, - num_to_remove=abs(delta), + Returns: + The change in number of runners. + """ + # Get the number of OpenStack servers. + # This is not calculated due to there might be removal failures. + servers = self._get_openstack_instances(conn) + delta = quantity - len(servers) + + # Spawn new runners + if delta > 0: + # Skip this reconcile if image not present. + try: + if conn.get_image(name_or_id=IMAGE_NAME) is None: + logger.warning("No OpenStack runner was spawned due to image needed not found") + except openstack.exceptions.SDKException as exc: + # Will be resolved by charm integration with image build charm. + logger.exception("Multiple image named %s found", IMAGE_NAME) + raise OpenstackInstanceLaunchError( + "Multiple image found, unable to determine the image to use" + ) from exc + + logger.info("Creating %s OpenStack runners", delta) + args = [ + OpenstackRunnerManager._CreateRunnerArgs( + cloud_config=self._cloud_config, + app_name=self.app_name, + unit_num=self.unit_num, + config=self._config, + ) + for _ in range(delta) + ] + with Pool(processes=min(delta, 10)) as pool: + pool.map( + func=OpenstackRunnerManager._create_runner, + iterable=args, ) - else: - logger.info("No changes to number of runners needed") - end_ts = time.time() - self._issue_reconciliation_metrics( - conn=conn, reconciliation_start_ts=start_ts, reconciliation_end_ts=end_ts + elif delta < 0: + logger.info("Removing %s OpenStack runners", delta) + self._remove_runners( + conn=conn, + instance_names=runner_by_health.healthy, + remove_token=remove_token, + num_to_remove=abs(delta), ) + else: + logger.info("No changes to number of runners needed") - return delta + return delta def flush(self) -> int: """Flush Openstack servers. @@ -1491,7 +1542,6 @@ def _issue_runner_installed_metric( def _issue_reconciliation_metrics( self, - conn: OpenstackConnection, reconciliation_start_ts: float, reconciliation_end_ts: float, ) -> None: @@ -1500,19 +1550,19 @@ def _issue_reconciliation_metrics( This includes the metrics for the runners and the reconciliation metric itself. Args: - conn: The connection object to access OpenStack cloud. reconciliation_start_ts: The timestamp of when reconciliation started. reconciliation_end_ts: The timestamp of when reconciliation ended. """ - runner_states = self._get_openstack_runner_status(conn) - - metric_stats = self._issue_runner_metrics(conn) - self._issue_reconciliation_metric( - metric_stats=metric_stats, - reconciliation_start_ts=reconciliation_start_ts, - reconciliation_end_ts=reconciliation_end_ts, - runner_states=runner_states, - ) + with _create_connection(self._cloud_config) as conn: + runner_states = self._get_openstack_runner_status(conn) + + metric_stats = self._issue_runner_metrics(conn) + self._issue_reconciliation_metric( + metric_stats=metric_stats, + reconciliation_start_ts=reconciliation_start_ts, + reconciliation_end_ts=reconciliation_end_ts, + runner_states=runner_states, + ) def _issue_runner_metrics(self, conn: OpenstackConnection) -> IssuedMetricEventsStats: """Issue runner metrics. From bc4b89bc7c9ec53a244aaae6795bee61c58a5633 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 10 Jun 2024 09:15:14 +0200 Subject: [PATCH 2/2] increase create server timeout to 5 mins --- src-docs/openstack_manager.md | 23 ++++++++++++----------- src/openstack_cloud/openstack_manager.py | 4 +++- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src-docs/openstack_manager.md b/src-docs/openstack_manager.md index 0555ee8df..add58cfe7 100644 --- a/src-docs/openstack_manager.md +++ b/src-docs/openstack_manager.md @@ -13,10 +13,11 @@ Module for handling interactions with OpenStack. - **SECURITY_GROUP_NAME** - **BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME** - **MAX_METRICS_FILE_SIZE** +- **CREATE_SERVER_TIMEOUT** --- - + ## function `build_image` @@ -56,7 +57,7 @@ Build and upload an image to OpenStack. --- - + ## function `create_instance_config` @@ -92,7 +93,7 @@ Create an instance config from charm data. --- - + ## class `InstanceConfig` The configuration values for creating a single runner instance. @@ -131,7 +132,7 @@ __init__( --- - + ## class `ProxyStringValues` Wrapper class to proxy values to string. @@ -150,7 +151,7 @@ Wrapper class to proxy values to string. --- - + ## class `OpenstackUpdateImageError` Represents an error while updating image on Openstack. @@ -161,7 +162,7 @@ Represents an error while updating image on Openstack. --- - + ## class `GithubRunnerRemoveError` Represents an error removing registered runner from Github. @@ -172,7 +173,7 @@ Represents an error removing registered runner from Github. --- - + ## class `OpenstackRunnerManager` Runner manager for OpenStack-based instances. @@ -185,7 +186,7 @@ Runner manager for OpenStack-based instances. - `unit_num`: The juju unit number. - `instance_name`: Prefix of the name for the set of runners. - + ### method `__init__` @@ -214,7 +215,7 @@ Construct OpenstackRunnerManager object. --- - + ### method `flush` @@ -231,7 +232,7 @@ Flush Openstack servers. --- - + ### method `get_github_runner_info` @@ -248,7 +249,7 @@ Get information on GitHub for the runners. --- - + ### method `reconcile` diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py index 4af360095..fb47f625a 100644 --- a/src/openstack_cloud/openstack_manager.py +++ b/src/openstack_cloud/openstack_manager.py @@ -90,6 +90,8 @@ PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" MAX_METRICS_FILE_SIZE = 1024 +CREATE_SERVER_TIMEOUT = 5 * 60 + class _PullFileError(Exception): """Represents an error while pulling a file from the runner instance.""" @@ -794,7 +796,7 @@ def _create_runner(args: _CreateRunnerArgs) -> None: security_groups=[SECURITY_GROUP_NAME], userdata=cloud_userdata_str, auto_ip=False, - timeout=120, + timeout=CREATE_SERVER_TIMEOUT, wait=True, ) except openstack.exceptions.ResourceTimeout as err: