diff --git a/cloud/aws-functions/orphaned_cloudformation.py b/cloud/aws-functions/orphaned_cloudformation.py
index fbc3f02775..a657f61818 100644
--- a/cloud/aws-functions/orphaned_cloudformation.py
+++ b/cloud/aws-functions/orphaned_cloudformation.py
@@ -3,10 +3,21 @@ import datetime
 
 import boto3
 from boto3.exceptions import Boto3Error
+from botocore.exceptions import ClientError
 from utils import get_regions_list
+from time import sleep
+
+def is_stack_to_terminate(stack, aws_region):
+    cf_client = boto3.client('cloudformation', region_name=aws_region)
+
+    try:
+        stack_desc = cf_client.describe_stacks(StackName=stack)['Stacks'][0]
+        tags = stack_desc['Tags']
+
+    except ClientError as e:
+        print(e)
+        return False
 
-def is_stack_to_terminate(stack):
-    tags = stack.tags
     tags_dict = {item['Key']: item['Value'] for item in tags}
 
     if 'team' not in tags_dict.keys() or ('team' in tags_dict.keys() and tags_dict['team'] != 'cloud'):
@@ -16,37 +27,88 @@ def is_stack_to_terminate(stack):
     stack_lifetime = float(tags_dict['delete-cluster-after-hours'])
     current_time = datetime.datetime.now().timestamp()
-    creation_time = int(stack.creation_time.timestamp())
+
+
+    creation_time = int(stack_desc['CreationTime'].timestamp())
 
     if (current_time - creation_time) / 3600 > stack_lifetime + 1:
         return True
     return False
 
 
 def get_cloudformation_to_terminate(aws_region):
-    cf_client = boto3.resource('cloudformation')
-    statuses = ['ROLLBACK_COMPLETE', 'CREATE_COMPLETE', 'UPDATE_COMPLETE', 'DELETE_FAILED']
+    cf_client = boto3.client('cloudformation', region_name=aws_region)
+
     stacks_for_deletion = []
-    cloudformation_stacks = [stack for stack in cf_client.stacks.all() if stack.stack_status in statuses]
+
+    cloudformation_stacks = [stack['StackName'] for stack in cf_client.list_stacks(StackStatusFilter=['ROLLBACK_COMPLETE', 'CREATE_COMPLETE', 'UPDATE_COMPLETE', 'DELETE_FAILED'])['StackSummaries']]
+
     if not cloudformation_stacks:
         logging.info(f"There are no cloudformation_stacks in cloud")
 
     for stack in cloudformation_stacks:
-        if is_stack_to_terminate(stack):
-            stacks_for_deletion.append(stack.name)
+        if is_stack_to_terminate(stack, aws_region):
+            stacks_for_deletion.append(stack)
 
     if not stacks_for_deletion:
         logging.info(f"There are no stacks for deletion")
     return stacks_for_deletion
 
-def delete_cloudformation_stacks(cloudformation_stack):
-    cf_client = boto3.client('cloudformation')
+def delete_stack(stack_name, aws_region):
+    cf_client = boto3.client('cloudformation', region_name=aws_region)
 
     try:
-        logging.info(f"Removing cloudformation stack: {cloudformation_stack}")
-        cf_client.delete_stack(StackName=cloudformation_stack)
-    except Boto3Error as e:
-        logging.error(f"Delete of stack failed with error: {e}")
+        # Initiate the delete operation and wait for completion, with a timeout
+        logging.info(f"Removing cloudformation stack: {stack_name}")
+        waiter_config = {
+            'Delay': 30,       # Time (in seconds) to wait between attempts
+            'MaxAttempts': 10  # Maximum number of attempts (30s * 10 = 300s or 5 minutes)
+        }
+        waiter = cf_client.get_waiter('stack_delete_complete')
+        logging.info(f"Waiting for stack {stack_name} to be deleted...")
+        cf_client.delete_stack(StackName=stack_name)
+        waiter.wait(StackName=stack_name, WaiterConfig=waiter_config)
+    except ClientError as e:
+        logging.error(f"Error deleting stack: {e}")
+
+def delete_stack_resources(stack_name, aws_region):
+    cf_client = boto3.client('cloudformation', region_name=aws_region)
+    iam_client = boto3.client('iam')
+
+    try:
+        resources = cf_client.describe_stack_resources(StackName=stack_name)
+
+        for resource in resources['StackResources']:
+            resource_id = resource['PhysicalResourceId']
+            resource_type = resource['ResourceType']
+            print(f'resource_type {resource_type} stack_name {stack_name}')
+            try:
+                print(f"Attempting to delete resource: {resource_id} of type: {resource_type}")
+                if resource_type == 'AWS::IAM::Role':
+                    iam_client.delete_role(RoleName=resource_id)
+                elif resource_type == 'AWS::IAM::Policy':
+                    iam_client.delete_policy(PolicyArn=resource_id)
+                elif resource_type == 'AWS::IAM::InstanceProfile':
+                    try:
+                        response = iam_client.get_instance_profile(InstanceProfileName=resource_id)
+                        roles = response['InstanceProfile']['Roles']
+                        if roles:
+                            for role in roles:
+                                print(f"Role attached to instance profile {resource_id}: {role['RoleName']}")
+                                iam_client.remove_role_from_instance_profile(InstanceProfileName=resource_id, RoleName=role['RoleName'])
+                        else:
+                            print(f"No roles are attached to instance profile {resource_id}.")
+                    except iam_client.exceptions.NoSuchEntityException:
+                        print(f"Instance profile {resource_id} does not exist.")
+                    except Exception as e:
+                        print(f"An error occurred: {e}")
+                    iam_client.delete_instance_profile(InstanceProfileName=resource_id)
+
+                sleep(2)  # Sleep to avoid hitting rate limits
+            except ClientError as e:
+                print(f"Failed to delete resource: {resource_id}. Error: {e}")
+    except ClientError as e:
+        print(f"Error describing stack resources: {e}")
 
 def lambda_handler(event, context):
+
     aws_regions = get_regions_list()
 
     for aws_region in aws_regions:
@@ -54,5 +116,10 @@ def lambda_handler(event, context):
         cloudformation_stacks = get_cloudformation_to_terminate(aws_region)
 
         for cloudformation_stack in cloudformation_stacks:
-            logging.info(f"Deleting cloudformation stacks.")
-            delete_cloudformation_stacks(cloudformation_stack)
+            try:
+                logging.info(f"Deleting cloudformation stack: {cloudformation_stack}")
+                delete_stack_resources(cloudformation_stack, aws_region)
+                delete_stack(cloudformation_stack, aws_region)
+            except ClientError as e:
+                logging.error(f"Failed to delete stack: {cloudformation_stack}. Error: {e}")
Error: {e}") + continue diff --git a/cloud/aws-functions/orphaned_openshift_instances.py b/cloud/aws-functions/orphaned_openshift_instances.py index 9ecdf68609..7f23a02afc 100644 --- a/cloud/aws-functions/orphaned_openshift_instances.py +++ b/cloud/aws-functions/orphaned_openshift_instances.py @@ -21,7 +21,10 @@ def is_instance_to_terminate(instance): instance_lifetime = float(tags_dict['delete-cluster-after-hours']) current_time = datetime.datetime.now().timestamp() - creation_time = instance.launch_time.timestamp() + try: + creation_time = int(tags_dict['creation-time']) + except KeyError as e: + return False if (current_time - creation_time) / 3600 > instance_lifetime: return True diff --git a/cloud/jenkins/pxc_operator_aks_latest.groovy b/cloud/jenkins/pxc_operator_aks_latest.groovy index 2a8b98ec63..57e82e9fe5 100644 --- a/cloud/jenkins/pxc_operator_aks_latest.groovy +++ b/cloud/jenkins/pxc_operator_aks_latest.groovy @@ -1,4 +1,4 @@ -location='westeurope' +location='norwayeast' tests=[] clusters=[] diff --git a/cloud/jenkins/pxc_operator_aks_version.groovy b/cloud/jenkins/pxc_operator_aks_version.groovy index 6e7fc0fa13..6f0922dcc1 100644 --- a/cloud/jenkins/pxc_operator_aks_version.groovy +++ b/cloud/jenkins/pxc_operator_aks_version.groovy @@ -33,7 +33,8 @@ void prepareNode() { } if ("$PLATFORM_VER" == "latest") { - USED_PLATFORM_VER = sh(script: "az aks get-versions --location $location --output json | jq -r '.values | max_by(.patchVersions) | .patchVersions | keys[]' | sort --version-sort | tail -1", , returnStdout: true).trim() + USED_PLATFORM_VER = "1.30" +// sh(script: "az aks get-versions --location $location --output json | jq -r '.values | max_by(.patchVersions) | .patchVersions | keys[]' | sort --version-sort | tail -1", , returnStdout: true).trim() } else { USED_PLATFORM_VER="$PLATFORM_VER" } @@ -176,8 +177,7 @@ void createCluster(String CLUSTER_SUFFIX) { --generate-ssh-keys \ --enable-cluster-autoscaler \ --outbound-type loadbalancer \ - --kubernetes-version $USED_PLATFORM_VER \ - -l $location + --kubernetes-version $USED_PLATFORM_VER az aks get-credentials --subscription eng-cloud-dev --resource-group percona-operators --name $CLUSTER_NAME-$CLUSTER_SUFFIX --overwrite-existing """ }