From d0f3db0e9ac2e3ff0657d3db076fef3a5f7a049d Mon Sep 17 00:00:00 2001 From: Matthias Kotzerke Date: Fri, 26 Jul 2024 20:43:40 +0200 Subject: [PATCH 1/2] Update Docker Image Metaflow Versions to current releases Update Postgres to a more recent version that is not deprecated and therefore cheaper (no Extended Support fee). Add the FileSystemSizeBatchInstances parameter and a Launch Template for Batch that uses it, so the root volume size of Batch EC2 instances is configurable. (Had issues with running out of space after a certain number of flow steps. The default is 30 GB, the same as the AWS default size, which is also the minimum accepted value.) Use current-generation default values for the Batch compute environment instance types. Make 1 an allowed value for MinVCPUBatch and DesiredVCPUBatch. Allow g5 (GPU) instance types for the SageMaker notebook. --- aws/cloudformation/metaflow-cfn-template.yml | 40 ++++++++++++++------ 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/aws/cloudformation/metaflow-cfn-template.yml b/aws/cloudformation/metaflow-cfn-template.yml index 7e25428..56d995e 100644 --- a/aws/cloudformation/metaflow-cfn-template.yml +++ b/aws/cloudformation/metaflow-cfn-template.yml @@ -1,12 +1,12 @@ AWSTemplateFormatVersion: '2010-09-09' -Description: Stack for complete deployment of Metaflow (last-updated-date 10/26/2021) +Description: Stack for complete deployment of Metaflow (last-updated-date 07/26/2024) Parameters: SagemakerInstance: Type: String Default: ml.t2.xlarge - AllowedValues: ['ml.t2.large','ml.t2.xlarge','ml.t2.2xlarge'] - Description: 'Instance type for Sagemaker Notebook.' + AllowedValues: ['ml.t2.large','ml.t2.xlarge','ml.t2.2xlarge','ml.g5.xlarge','ml.g5.2xlarge'] + Description: 'Instance type for Sagemaker Notebook. Choose g5 instances if you need a GPU.' VPCCidr: Type: String Default: 10.20.0.0/16 @@ -19,6 +19,11 @@ Parameters: Type: String Default: 10.20.1.0/24 Description: 'CIDR for Metaflow VPC Subnet 2' + FileSystemSizeBatchInstances: + Type: Number + Default: 30 + MinValue: 30 + Description: 'File System Size in GB of launched EC2 instances through Batch.' 
MaxVCPUBatch: Type: Number Default: 64 @@ -30,18 +35,18 @@ Parameters: Default: 8 MinValue: 0 MaxValue: 16 - AllowedValues: [0,2,4,8,16] + AllowedValues: [0,1,2,4,8,16] Description: 'Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)' DesiredVCPUBatch: Type: Number Default: 8 MinValue: 0 MaxValue: 16 - AllowedValues: [0,2,4,8,16] + AllowedValues: [0,1,2,4,8,16] Description: 'Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)' ComputeEnvInstanceTypes: Type: CommaDelimitedList - Default: "c4.large,c4.xlarge,c4.2xlarge,c4.4xlarge,c4.8xlarge" + Default: "r7a.medium,r7a.large,r7a.xlarge,r7a.2xlarge,r7a.4xlarge,r7a.8xlarge,c7a.medium,c7a.large,c7a.xlarge,c7a.2xlarge,c7a.4xlarge,c7a.8xlarge" Description: "The instance types for the compute environment as a comma-separated list" CustomRole: Type: String @@ -92,7 +97,7 @@ Mappings: ServiceName: value: 'metadata-service-v2' ImageUrl: - value: 'netflixoss/metaflow_metadata_service:v2.3.5' + value: 'netflixoss/metaflow_metadata_service:v2.4.11' ContainerPort: value: 8080 ContainerCpu: @@ -113,7 +118,7 @@ Mappings: ServiceName: value: 'metaflow-ui-service' ImageUrl: - value: 'netflixoss/metaflow_metadata_service:v2.3.5' + value: 'netflixoss/metaflow_metadata_service:v2.4.11' ContainerPort: value: 8083 ContainerCpu: @@ -134,7 +139,7 @@ Mappings: ServiceName: value: 'metadata-ui-static' ImageUrl: - value: 'public.ecr.aws/outerbounds/metaflow_ui:v1.2.4' + value: 'public.ecr.aws/outerbounds/metaflow_ui:1.3.13' ContainerPort: value: 3000 ContainerCpu: @@ -596,7 +601,7 @@ Resources: DeleteAutomatedBackups: 'true' StorageType: 'gp2' Engine: 'postgres' - EngineVersion: '11' + EngineVersion: '13.15' MasterUsername: !Join ['', ['{{resolve:secretsmanager:', !Ref MyRDSSecret, ':SecretString:username}}' ]] MasterUserPassword: !Join ['', ['{{resolve:secretsmanager:', !Ref MyRDSSecret, ':SecretString:password}}' ]] 
VPCSecurityGroups: @@ -1383,8 +1388,19 @@ Resources: Condition: StringEquals: ec2:CreateAction: RunInstances + BatchLaunchTemplateMetaFlow: + Type: AWS::EC2::LaunchTemplate + Properties: + LaunchTemplateName: "BatchLaunchTemplateMetaFlow" + LaunchTemplateData: + BlockDeviceMappings: + - DeviceName: /dev/xvda + Ebs: + VolumeSize: !Ref FileSystemSizeBatchInstances + VolumeType: gp2 ComputeEnvironment: Type: AWS::Batch::ComputeEnvironment + DependsOn: BatchLaunchTemplateMetaFlow Properties: Type: MANAGED ServiceRole: !GetAtt 'BatchExecutionRole.Arn' @@ -1400,6 +1416,8 @@ Resources: InstanceRole: !If [EnableFargateOnBatch, !Ref AWS::NoValue, !GetAtt 'ECSInstanceProfile.Arn'] InstanceTypes: !If [EnableFargateOnBatch, !Ref AWS::NoValue, !Ref ComputeEnvInstanceTypes] DesiredvCpus: !If [EnableFargateOnBatch, !Ref AWS::NoValue, !Ref DesiredVCPUBatch] + LaunchTemplate: + LaunchTemplateId: !Ref BatchLaunchTemplateMetaFlow State: ENABLED JobQueue: DependsOn: ComputeEnvironment @@ -1889,4 +1907,4 @@ Outputs: LoadBalancerUIDNSName: Condition: EnableUI Description: "UI Load Balancer DNS Name" - Value: !GetAtt "LoadBalancerUI.DNSName" + Value: !GetAtt "LoadBalancerUI.DNSName" \ No newline at end of file From 91456d80d8d841b1c0c9dde03266050d784a6ddd Mon Sep 17 00:00:00 2001 From: Matthias Kotzerke Date: Sat, 27 Jul 2024 01:13:08 +0200 Subject: [PATCH 2/2] Deactivate Launch Template for Fargate Fargate compute environments must not specify a launch template, so attach it only for EC2. Pin the EC2 branch to the template's latest version so that a changed volume size is actually picked up by Batch. --- aws/cloudformation/metaflow-cfn-template.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/aws/cloudformation/metaflow-cfn-template.yml b/aws/cloudformation/metaflow-cfn-template.yml index 56d995e..5127f29 100644 --- a/aws/cloudformation/metaflow-cfn-template.yml +++ b/aws/cloudformation/metaflow-cfn-template.yml @@ -1416,8 +1416,11 @@ Resources: InstanceRole: !If [EnableFargateOnBatch, !Ref AWS::NoValue, !GetAtt 'ECSInstanceProfile.Arn'] InstanceTypes: !If [EnableFargateOnBatch, !Ref AWS::NoValue, !Ref ComputeEnvInstanceTypes] DesiredvCpus: !If [EnableFargateOnBatch, !Ref AWS::NoValue, !Ref 
DesiredVCPUBatch] - LaunchTemplate: - LaunchTemplateId: !Ref BatchLaunchTemplateMetaFlow + LaunchTemplate: !If + - EnableFargateOnBatch + - !Ref AWS::NoValue + - LaunchTemplateId: !Ref BatchLaunchTemplateMetaFlow + Version: !GetAtt BatchLaunchTemplateMetaFlow.LatestVersionNumber State: ENABLED JobQueue: DependsOn: ComputeEnvironment