
Update to Ubuntu 20.04 (#332)
Signed-off-by: Sean Smith <[email protected]>
sean-smith authored May 15, 2024
1 parent 07074e6 commit 8578726
Showing 6 changed files with 37 additions and 94 deletions.
1.architectures/2.aws-parallelcluster/README.md (8 changes: 4 additions & 4 deletions)

@@ -83,11 +83,11 @@ Alternatively you can refer to these architectures for more specific use cases:

The `.yaml` templates contain placeholder variables that you need to replace before use.

-- `PLACEHOLDER_CUSTOM_AMI_ID`: if using a custom AMI then replace with the custom AMI ID (`ami-12356790abcd`).
-- `PLACEHOLDER_PUBLIC_SUBNET`: change to the id of a public subnet to host the head-node (`subnet-12356790abcd`).
-- `PLACEHOLDER_PRIVATE_SUBNET`: change to the id of a public subnet to host the compute nodes (`subnet-12356790abcd`).
+- `CUSTOM_AMI_ID`: if using a custom AMI, replace with the custom AMI ID (`ami-12356790abcd`).
+- `PUBLIC_SUBNET_ID`: change to the ID of a public subnet to host the head node (`subnet-12356790abcd`).
+- `PRIVATE_SUBNET_ID`: change to the ID of a private subnet to host the compute nodes (`subnet-12356790abcd`).
 - `PLACEHOLDER_SSH_KEY`: name of the SSH key you'd like to use to connect to the head node. You can also use AWS Systems Manager Session Manager (SSM) instead.
-- `PLACEHOLDER_CAPACITY_RESERVATION_ID`: if using a capacity reservation put the ID here (`cr-12356790abcd`).
+- `CAPACITY_RESERVATION_ID`: if using a capacity reservation, put the ID here (`cr-12356790abcd`).

In some of the templates you may need to update these placeholders:

…
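
The switch from `PLACEHOLDER_*` tokens to `${VAR}`-style placeholders means a template can be rendered with plain environment-variable substitution before cluster creation. A minimal sketch (not something this commit or the README prescribes; the template filename, IDs, cluster name, and region below are illustrative):

```bash
# Export the values referenced by the template you are using (illustrative IDs).
export PUBLIC_SUBNET_ID=subnet-12356790abcd
export PRIVATE_SUBNET_ID=subnet-0abcd1234ef567890
export CAPACITY_RESERVATION_ID=cr-12356790abcd

# Render the template (envsubst ships with gettext); template.yaml stands in
# for whichever .yaml template in this directory you picked.
envsubst < template.yaml > cluster.yaml

# Create the cluster with the AWS ParallelCluster 3.x CLI.
pcluster create-cluster \
  --cluster-name my-cluster \
  --cluster-configuration cluster.yaml \
  --region us-east-1
```

Note that `envsubst` replaces any unset variable with an empty string, so review the rendered `cluster.yaml` before creating the cluster.
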
Next changed file:
@@ -4,13 +4,11 @@
Imds:
  ImdsSupport: v2.0
Image:
-  Os: alinux2
+  Os: ubuntu2004
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
-    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
-  Ssh:
-    KeyName: PLACEHOLDER_SSH_KEY
+    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
@@ -40,21 +38,21 @@ Scheduling:
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
-          - PLACEHOLDER_PRIVATE_SUBNET
+          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true # set this to false if using a targeted ODCR
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
-            MountDir: /local_scratch # each instance has a local scratch on NVMe
+            MountDir: /scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted ODCRs. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to regroup
      # multiple reservations
      CapacityReservationTarget:
-        CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
+        CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      JobExclusiveAllocation: true # GenAI training likes to gobble all GPUs in an instance
      ComputeResources:
        - Name: distributed-ml
@@ -71,19 +69,11 @@ SharedStorage:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 and 1000
      DeploymentType: PERSISTENT_2
-  - Name: SharedEBS
-    StorageType: Ebs
-    MountDir: /apps # Store your shared apps & scripts here
-    EbsSettings:
-      VolumeType: gp3
-      Size: 200
-      Throughput: 300
-      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debug
  Dashboards:
    CloudWatch:
-      Enabled: false # provide basic dashboards
+      Enabled: true # provide basic dashboards
Next changed file:
@@ -5,15 +5,11 @@ Imds:
  ImdsSupport: v2.0

Image:
-  Os: alinux2
-  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
-
+  Os: ubuntu2004
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
-    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
-  Ssh:
-    KeyName: PLACEHOLDER_SSH_KEY
+    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
@@ -44,21 +40,21 @@ Scheduling:
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
-          - PLACEHOLDER_PRIVATE_SUBNET
+          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
-            MountDir: /local_scratch # each instance has a local scratch on NVMe
+            MountDir: /scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted ODCRs. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to regroup
      # multiple reservations
      CapacityReservationTarget:
-        CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
+        CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      JobExclusiveAllocation: true # GenAI training likes to gobble all GPUs in an instance
      ComputeResources:
        - Name: distributed-ml
@@ -71,11 +67,11 @@ Scheduling:
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
-          - PLACEHOLDER_PRIVATE_SUBNET
+          - ${PRIVATE_SUBNET_ID}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
-            MountDir: /local_scratch # each instance has a local scratch on NVMe
+            MountDir: /scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 100
      ComputeResources:
@@ -92,20 +88,11 @@ SharedStorage:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 and 1000
      DeploymentType: PERSISTENT_2
-  - Name: SharedEBS
-    StorageType: Ebs
-    MountDir: /apps # Store your shared apps & scripts here
-    EbsSettings:
-      VolumeType: gp3
-      Size: 200
-      Throughput: 300
-      Iops: 6000
-
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debug
  Dashboards:
    CloudWatch:
-      Enabled: false # provide basic dashboards
+      Enabled: true # provide basic dashboards
Next changed file:
@@ -4,14 +4,12 @@
Imds:
  ImdsSupport: v2.0
Image:
-  Os: alinux2
-  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
+  Os: ubuntu2004
+  CustomAmi: ${AMI_ID}
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
-    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
-  Ssh:
-    KeyName: PLACEHOLDER_SSH_KEY
+    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
@@ -41,21 +39,21 @@ Scheduling:
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
-          - PLACEHOLDER_PRIVATE_SUBNET
+          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
-            MountDir: /local_scratch # each instance has a local scratch on NVMe
+            MountDir: /scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted ODCRs. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to regroup
      # multiple reservations
      CapacityReservationTarget:
-        CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
+        CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      JobExclusiveAllocation: true # GenAI training likes to gobble all GPUs in an instance
      ComputeResources:
        - Name: distributed-ml
@@ -72,19 +70,11 @@ SharedStorage:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 and 1000
      DeploymentType: PERSISTENT_2
-  - Name: SharedEBS
-    StorageType: Ebs
-    MountDir: /apps # Store your shared apps & scripts here
-    EbsSettings:
-      VolumeType: gp3
-      Size: 200
-      Throughput: 300
-      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debug
  Dashboards:
    CloudWatch:
-      Enabled: false # provide basic dashboards
+      Enabled: true # provide basic dashboards
Next changed file:
@@ -4,13 +4,11 @@
Imds:
  ImdsSupport: v2.0
Image:
-  Os: alinux2
+  Os: ubuntu2004
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
-    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
-  Ssh:
-    KeyName: PLACEHOLDER_SSH_KEY
+    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
@@ -25,8 +23,6 @@ HeadNode:
      Sequence:
        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/pyxis/postinstall.sh'
-          Args:
-            - /fsx # cache enroot images on /fsx
Scheduling:
  Scheduler: slurm
  SlurmSettings:
@@ -47,21 +43,21 @@ Scheduling:
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
-          - PLACEHOLDER_PRIVATE_SUBNET
+          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true # set this to false if using a targeted ODCR
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
-            MountDir: /local_scratch # each instance has a local scratch on NVMe
+            MountDir: /scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted ODCRs. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to regroup
      # multiple reservations
      CapacityReservationTarget:
-        CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
+        CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      JobExclusiveAllocation: true # GenAI training likes to gobble all GPUs in an instance
      ComputeResources:
        - Name: distributed-ml
@@ -75,8 +71,6 @@ Scheduling:
          Sequence:
            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/pyxis/postinstall.sh'
-              Args:
-                - /fsx # cache enroot images on /fsx
SharedStorage:
  - MountDir: /fsx
    Name: fsx
@@ -85,19 +79,11 @@ SharedStorage:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 and 1000
      DeploymentType: PERSISTENT_2
-  - Name: SharedEBS
-    StorageType: Ebs
-    MountDir: /apps # Store your shared apps & scripts here
-    EbsSettings:
-      VolumeType: gp3
-      Size: 200
-      Throughput: 300
-      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debug
  Dashboards:
    CloudWatch:
-      Enabled: false # provide basic dashboards
+      Enabled: true # provide basic dashboards
Next changed file:
@@ -6,14 +6,12 @@
Imds:
  ImdsSupport: v2.0
Image:
-  Os: alinux2
+  Os: ubuntu2004
  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
-    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
-  Ssh:
-    KeyName: PLACEHOLDER_SSH_KEY
+    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
@@ -43,11 +41,11 @@ Scheduling:
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
-          - PLACEHOLDER_PRIVATE_SUBNET
+          - ${PRIVATE_SUBNET_ID}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
-            MountDir: /local_scratch # each instance has a local scratch on NVMe
+            MountDir: /scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
@@ -64,7 +62,7 @@ Scheduling:
          # assumes you are using a capacity reservation.
          # If not, comment out or remove the 2 lines below
          CapacityReservationTarget:
-            CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
+            CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      Networking:
        PlacementGroup:
          Enabled: true
@@ -77,19 +75,11 @@ SharedStorage:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 and 1000
      DeploymentType: PERSISTENT_2
-  - Name: SharedEBS
-    StorageType: Ebs
-    MountDir: /apps # Store your shared apps & scripts here
-    EbsSettings:
-      VolumeType: gp3
-      Size: 200
-      Throughput: 300
-      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debug
  Dashboards:
    CloudWatch:
-      Enabled: false # provide basic dashboards
+      Enabled: true # provide basic dashboards
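
The diffs above also drop the head node's `Ssh`/`KeyName` block, which matches the README's note that AWS Systems Manager Session Manager (SSM) can be used instead of an SSH key. A minimal sketch of connecting that way (the instance ID below is illustrative):

```bash
# Open a shell on the head node through SSM instead of SSH.
# Find the head node's instance ID with `pcluster describe-cluster`
# or in the EC2 console; the ID below is a placeholder.
aws ssm start-session --target i-0123456789abcdef0
```
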
