From 24b965c2fd87c77ed768581505a5405b3a1bf509 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 21 Dec 2024 05:45:43 +0000 Subject: [PATCH 01/15] Initial a3u-gke-gcs example --- .../a3u-gke-gcs/README.md | 109 +++++++ .../a3u-gke-gcs/a3u-gke-gcs.yaml | 282 ++++++++++++++++++ .../a3u-gke-gcs/deployment.yaml | 37 +++ .../kueue-configuration.yaml.tftpl | 28 ++ .../a3u-gke-gcs/nccl-rdma-installer.yaml | 96 ++++++ 5 files changed, 552 insertions(+) create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/README.md create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md new file mode 100644 index 0000000000..3a21f46e13 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -0,0 +1,109 @@ +# A3-Ultra GKE + GCS Reference Design + +This reference design provides a high-performance and scalable architecture for +deploying AI/ML workloads on Google Kubernetes Engine (GKE) with Google Cloud +Storage (GCS). + +## Key Features + +* **Multi-VPC Design:** Utilizes three VPCs: two for GKE nodes and one dedicated + for GPU RDMA networks. +* **Cloud Storage Fuse Integration:** Enables seamless access to GCS buckets + from within your containers using the Cloud Storage Fuse CSI Driver. Cloud + Storage Fuse is configured to utilize the 12 TB of Local SSD +* **Hierarchical Namespace Buckets:** Leverages GCS buckets with Hierarchical + Namespace enabled, optimizing performance for checkpointing and restarting + workloads. +* **Kueue for Workload Scheduling:** Provides a robust and flexible system for + managing your AI/ML training jobs. +* **Jobset API for Tightly Coupled Workloads:** Facilitates running tightly + coupled AI/ML training jobs efficiently. + +## Deployment Steps + +1. **Build the Cluster Toolkit `gcluster` binary:** + + Follow the instructions [here](https://cloud.google.com/cluster-toolkit/docs/setup/configure-environment). + +2. **(Optional) Create a GCS Bucket for Terraform State:** + + This step is recommended for storing your Terraform state. Use the + following commands, replacing placeholders with your project details: + + ```bash + BUCKET_NAME= + PROJECT_ID= + REGION= + + gcloud storage buckets create gs://${BUCKET_NAME} \ + --project=${PROJECT_ID} \ + --default-storage-class=STANDARD \ + --location=${REGION} \ + --uniform-bucket-level-access + + gcloud storage buckets update gs://${BUCKET_NAME} --versioning + ``` + +3. 
**Create and Configure GCS Buckets:**
+
+   * Create separate GCS buckets for training data and checkpoint/restart data:
+
+   ```bash
+   PROJECT_ID=
+   REGION=
+   TRAINING_BUCKET_NAME=
+   CHECKPOINT_BUCKET_NAME=
+   PROJECT_NUMBER=
+
+   gcloud storage buckets create gs://${TRAINING_BUCKET_NAME} \
+     --location=${REGION} \
+     --uniform-bucket-level-access \
+     --enable-hierarchical-namespace
+
+   gcloud storage buckets create gs://${CHECKPOINT_BUCKET_NAME} \
+     --location=${REGION} \
+     --uniform-bucket-level-access \
+     --enable-hierarchical-namespace
+   ```
+
+   * Grant workload identity service accounts (WI SAs) access to the buckets:
+
+   ```bash
+
+   gcloud storage buckets add-iam-policy-binding gs://${TRAINING_BUCKET_NAME} \
+     --member "principal://iam.googleapis.com/projects/${PROJECT_NUMBER}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/default/sa/default" \
+     --role roles/storage.objectUser
+
+   gcloud storage buckets add-iam-policy-binding gs://${CHECKPOINT_BUCKET_NAME} \
+     --member "principal://iam.googleapis.com/projects/${PROJECT_NUMBER}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/default/sa/default" \
+     --role roles/storage.objectUser
+   ```
+
+4. **Customize Deployment Configuration:**
+
+   Modify the `deployment.yaml` file to suit your needs. This includes the
+   region/zone, node pool sizes, reservation name, and checkpoint/training
+   bucket names.
+
+5. **Deploy the Cluster:**
+
+   Use the `gcluster` tool to deploy your GKE cluster with the desired configuration:
+
+   ```bash
+   gcluster deploy -d deployment.yaml a3u-gke-gcs.yaml
+   ```
+
+## Example Workload Job
+
+Once the cluster has been deployed, `gcluster` will print instructions on how
+to get credentials for the cluster, as well as how to deploy an example
+workload. This example workload uses [fio](https://github.com/axboe/fio) to
+run a series of benchmarks against the LocalSSD and GCSFuse mounts.
+
+The instructions will look something like:
+
+```bash
+Use the following commands to:
+Submit your job:
+  kubectl create -f /primary/my-job-.yaml
+```
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
new file mode 100644
index 0000000000..c5b88ab9ae
--- /dev/null
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
@@ -0,0 +1,282 @@
+# Copyright 2024 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+blueprint_name: a3u-gke-gcs
+
+vars:
+  project_id: # Insert GCP project
+  deployment_name: # Unique name of this cluster, like a3u-gke-gcs
+  region: # Region, e.g. europe-west1
+  zone: # Zone, e.g. europe-west1-b
+
+  # Cidr block containing the IP of the machine calling terraform and kubectl
+  # The value can be more specific if the IPs are known which will run kubectl
+  # e.g. the local system running Terraform or a remote node
+  authorized_cidr: 0.0.0.0/0
+  extended_reservation: # Reservation name, e.g. 
//reservationBlocks/ + + nccl_installer_path: $(ghpc_stage("./nccl-rdma-installer.yaml")) + mtu_size: 8896 + static_node_count: # Number of A3-Ultra nodes, e.g. 2 + # Number of H200 GPUs (for later use by Kueue), which + # should be 8 x `static_node_count` + num_gpus: + training_bucket_name: # Name of bucket that holds training data + checkpoint_bucket_name: # Name of bucket used for checkpoints + system_node_pool_disk_size_gb: 200 + a3ultra_node_pool_disk_size_gb: 100 + +deployment_groups: +- group: primary + modules: + - id: gke-a3-ultra-net-0 + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-net-0 + mtu: 8896 + subnetworks: + - subnet_name: $(vars.deployment_name)-sub-0 + subnet_region: $(vars.region) + subnet_ip: 192.168.0.0/18 + secondary_ranges_list: + - subnetwork_name: $(vars.deployment_name)-sub-0 + ranges: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke-a3-ultra-net-1 + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-net-1 + mtu: $(vars.mtu_size) + subnetworks: + - subnet_name: gke-a3u-gcs-sub-1 + subnet_region: $(vars.region) + subnet_ip: 192.168.64.0/18 + + - id: gke-a3-ultra-rdma-net + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe + settings: + network_name: $(vars.deployment_name)-rdma-net + mtu: $(vars.mtu_size) + network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce + network_routing_mode: REGIONAL + subnetworks_template: + name_prefix: $(vars.deployment_name)-rdma-sub + count: 8 + ip_range: 192.168.128.0/18 + region: $(vars.region) + + - id: a3-ultragpu-cluster + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b + use: [gke-a3-ultra-net-0] + settings: + release_channel: RAPID + system_node_pool_machine_type: "e2-standard-16" + system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb) + system_node_pool_taints: [] + enable_dcgm_monitoring: true + enable_gcsfuse_csi: true + enable_private_endpoint: false # Allows access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine to run the kubectl command. Required for multi network setup. 
+ display_name: "kubectl-access-network" + maintenance_exclusions: + - name: no-minor-or-node-upgrades-indefinite + start_time: "2024-12-01T00:00:00Z" + end_time: "2025-12-22T00:00:00Z" + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES + additional_networks: + $(concat( + [{ + network=gke-a3-ultra-net-1.network_name, + subnetwork=gke-a3-ultra-net-1.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="GVNIC", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }], + gke-a3-ultra-rdma-net.subnetwork_interfaces_gke + )) + outputs: [instructions] + + - id: a3-ultragpu-pool + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b + use: [a3-ultragpu-cluster] + settings: + machine_type: a3-ultragpu-8g + auto_upgrade: true + zones: [$(vars.zone)] + disk_type: hyperdisk-balanced + disk_size_gb: $(vars.a3ultra_node_pool_disk_size_gb) + static_node_count: $(vars.static_node_count) + guest_accelerator: + - type: nvidia-h200-141gb + count: 8 + gpu_driver_installation_config: + gpu_driver_version: "LATEST" + reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: $(vars.extended_reservation) + additional_networks: + $(concat( + [{ + network=gke-a3-ultra-net-1.network_name, + subnetwork=gke-a3-ultra-net-1.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="GVNIC", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }], + gke-a3-ultra-rdma-net.subnetwork_interfaces_gke + )) + outputs: [instructions] + + - id: topology-aware-scheduler-install + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b + use: [a3-ultragpu-cluster] + + # Install Kueue, Jobset, and NCCL installer + - id: workload-manager-install + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b + use: [a3-ultragpu-cluster] + settings: + kueue: + install: true + version: v0.9.1 + config_path: $(ghpc_stage("kueue-configuration.yaml.tftpl")) + config_template_vars: + num_gpus: $(vars.num_gpus) + jobset: + install: true + version: v0.7.1 + apply_manifests: + - source: $(vars.nccl_installer_path) + + # Create a remote mount of $(vars.training_bucket_name) + # using mount options optimized for reading training + # data. + - id: gcs-training + source: modules/file-system/pre-existing-network-storage + settings: + remote_mount: $(vars.training_bucket_name) + local_mount: /training-data + fs_type: gcsfuse + mount_options: "implicit-dirs, metadata-cache:ttl-secs:-1, metadata-cache:stat-cache-max-size-mb:-1, metadata-cache:type-cache-max-size-mb:-1, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:-1" + + # Create a remote mount of $(vars.checkpoint_bucket_name) + # using mount options optimized for writing and reading + # checkpoint data. 
+ - id: gcs-checkpointing + source: modules/file-system/pre-existing-network-storage + settings: + remote_mount: $(vars.checkpoint_bucket_name) + local_mount: /checkpoint-data + fs_type: gcsfuse + mount_options: "implicit-dirs, metadata-cache:ttl-secs:0, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:0, file-cache:enable-parallel-downloads:true, rename-dir-limit=200000" + + # Persistent Volume for training data + - id: training-pv + source: modules/file-system/gke-persistent-volume + use: [gcs-training, a3-ultragpu-cluster] + settings: + gcs_bucket_name: $(vars.training_bucket_name) + capacity_gb: 1000000 + + # Persistent Volume for checkpoint data + - id: checkpointing-pv + source: modules/file-system/gke-persistent-volume + use: [gcs-checkpointing, a3-ultragpu-cluster] + settings: + gcs_bucket_name: $(vars.checkpoint_bucket_name) + capacity_gb: 1000000 + + # This is an example job that will install and run an `fio` + # benchmark against the training and checkpointing buckets. + - id: fio-bench-job-template + source: modules/compute/gke-job-template + use: [checkpointing-pv, training-pv, a3-ultragpu-pool] + settings: + ephemeral_volumes: + - type: local-ssd + mount_path: /scratch-data + size_gb: 1000 # Use 1 out of 12 TB for local scratch + + k8s_service_account_name: default + image: ubuntu:latest + + command: + - bash + - -c + - | + + set -eux + export DEBIAN_FRONTEND=noninteractive + + # Install fio + apt update -y && apt install -y fio + + # Use a tag to create a unique path for tests + TAG=`date +%s` + + # Verify mountpoints + df -h + mountpoint /scratch-data + mountpoint /checkpoint-data + mountpoint /training-data + + # Create temporary directory for fio benchmarks + mkdir -p /{scratch,training,checkpoint}-data/fio-benchmarks-${TAG} + + # The following will take roughly 10 minutes to complete + + # Perform scratch data write performance test + fio --ioengine=libaio --filesize=10G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/scratch-data/fio-benchmarks-${TAG} \ + --name=scratch --blocksize=100m --iodepth=64 --readwrite=write + + # Perform training data reading performance test + fio --ioengine=libaio --filesize=1G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/training-data/fio-benchmarks-${TAG} \ + --name=training --blocksize=1m --iodepth=64 --readwrite=randread + + # Perform checkpoint data writing performance test + fio --ioengine=libaio --filesize=10G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/checkpoint-data/fio-benchmarks-${TAG} \ + --name=checkpoint --blocksize=100m --iodepth=64 --readwrite=write + + # Perform checkpoint data reading performance test + fio --ioengine=libaio --filesize=10G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/checkpoint-data/fio-benchmarks-${TAG} \ + --name=checkpoint --blocksize=100m --iodepth=64 --readwrite=read + + # Clean up temporary directories for fio benchmarks + rm -rf /{scratch-training,checkpoint}-data/fio-benchmarks-${TAG} + + outputs: [instructions] diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml new file mode 100644 index 
0000000000..0d9966b3e0 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml @@ -0,0 +1,37 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform_backend_defaults: + type: gcs + configuration: + bucket: + +vars: + project_id: + # This should be unique across all of your Cluster + # Toolkit Deployments. + deployment_name: a3u-gke-gcs + region: + zone: + static_node_count: + # This should be 8 x static_node_count. + num_gpus: + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + # e.g. the local system running Terraform or a remote node + # To allow all (IAM restrictions still enforced), use 0.0.0.0/0 + authorized_cidr: + extended_reservation: + training_bucket_name: + checkpoint_bucket_name: diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl new file mode 100644 index 0000000000..97ae9d91f6 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl @@ -0,0 +1,28 @@ +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: "default-flavor" +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: "cluster-queue" +spec: + namespaceSelector: {} + resourceGroups: + - coveredResources: ["nvidia.com/gpu"] + flavors: + - name: "default-flavor" + resources: + - name: "nvidia.com/gpu" + nominalQuota: ${num_gpus} + +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "default" + name: "local-queue" +spec: + clusterQueue: "cluster-queue" diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml new file mode 100644 index 0000000000..486255755a --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml @@ -0,0 +1,96 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
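+
+# A note on what this DaemonSet does: on every node carrying H200 GPUs it
+# copies the gIB NCCL plugin and RDMA libraries from the installer image onto
+# the host (under /home/kubernetes/bin), where workload pods can mount them,
+# and an init container quiets log_martians on the RDMA interfaces.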
+ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nccl-rdma-installer + namespace: kube-system + labels: + k8s-app: nccl-rdma-installer +spec: + selector: + matchLabels: + k8s-app: nccl-rdma-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nccl-rdma-installer + k8s-app: nccl-rdma-installer + spec: + priorityClassName: system-node-critical + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: In + values: + - nvidia-h200-141gb + tolerations: + - operator: "Exists" + hostNetwork: true + hostPID: true + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia/lib64 + type: DirectoryOrCreate + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + initContainers: + - name: disable-log-martian + image: alpine:latest + command: ["/bin/sh"] + securityContext: + privileged: true + args: + - -c + - | + sysctl -w net.ipv4.conf.eth2.log_martians=0 + sysctl -w net.ipv4.conf.eth3.log_martians=0 + sysctl -w net.ipv4.conf.eth4.log_martians=0 + sysctl -w net.ipv4.conf.eth5.log_martians=0 + sysctl -w net.ipv4.conf.eth6.log_martians=0 + sysctl -w net.ipv4.conf.eth7.log_martians=0 + sysctl -w net.ipv4.conf.eth8.log_martians=0 + sysctl -w net.ipv4.conf.eth9.log_martians=0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: nccl-rdma-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64 + - name: gib + mountPath: /usr/local/home/kubernetes/bin/gib + command: ["/bin/sh", "-c"] + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 + cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib + ibv_devinfo || exit 1 + echo "installation finishes" + containers: + - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" + name: pause From 437027a50fde0c1f8b5936f7df291fdd3d765868 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 21 Dec 2024 07:35:22 +0000 Subject: [PATCH 02/15] Pass mount_options through to GKE PV --- modules/file-system/gke-persistent-volume/main.tf | 11 +++++++---- .../gke-persistent-volume/templates/gcs-pv.yaml.tftpl | 6 +++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index d12c5d6d39..34602b32aa 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -35,6 +35,8 @@ locals { pv_name = "${local.base_name}-pv" pvc_name = "${local.base_name}-pvc" + list_mount_options = split(",", var.network_storage.mount_options) + filestore_pv_contents = templatefile( "${path.module}/templates/filestore-pv.yaml.tftpl", { @@ -61,10 +63,11 @@ locals { gcs_pv_contents = templatefile( "${path.module}/templates/gcs-pv.yaml.tftpl", { - pv_name = local.pv_name - capacity = "${var.capacity_gb}Gi" - labels = local.labels - bucket_name = local.is_gcs ? var.gcs_bucket_name : "" + pv_name = local.pv_name + capacity = "${var.capacity_gb}Gi" + labels = local.labels + mount_options = local.is_gcs ? local.list_mount_options : null + bucket_name = local.is_gcs ? 
var.gcs_bucket_name : "" } ) diff --git a/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl b/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl index 5a1fde209e..aa0e570a8b 100644 --- a/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl +++ b/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl @@ -13,8 +13,12 @@ spec: storage: ${capacity} accessModes: - ReadWriteMany + %{~ if mount_options != null ~} mountOptions: - - implicit-dirs + %{~ for key in mount_options ~} + - ${key} + %{~ endfor ~} + %{~ endif ~} csi: driver: gcsfuse.csi.storage.gke.io volumeHandle: ${bucket_name} From 2b82d68ec934e0ac046934814cac70bdee4f900a Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 21 Dec 2024 07:44:37 +0000 Subject: [PATCH 03/15] Use folded style for mount options --- .../a3u-gke-gcs/a3u-gke-gcs.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index c5b88ab9ae..131b6590bf 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -184,7 +184,14 @@ deployment_groups: remote_mount: $(vars.training_bucket_name) local_mount: /training-data fs_type: gcsfuse - mount_options: "implicit-dirs, metadata-cache:ttl-secs:-1, metadata-cache:stat-cache-max-size-mb:-1, metadata-cache:type-cache-max-size-mb:-1, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:-1" + mount_options: >- + implicit-dirs, + metadata-cache:ttl-secs:-1, + metadata-cache:stat-cache-max-size-mb:-1, + metadata-cache:type-cache-max-size-mb:-1, + file-cache:max-size-mb:-1, + file-cache:cache-file-for-range-read:true, + file-system:kernel-list-cache-ttl-secs:-1 # Create a remote mount of $(vars.checkpoint_bucket_name) # using mount options optimized for writing and reading @@ -195,7 +202,14 @@ deployment_groups: remote_mount: $(vars.checkpoint_bucket_name) local_mount: /checkpoint-data fs_type: gcsfuse - mount_options: "implicit-dirs, metadata-cache:ttl-secs:0, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:0, file-cache:enable-parallel-downloads:true, rename-dir-limit=200000" + mount_options: >- + implicit-dirs, + metadata-cache:ttl-secs:0, + file-cache:max-size-mb:-1, + file-cache:cache-file-for-range-read:true, + file-system:kernel-list-cache-ttl-secs:0, + file-cache:enable-parallel-downloads:true, + rename-dir-limit=200000 # Persistent Volume for training data - id: training-pv From f0b877be55e16677d372b3de06f096bdef25478c Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Mon, 6 Jan 2025 22:52:27 +0000 Subject: [PATCH 04/15] Address comments/defaults, update nccl-plugin version --- .../a3u-gke-gcs/README.md | 2 +- .../a3u-gke-gcs/a3u-gke-gcs.yaml | 67 +++++++++++++------ .../a3u-gke-gcs/deployment.yaml | 22 +++++- .../kueue-configuration.yaml.tftpl | 47 ++++++++++--- .../a3u-gke-gcs/nccl-rdma-installer.yaml | 2 +- 5 files changed, 106 insertions(+), 34 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md index 3a21f46e13..7669903cf0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -13,7 +13,7 @@ Storage (GCS). 
Storage Fuse is configured to utilize the 12 TB of Local SSD
 * **Hierarchical Namespace Buckets:** Leverages GCS buckets with Hierarchical
   Namespace enabled, optimizing performance for checkpointing and restarting
-  workloads.
+  workloads. (Requires GKE 1.31 or later).
 * **Kueue for Workload Scheduling:** Provides a robust and flexible system for
   managing your AI/ML training jobs.
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
index 131b6590bf..8c844dfd5b 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
@@ -15,28 +15,46 @@
 blueprint_name: a3u-gke-gcs
 
 vars:
-  project_id: # Insert GCP project
-  deployment_name: # Unique name of this cluster, like a3u-gke-gcs
-  region: # Region, e.g. europe-west1
-  zone: # Zone, e.g. europe-west1-b
+  # The following variables should be over-written in the deployment.yaml file.
+  # Your GCP Project ID
+  project_id:
 
-  # Cidr block containing the IP of the machine calling terraform and kubectl
-  # The value can be more specific if the IPs are known which will run kubectl
-  # e.g. the local system running Terraform or a remote node
-  authorized_cidr: 0.0.0.0/0
-  extended_reservation: # Reservation name, e.g. //reservationBlocks/
+  # This should be unique across all of your Cluster
+  # Toolkit Deployments.
+  deployment_name: a3u-gke-gcs
+
+  # The GCP Region used for this deployment.
+  region:
+
+  # The GCP Zone used for this deployment.
+  zone:
+
+  # The number of nodes to be created
+  static_node_count:
 
-  nccl_installer_path: $(ghpc_stage("./nccl-rdma-installer.yaml"))
-  mtu_size: 8896
-  static_node_count: # Number of A3-Ultra nodes, e.g. 2
   # Number of H200 GPUs (for later use by Kueue), which
-  # should be 8 x `static_node_count`
+  # should be 8 x static_node_count.
   num_gpus:
-  training_bucket_name: # Name of bucket that holds training data
-  checkpoint_bucket_name: # Name of bucket used for checkpoints
+
+  # Cidr block containing the IP of the machine calling terraform. 
+ # To allow all (IAM restrictions still enforced), use 0.0.0.0/0 + # To allow only your IP address, use /32 + authorized_cidr: + + # The name of the compute engine reservation of A3-Ultra nodes in the form of + # //reservationBlocks/ + extended_reservation: + + # The name of the GCS bucket used for training data + training_bucket_name: + + # The following variables do not need to be modified + nccl_installer_path: $(ghpc_stage("./nccl-rdma-installer.yaml")) + mtu_size: 8896 system_node_pool_disk_size_gb: 200 a3ultra_node_pool_disk_size_gb: 100 + deployment_groups: - group: primary modules: @@ -68,7 +86,7 @@ deployment_groups: subnet_ip: 192.168.64.0/18 - id: gke-a3-ultra-rdma-net - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc settings: network_name: $(vars.deployment_name)-rdma-net mtu: $(vars.mtu_size) @@ -81,7 +99,7 @@ deployment_groups: region: $(vars.region) - id: a3-ultragpu-cluster - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster use: [gke-a3-ultra-net-0] settings: release_channel: RAPID @@ -118,7 +136,7 @@ deployment_groups: outputs: [instructions] - id: a3-ultragpu-pool - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool use: [a3-ultragpu-cluster] settings: machine_type: a3-ultragpu-8g @@ -155,17 +173,17 @@ deployment_groups: outputs: [instructions] - id: topology-aware-scheduler-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler use: [a3-ultragpu-cluster] # Install Kueue, Jobset, and NCCL installer - id: workload-manager-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply use: [a3-ultragpu-cluster] settings: kueue: install: true - version: v0.9.1 + version: v0.10.0 config_path: $(ghpc_stage("kueue-configuration.yaml.tftpl")) config_template_vars: num_gpus: $(vars.num_gpus) @@ -233,6 +251,11 @@ deployment_groups: source: modules/compute/gke-job-template use: [checkpointing-pv, training-pv, a3-ultragpu-pool] settings: + + # By adding an ephemeral volume, this will ensure that the job adds: + # nodeSelector: + # cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + # which is the best practice for using local-ssd for ephemeral storage. 
ephemeral_volumes:
       - type: local-ssd
         mount_path: /scratch-data
         size_gb: 1000 # Use 1 out of 12 TB for local scratch
@@ -291,6 +314,6 @@ deployment_groups:
           --name=checkpoint --blocksize=100m --iodepth=64 --readwrite=read
 
           # Clean up temporary directories for fio benchmarks
-          rm -rf /{scratch-training,checkpoint}-data/fio-benchmarks-${TAG}
+          rm -rf /{scratch,training,checkpoint}-data/fio-benchmarks-${TAG}
 
   outputs: [instructions]
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
index 0d9966b3e0..48b8b200d8 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
@@ -15,23 +15,41 @@
 terraform_backend_defaults:
   type: gcs
   configuration:
+    # The GCS bucket used for storing terraform state
     bucket:
 
 vars:
+  # Your GCP Project ID
   project_id:
+
   # This should be unique across all of your Cluster
   # Toolkit Deployments.
   deployment_name: a3u-gke-gcs
+
+  # The GCP Region used for this deployment.
   region:
+
+  # The GCP Zone used for this deployment.
   zone:
+
+  # The number of nodes to be created
   static_node_count:
+
+  # Number of H200 GPUs (for later use by Kueue).
   # This should be 8 x static_node_count.
   num_gpus:
+
   # Cidr block containing the IP of the machine calling terraform.
-  # The following line must be updated for this example to work.
-  # e.g. the local system running Terraform or a remote node
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
+  # To allow only your IP address, use /32
   authorized_cidr:
+
+  # The name of the compute engine reservation of A3-Ultra nodes in the form of
+  # //reservationBlocks/
   extended_reservation:
+
+  # The name of the GCS bucket used for training data
   training_bucket_name:
+
+  # The name of the GCS bucket used for checkpoint/restart data.
   checkpoint_bucket_name:
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl
index 97ae9d91f6..97cbaede33 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl
@@ -1,28 +1,59 @@
+# Copyright 2024 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
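+
+# The Topology resource below uses Kueue's (alpha) Topology Aware Scheduling
+# API: each level maps a node label to one tier of the physical network
+# hierarchy (block > sub-block > host > node), letting Kueue place the pods
+# of a job on topologically close nodes.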
+ +apiVersion: kueue.x-k8s.io/v1alpha1 +kind: Topology +metadata: + name: "gke-default" +spec: + levels: + - nodeLabel: "cloud.google.com/gce-topology-block" + - nodeLabel: "cloud.google.com/gce-topology-subblock" + - nodeLabel: "cloud.google.com/gce-topology-host" + - nodeLabel: "kubernetes.io/hostname" --- -apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor +apiVersion: kueue.x-k8s.io/v1beta1 metadata: - name: "default-flavor" + name: "a3u" +spec: + nodeLabels: + cloud.google.com/gke-nodepool: "a3-ultragpu-8g-a3-ultragpu-pool" + topologyName: "gke-default" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: NoSchedule --- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: "cluster-queue" + name: "a3u" spec: - namespaceSelector: {} + namespaceSelector: {} # match all. resourceGroups: - coveredResources: ["nvidia.com/gpu"] flavors: - - name: "default-flavor" + - name: "a3u" resources: - name: "nvidia.com/gpu" nominalQuota: ${num_gpus} - --- apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: namespace: "default" - name: "local-queue" + name: "a3u" spec: - clusterQueue: "cluster-queue" + clusterQueue: "a3u" diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml index 486255755a..092ba1baf3 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml @@ -70,7 +70,7 @@ spec: sysctl -w net.ipv4.conf.eth7.log_martians=0 sysctl -w net.ipv4.conf.eth8.log_martians=0 sysctl -w net.ipv4.conf.eth9.log_martians=0 - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3 name: nccl-rdma-installer resources: requests: From 02068fae5144ac6851a3016481f2f9bc7d59de06 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Tue, 7 Jan 2025 18:30:48 +0000 Subject: [PATCH 05/15] Fix sources for modules --- .../hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index 8c844dfd5b..a6c3489c89 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -86,7 +86,7 @@ deployment_groups: subnet_ip: 192.168.64.0/18 - id: gke-a3-ultra-rdma-net - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc + source: modules/network/gpu-rdma-vpc settings: network_name: $(vars.deployment_name)-rdma-net mtu: $(vars.mtu_size) @@ -99,7 +99,7 @@ deployment_groups: region: $(vars.region) - id: a3-ultragpu-cluster - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [gke-a3-ultra-net-0] settings: release_channel: RAPID @@ -136,7 +136,7 @@ deployment_groups: outputs: [instructions] - id: a3-ultragpu-pool - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [a3-ultragpu-cluster] settings: machine_type: a3-ultragpu-8g @@ -173,12 +173,12 @@ deployment_groups: outputs: [instructions] - id: topology-aware-scheduler-install - source: 
github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler + source: community/modules/compute/gke-topology-scheduler use: [a3-ultragpu-cluster] # Install Kueue, Jobset, and NCCL installer - id: workload-manager-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply + source: modules/management/kubectl-apply use: [a3-ultragpu-cluster] settings: kueue: From 3671f74764376ab5255da9cd8e3dc2eab4e76292 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 8 Jan 2025 00:03:31 +0000 Subject: [PATCH 06/15] Add link to CSI docs --- examples/hypercompute_clusters/a3u-gke-gcs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md index 7669903cf0..4040151504 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -9,8 +9,8 @@ Storage (GCS). * **Multi-VPC Design:** Utilizes three VPCs: two for GKE nodes and one dedicated for GPU RDMA networks. * **Cloud Storage Fuse Integration:** Enables seamless access to GCS buckets - from within your containers using the Cloud Storage Fuse CSI Driver. Cloud - Storage Fuse is configured to utilize the 12 TB of Local SSD + from within your containers using the [Cloud Storage Fuse CSI Driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver). + Cloud Storage Fuse is configured to utilize the 12 TB of Local SSD * **Hierarchical Namespace Buckets:** Leverages GCS buckets with Hierarchical Namespace enabled, optimizing performance for checkpointing and restarting workloads. (Requires GKE 1.31 or later). 
From 9e465d2951734cda4c1c400e2d816009c5464b13 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 8 Jan 2025 00:11:47 +0000 Subject: [PATCH 07/15] Add note about parallel downloads --- examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index a6c3489c89..0020e31c7e 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -202,6 +202,9 @@ deployment_groups: remote_mount: $(vars.training_bucket_name) local_mount: /training-data fs_type: gcsfuse + # In addition to the mount options below, if the dataset is a large + # compressed files, `file-cache:enable-parallel-downloads:true` can boost + # perf (at the cost of each node pulling the file to each node's lssd) mount_options: >- implicit-dirs, metadata-cache:ttl-secs:-1, From dfde71987bf69b2dace91b66fb9b4a5d4e21fa79 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 8 Jan 2025 00:14:58 +0000 Subject: [PATCH 08/15] Remove parallel download note --- examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index 0020e31c7e..a6c3489c89 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -202,9 +202,6 @@ deployment_groups: remote_mount: $(vars.training_bucket_name) local_mount: /training-data fs_type: gcsfuse - # In addition to the mount options below, if the dataset is a large - # compressed files, `file-cache:enable-parallel-downloads:true` can boost - # perf (at the cost of each node pulling the file to each node's lssd) mount_options: >- implicit-dirs, metadata-cache:ttl-secs:-1, From d9e3f7b9d06687e2c0b673068659aff7ed5cd933 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Thu, 16 Jan 2025 21:30:08 +0000 Subject: [PATCH 09/15] Adding Ramble based system benchmarks Also adopt latest nccl-rdma installer. 
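
A sketch of the intended flow, using the runner job names introduced in the
README added below:

    kubectl apply -f ramble-nccl.yaml                 # submit the NCCL suite
    kubectl -n ramble logs -f job/ramble-nccl-runner  # follow the controller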
--- .../a3u-gke-gcs/README.md | 5 + .../a3u-gke-gcs/a3u-gke-gcs.yaml | 4 +- .../a3u-gke-gcs/nccl-rdma-installer.yaml | 17 +- .../a3u-gke-gcs/system_benchmarks/README.md | 132 ++++ .../system_benchmarks/ramble-hpl.yaml | 553 +++++++++++++++ .../system_benchmarks/ramble-nccl.yaml | 526 ++++++++++++++ .../system_benchmarks/ramble-nemo.yaml | 646 ++++++++++++++++++ 7 files changed, 1872 insertions(+), 11 deletions(-) create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md index 4040151504..37ca752ee0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -107,3 +107,8 @@ Use the following commands to: Submit your job: kubectl create -f /primary/my-job-.yaml ``` + +## Running System Benchmarks with Ramble + +To run a series of NCCL, HPL, and NeMo test benchmarks on your cluster, see +`system_benchmarks/README.md`. diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index a6c3489c89..ce6e3a34f0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -81,7 +81,7 @@ deployment_groups: network_name: $(vars.deployment_name)-net-1 mtu: $(vars.mtu_size) subnetworks: - - subnet_name: gke-a3u-gcs-sub-1 + - subnet_name: $(vars.deployment_name)-sub-1 subnet_region: $(vars.region) subnet_ip: 192.168.64.0/18 @@ -189,7 +189,7 @@ deployment_groups: num_gpus: $(vars.num_gpus) jobset: install: true - version: v0.7.1 + version: v0.7.2 apply_manifests: - source: $(vars.nccl_installer_path) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml index 092ba1baf3..1186759a7b 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml @@ -62,14 +62,14 @@ spec: args: - -c - | - sysctl -w net.ipv4.conf.eth2.log_martians=0 - sysctl -w net.ipv4.conf.eth3.log_martians=0 - sysctl -w net.ipv4.conf.eth4.log_martians=0 - sysctl -w net.ipv4.conf.eth5.log_martians=0 - sysctl -w net.ipv4.conf.eth6.log_martians=0 - sysctl -w net.ipv4.conf.eth7.log_martians=0 - sysctl -w net.ipv4.conf.eth8.log_martians=0 - sysctl -w net.ipv4.conf.eth9.log_martians=0 + sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0 - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3 name: nccl-rdma-installer resources: @@ -89,7 +89,6 @@ spec: /scripts/container_entry.sh install --install-nccl cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 cp -r /var/lib/gib/. 
/usr/local/home/kubernetes/bin/gib
-          ibv_devinfo || exit 1
           echo "installation finishes"
       containers:
       - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
         name: pause
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md
new file mode 100644
index 0000000000..63bce8c2db
--- /dev/null
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md
@@ -0,0 +1,132 @@
+Running System Benchmarks with Ramble
+=====================================
+
+[Ramble](https://github.com/GoogleCloudPlatform/ramble) is an open source
+multi-platform experimentation framework written in Python. It can be used
+to easily reproduce benchmark results across systems, and here
+we will use it to run a series of system benchmarks.
+
+Currently the following benchmarks are supported:
+
+* NCCL tests (all-gather, all-reduce, reduce-scatter)
+* HPL-NVIDIA
+* Mixtral 8x7b and Llama 3.1 70B via NeMo
+
+All benchmarks use Kueue for topology aware scheduling, and use JobSet
+to orchestrate multi-node workloads.
+
+For NCCL tests, run:
+
+  ```bash
+  kubectl apply -f ramble-nccl.yaml
+  ```
+
+For HPL tests, run:
+
+  ```bash
+  kubectl apply -f ramble-hpl.yaml
+  ```
+
+For NeMo tests, run:
+
+  ```bash
+  kubectl apply -f ramble-nemo.yaml
+  ```
+
+Where applicable, the NeMo workload configurations have been chosen to
+reproduce those found in
+[AI-Hypercomputer/gpu-recipes](https://github.com/AI-Hypercomputer/gpu-recipes).
+
+For any of the above, the following will be created:
+
+* A `ramble` namespace in your K8s cluster
+* A Kueue `LocalQueue` in the `ramble` namespace.
+* A "ramble" service account (and associated RBAC configs) that has access to
+  the core, batch, jobset, and kueue apis in the `ramble` namespace, as well as
+  read access to the kueue "clusterqueues" resources across the cluster.
+* ConfigMaps containing various scripts/configurations.
+* A K8s `Job` that works as the ramble controller process, which creates a
+  series of `JobSet` objects for each individual benchmark.
+
+Applying one of these manifests first creates a K8s job called
+"ramble-{nccl,hpl,nemo}-runner". This controller job orchestrates the running
+and analysis of the benchmarks. It installs everything it needs within a
+self-contained pod, creates an ssh keypair for multi-node communication, and
+uses Ramble to create JobSets for each benchmark. Once those benchmarks
+are complete, it provides a summary of the results. Full benchmark logs can
+otherwise be found in the logs of the created JobSets/Jobs/Pods
+themselves.
+
+For each benchmark, multiple node scales will be submitted, up to the maximum
+node count of your cluster. This can be controlled with the `n_nodes` variable
+in the `ramble.yaml` configMap.
+
+Note: The following depends on several tightly coupled settings, in particular
+making sure that the subnet names in your GKE cluster match those defined in
+the "ramble.yaml" config file. If you modify the names of your subnets
+(including by changing the "deployment" name), then you will need to modify
+the K8s yaml files. 
Specifically, the following variables may need to be
+modified in the `ramble.yaml` configmap in each of the
+ramble-{nccl,hpl,nemo}.yaml files:
+
+    gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool # The nodepool name
+    sysnet_subnet_prefix: a3u-gke-gcs-sub
+    gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
+    cluster_queue: a3u
+
+Expected Results
+----------------
+
+For ramble-nccl.yaml, at the end of the logs of the created `ramble-nccl-runner`
+job, you should see something like:
+
+  ```bash
+  kubectl -n ramble logs job/ramble-nccl-runner
+  ...
+  ---- SUMMARY for >1GB Message Sizes ----
+  workload        n_nodes  msg_size     busbw
+  all-gather      2        1073741824   XXX.XX
+  all-gather      2        2147483648   XXX.XX
+  all-gather      2        4294967296   XXX.XX
+  all-gather      2        8589934592   XXX.XX
+  ...
+  all-reduce      2        1073741824   XXX.XX
+  ...
+  reduce-scatter  2        1073741824   XXX.XX
+  ...
+
+  -------- Benchmarking Complete -------
+  ```
+
+Similarly, for ramble-hpl.yaml:
+
+  ```bash
+  kubectl -n ramble logs job/ramble-hpl-runner
+  ...
+  --------------- SUMMARY ---------------
+  workload    n_nodes  GFlop/s    GFlops/s/GPU
+  calculator  1        X.XXXe+05  X.XXXe+04
+  calculator  2        X.XXXe+05  X.XXXe+04
+  calculator  4        X.XXXe+06  X.XXXe+04
+  calculator  8        X.XXXe+06  X.XXXe+04
+
+  -------- Benchmarking Complete -------
+  ```
+
+And for ramble-nemo.yaml:
+
+  ```bash
+  kubectl -n ramble logs job/ramble-nemo-runner
+  ...
+  --------------- SUMMARY ---------------
+  nemo_config   n_nodes  step     train_step_timing
+  mixtral_8x7b  8        0-10/10  XX.XX
+  llama3_1_70b  8        0-10/10  XX.XX
+
+  -------- Benchmarking Complete -------
+  ```
+
+Cleaning Up
+-----------
+
+To remove all resources created by these benchmarks, you can run:
+
+    kubectl delete -f ramble-nccl.yaml
+    kubectl delete -f ramble-hpl.yaml
+    kubectl delete -f ramble-nemo.yaml
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
new file mode 100644
index 0000000000..e5f5e4d2a9
--- /dev/null
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -0,0 +1,553 @@
+# Copyright 2025 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
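+
+# The resources below form a self-contained benchmark harness: a "ramble"
+# namespace and LocalQueue, RBAC for the runner's service account, ConfigMaps
+# holding the Ramble templates, and a runner Job that uses Ramble to generate
+# a JobSet per benchmark scale.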
+ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ramble + namespace: ramble +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "ramble" + name: "a3u" +spec: + clusterQueue: "a3u" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ramble + namespace: ramble +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ramble + name: ramble-editor +rules: +- apiGroups: ["", "batch", "jobset.x-k8s.io", "kueue.x-k8s.io"] # "" indicates the core API group + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kueue-reader +rules: +- apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ramble-editor + namespace: ramble +subjects: +- kind: ServiceAccount + name: ramble + apiGroup: "" +roleRef: + kind: Role + name: ramble-editor + apiGroup: "" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ramble-kueue-reader +subjects: +- kind: ServiceAccount + name: ramble + namespace: ramble + apiGroup: "" +roleRef: + kind: ClusterRole + name: kueue-reader + apiGroup: "" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ramble-hpl-configs + namespace: ramble +data: + execute_hpl.tpl: | + #!/bin/bash + set -e + cd "{experiment_run_dir}" + kubectl delete -n {gke_namespace} configmap {experiment_name} || true + kubectl create -n {gke_namespace} configmap {experiment_name} --from-file={experiment_run_dir}/HPL.dat + printf "Submitting {experiment_name}\n" + kubectl create -f jobset 2>&1 | tee klog + + collect_logs.tpl: | + #!/bin/bash + set -e + jobname=$(head -n 1 {experiment_run_dir}/klog | awk -F " |/" '{print $2}') + printf "Waiting for up to a day for ${jobname} to complete.\n" + kubectl wait --timeout=86400s jobs/${jobname}-w-0 --for=condition=complete + kubectl logs --tail=-1 -f -l batch.kubernetes.io/job-completion-index=0,job-name=${jobname}-w-0 | tee {log_file} + + ramble.yaml: | + ramble: + variables: + ssh_port: 22 + batch_submit: '{execute_hpl}' + mpi_command: >- + mpirun + -n {n_ranks} + -N {processes_per_node} + --bind-to none + --hostfile /tmp/hostfile + --mca btl self,tcp + --mca btl_tcp_if_include eth0 + --mca orte_keep_fqdn_hostnames 1 + --mca plm_rsh_no_tree_spawn 1 + -x {mpi_env_vars} + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p {ssh_port}" + mpi_env_vars: >- + $(echo + ${!NCCL*} + ${!OMPI*} + LD_LIBRARY_PATH + ${!HPL*} + ${!UCX*} + | sed 's/ / -x /g') + + + container_name: hpl + container_uri: "nvcr.io/nvidia/hpc-benchmarks:24.09" + gke_container_name: hpl + gke_namespace: ramble + jobset_name: "hpl-{n_nodes}" + processes_per_node: 8 + + # Potentially need to be modified + gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool + sysnet_subnet_prefix: a3u-gke-gcs-sub + gpu_subnet_prefix: a3u-gke-gcs-rdma-sub + cluster_queue: a3u + env_vars: + set: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + HPL_FCT_COMM_POLICY: 1 + HPL_P2P_AS_BCAST: '{hpl_p2p_as_bcast}' + HPL_USE_NVSHMEM: 0 + NVSHMEM_DISABLE_CUDA_VMM: 1 + OMPI_MCA_btl: openib + OMPI_MCA_pml: "^ucx" + UCX_MAX_RNDV_RAILS: 4 + UCX_NET_DEVICES: "mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1" + NCCL_NET: gIB + prepend: + - paths: + LD_LIBRARY_PATH: /usr/local/gib/lib64 + + + applications: + nvidia-hpl: + workloads: + calculator: + experiments: 
+ hpl-{n_nodes}: + variables: + n_nodes: [1,2,4,8,16,24,32] + + # 0 = ncclBcast, 1 = ncclSend/Recv + hpl_p2p_as_bcast: '0' + + # Percent of memory to use (default 85) + percent_mem: 85 + + # Memory per node in GB + memory_per_node: '1200' + + # Other Recommended Settings + block_size: '1024' + PMAP: 1 + SWAP: 1 + swapping_threshold: 192 + L1: 1 + U: 0 + Equilibration: 0 + pfact: 0 + nbmin: 2 + rfact: 0 + bcast: 3 + depth: 1 + + internals: + custom_executables: + mpi_head_node: + template: + - source /usr/local/gib/scripts/set_nccl_env.sh + - if [[ "${NODE_RANK}" -eq "0" ]]; then + redirect: '' + log_file: '' + wait_worker_nodes: + template: + - else + - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + - sleep 5 + - done + - fi + redirect: '' + log_file: '' + tail_log: + template: + - tail -f {log_file} & + - export TAIL_PID=$! + redirect: '' + log_file: '' + kill_tail: + template: + - kill -9 $TAIL_PID + redirect: '' + log_file: '' + executable_injection: + - name: mpi_head_node + order: before + - name: wait_worker_nodes + order: after + - name: tail_log + order: before + - name: kill_tail + order: after + formatted_executables: + yaml_command: + indentation: 18 + join_separator: \n + commands: + - mkdir -p {experiment_run_dir} + - ulimit -l unlimited + - cp /configs/HPL.dat {experiment_run_dir}/ + - '{unformatted_command}' + + jobset.tpl: | + apiVersion: jobset.x-k8s.io/v1alpha2 + kind: JobSet + metadata: + generateName: {jobset_name}- + namespace: {gke_namespace} + labels: + kueue.x-k8s.io/queue-name: {cluster_queue} + spec: + ttlSecondsAfterFinished: 86400 + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: w + template: + spec: + parallelism: {n_nodes} + completions: {n_nodes} + template: + metadata: + annotations: + kueue.x-k8s.io/podset-preferred-topology: "kubernetes.io/hostname" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + \{"interfaceName":"eth0","network":"default"\}, + \{"interfaceName":"eth1","network":"{sysnet_subnet_prefix}-1"\}, + \{"interfaceName":"eth2","network":"{gpu_subnet_prefix}-0"\}, + \{"interfaceName":"eth3","network":"{gpu_subnet_prefix}-1"\}, + \{"interfaceName":"eth4","network":"{gpu_subnet_prefix}-2"\}, + \{"interfaceName":"eth5","network":"{gpu_subnet_prefix}-3"\}, + \{"interfaceName":"eth6","network":"{gpu_subnet_prefix}-4"\}, + \{"interfaceName":"eth7","network":"{gpu_subnet_prefix}-5"\}, + \{"interfaceName":"eth8","network":"{gpu_subnet_prefix}-6"\}, + \{"interfaceName":"eth9","network":"{gpu_subnet_prefix}-7"\} + ] + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: {gke_nodepool} + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + setHostnameAsFQDN: true + volumes: + - name: mpi-id + secret: + secretName: mpi-ssh-hpl + items: + - key: ssh-privatekey + path: "id_rsa" + - key: ssh-publickey + path: "id_rsa.pub" + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + - name: hpl-config + configMap: + name: {experiment_name} + initContainers: + - name: gpu-healthcheck + image: 
alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + containers: + - name: {gke_container_name} + stdin: true + tty: true + image: {container_uri} + env: + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - bash + - -c + - | + set -x + + # Setup SSH + export DEBIAN_FRONTEND=noninteractive + + apt update -qq -y + apt install -qq -y iputils-ping openssh-server + + mkdir -p /run/sshd ~/.ssh + chmod 700 ~/.ssh + cp /secrets/ssh/* ~/.ssh/ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/* + mkdir -p /run/sshd + /sbin/sshd + + # Load all the cuda libs + /sbin/ldconfig + + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + # For every host, get the entity and add to hostfile + for i in `seq 0 $(({n_nodes}-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p {ssh_port} -o StrictHostKeyChecking=no $OTHER hostname; + do + echo ... 
+ sleep 10 + done + echo ${OTHER} port={ssh_port} slots={processes_per_node} | tee -a /tmp/hostfile; + done + cat /tmp/hostfile + + {yaml_command} + + exit 0 + + volumeMounts: + - name: mpi-id + mountPath: "/secrets/ssh" + readOnly: true + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + - name: local-ssd + mountPath: /ssd + - name: hpl-config + mountPath: /configs + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + + restartPolicy: Never + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ramble-hpl-runner + namespace: ramble +spec: + template: + spec: + volumes: + - name: config + configMap: + name: ramble-hpl-configs + items: + - key: jobset.tpl + path: jobset.tpl + - key: ramble.yaml + path: ramble.yaml + - key: execute_hpl.tpl + path: execute_hpl.tpl + - key: collect_logs.tpl + path: collect_logs.tpl + + serviceAccountName: ramble + containers: + - name: ramble-controller + image: ubuntu:latest + + volumeMounts: + - name: config + mountPath: /opt/configs/ + readOnly: true + + command: + - bash + - -c + - | + export DEBIAN_FRONTEND=noninteractive + + set -e + printf "Installing system dependencies\n" + apt update -qq -y > /dev/null + apt install -qq -y build-essential python3-venv jq git curl > /dev/null + + printf "Installing kubectl\n" + curl -s -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Use current unix timestamp as a unique tag + # for jobs submitted + TAG=$(date +%s) + TEST_DIR=/workspace/hpl-tests-"${TAG}" + SOFTWARE_INSTALL=/opt + + mkdir -p ${SOFTWARE_INSTALL} ${TEST_DIR} + + printf "Cloning ramble and cluster-toolkit\n" + git clone --depth 1 -c feature.manyFiles=true https://github.com/GoogleCloudPlatform/ramble.git "${SOFTWARE_INSTALL}"/ramble + + printf "Setting up ramble python environment, and installing requirements\n" + python3 -m venv "${SOFTWARE_INSTALL}"/ramble/env || true + source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate + pip install -q -r "${SOFTWARE_INSTALL}"/ramble/requirements.txt + + # Activate ramble + . 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh + + ramble workspace create -a -d "${TEST_DIR}" + + cp /opt/configs/* ${RAMBLE_WORKSPACE}/configs/ + + cd ${RAMBLE_WORKSPACE} + + # Set up SSH + printf "Creating ssh keypair for MPI workloads\n" + ssh-keygen -b 2048 -f mpi_id -N "" + kubectl create secret generic mpi-ssh-hpl --from-file=ssh-privatekey=./mpi_id --from-file=ssh-publickey=./mpi_id.pub || true + + # Get number of GPUs / nodes available in this cluster from Kueue: + AVAILABLE_GPUS=$( + kubectl get clusterqueues.kueue.x-k8s.io -o json | + jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") | + .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota' + ) + + N_NODES=$((AVAILABLE_GPUS / 8)) + + printf "\n--- Available Benchmarks on %s nodes --\n" ${N_NODES} + ramble workspace info --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------- Setting up Benchmarks -------\n" + ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" + + printf "\n----------- Running Benchmarks --------\n" + ramble on --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Collecting benchmark logs -----\n" + ramble on --executor "{experiment_run_dir}/collect_logs" --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Analyzing benchmark logs ------\n" + ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------------- SUMMARY ---------------\n" + jq -r '["workload","n_nodes","GFlop/s ","GFlops/s/GPU"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | + { + experiment_name: $exp.name, + workload: $exp.workload_name, + n_nodes: $exp.n_nodes, + Context: $context.name + } + + ($context.foms | from_entries ) + | [.workload, .n_nodes, .GFlops, ."Per GPU GFlops"]) + | @tsv' results.latest.json + printf "\n-------- Benchmarking Complete -------\n" + + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml new file mode 100644 index 0000000000..ae7753516e --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml @@ -0,0 +1,526 @@ +# Copyright 2025 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
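+
+# Self-contained NCCL benchmark harness for the A3-Ultra cluster: the
+# manifests below create the `ramble` namespace and RBAC, a ConfigMap of
+# Ramble templates, and a controller Job ("ramble-nccl-runner") that
+# generates one JobSet per collective (all-gather, all-reduce,
+# reduce-scatter) and node count, then summarizes bus bandwidth.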
+ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ramble + namespace: ramble +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "ramble" + name: "a3u" +spec: + clusterQueue: "a3u" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ramble + namespace: ramble +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ramble + name: ramble-editor +rules: +- apiGroups: ["", "batch", "jobset.x-k8s.io", "kueue.x-k8s.io"] # "" indicates the core API group + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kueue-reader +rules: +- apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ramble-editor + namespace: ramble +subjects: +- kind: ServiceAccount + name: ramble + apiGroup: "" +roleRef: + kind: Role + name: ramble-editor + apiGroup: "" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ramble-kueue-reader +subjects: +- kind: ServiceAccount + name: ramble + namespace: ramble + apiGroup: "" +roleRef: + kind: ClusterRole + name: kueue-reader + apiGroup: "" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ramble-nccl-configs + namespace: ramble +data: + execute_nccl.tpl: | + #!/bin/bash + set -e + cd "{experiment_run_dir}" + printf "Submitting ${experiment_name}\n" + kubectl create -f jobset 2>&1 | tee klog + + collect_logs.tpl: | + #!/bin/bash + set -e + jobname=$(head -n 1 {experiment_run_dir}/klog | awk -F " |/" '{print $2}') + printf "Waiting for up to a day for ${jobname} to complete.\n" + kubectl wait --timeout=86400s jobs/${jobname}-w-0 --for=condition=complete + kubectl logs --tail=-1 -f -l batch.kubernetes.io/job-completion-index=0,job-name=${jobname}-w-0 | tee {log_file} + + ramble.yaml: | + ramble: + variables: + ssh_port: 222 + batch_submit: '{execute_nccl}' + mpi_command: >- + mpirun + -n {n_ranks} + -N {processes_per_node} + --bind-to none + --hostfile /tmp/hostfile + --mca btl self,tcp + --mca btl_tcp_if_include eth0 + --mca orte_keep_fqdn_hostnames 1 + --mca plm_rsh_no_tree_spawn 1 + -x {mpi_env_vars} + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p {ssh_port}" + mpi_env_vars: >- + $(echo + ${!NCCL*} + ${!OMPI*} + LD_LIBRARY_PATH + | sed 's/ / -x /g') + + container_name: nccl-tests + container_uri: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.3 + gke_container_name: nccl + gke_namespace: ramble + gpus_per_node: 8 + nccl-tests_path: /third_party/nccl-tests-mpi + processes_per_node: 8 + + # Potentially need to be modified + gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool + sysnet_subnet_prefix: a3u-gke-gcs-sub + gpu_subnet_prefix: a3u-gke-gcs-rdma-sub + cluster_queue: a3u + env_vars: + set: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + NCCL_NET: gIB + prepend: + - paths: + LD_LIBRARY_PATH: /usr/local/gib/lib64 + + + applications: + nccl-tests: + workloads: + '{workload}': + experiments: + '{workload}-{n_nodes}': + variables: + n_nodes: [2,4,8,16,32] + + jobset_name: ['ag-{n_nodes}', 'ar-{n_nodes}', 'rs-{n_nodes}'] + workload: [all-gather, all-reduce, reduce-scatter] + binary: [all_gather_perf, all_reduce_perf, reduce_scatter_perf] + zips: + bench: + - jobset_name + - workload + - binary + matrix: + - bench + - n_nodes + + internals: + custom_executables: + 
mpi_head_node: + template: + - cd /third_party/nccl-tests/build/ + - source /usr/local/gib/scripts/set_nccl_env.sh + - if [[ "${NODE_RANK}" -eq "0" ]]; then + redirect: '' + log_file: '' + wait_worker_nodes: + template: + - else + - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + - sleep 5 + - done + - fi + redirect: '' + log_file: '' + tail_log: + template: + - tail -f {log_file} & + - export TAIL_PID=$! + redirect: '' + log_file: '' + kill_tail: + template: + - kill -9 $TAIL_PID + redirect: '' + log_file: '' + executable_injection: + - name: mpi_head_node + order: before + - name: wait_worker_nodes + order: after + - name: tail_log + order: before + - name: kill_tail + order: after + formatted_executables: + yaml_command: + indentation: 18 + join_separator: \n + commands: + - mkdir -p {experiment_run_dir} + - ulimit -l unlimited + - '{unformatted_command}' + + jobset.tpl: | + apiVersion: jobset.x-k8s.io/v1alpha2 + kind: JobSet + metadata: + generateName: {jobset_name}- + namespace: {gke_namespace} + labels: + kueue.x-k8s.io/queue-name: {cluster_queue} + spec: + ttlSecondsAfterFinished: 86400 + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: w + template: + spec: + parallelism: {n_nodes} + completions: {n_nodes} + template: + metadata: + annotations: + kueue.x-k8s.io/podset-preferred-topology: "kubernetes.io/hostname" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + \{"interfaceName":"eth0","network":"default"\}, + \{"interfaceName":"eth1","network":"{sysnet_subnet_prefix}-1"\}, + \{"interfaceName":"eth2","network":"{gpu_subnet_prefix}-0"\}, + \{"interfaceName":"eth3","network":"{gpu_subnet_prefix}-1"\}, + \{"interfaceName":"eth4","network":"{gpu_subnet_prefix}-2"\}, + \{"interfaceName":"eth5","network":"{gpu_subnet_prefix}-3"\}, + \{"interfaceName":"eth6","network":"{gpu_subnet_prefix}-4"\}, + \{"interfaceName":"eth7","network":"{gpu_subnet_prefix}-5"\}, + \{"interfaceName":"eth8","network":"{gpu_subnet_prefix}-6"\}, + \{"interfaceName":"eth9","network":"{gpu_subnet_prefix}-7"\} + ] + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: {gke_nodepool} + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + setHostnameAsFQDN: true + volumes: + - name: mpi-id + secret: + secretName: mpi-ssh-nccl + items: + - key: ssh-privatekey + path: "id_rsa" + - key: ssh-publickey + path: "id_rsa.pub" + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ 
\${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + containers: + - name: {gke_container_name} + stdin: true + tty: true + image: {container_uri} + env: + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - bash + - -c + - | + set -x + + # Setup SSH + export DEBIAN_FRONTEND=noninteractive + + apt update -qq -y + apt install -qq -y iputils-ping openssh-server + + mkdir -p /run/sshd ~/.ssh + chmod 700 ~/.ssh + cp /secrets/ssh/* ~/.ssh/ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/* + mkdir -p /run/sshd + /sbin/sshd + + # Load all the cuda libs + /sbin/ldconfig + + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + # For every host, get the entity and add to hostfile + for i in `seq 0 $(({n_nodes}-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p {ssh_port} -o StrictHostKeyChecking=no $OTHER hostname; + do + echo ... + sleep 10 + done + echo ${OTHER} port={ssh_port} slots={processes_per_node} | tee -a /tmp/hostfile; + done + cat /tmp/hostfile + + {yaml_command} + + exit 0 + + volumeMounts: + - name: mpi-id + mountPath: "/secrets/ssh" + readOnly: true + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + - name: local-ssd + mountPath: /ssd + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + + restartPolicy: Never + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ramble-nccl-runner + namespace: ramble +spec: + template: + spec: + volumes: + - name: config + configMap: + name: ramble-nccl-configs + items: + - key: jobset.tpl + path: jobset.tpl + - key: ramble.yaml + path: ramble.yaml + - key: execute_nccl.tpl + path: execute_nccl.tpl + - key: collect_logs.tpl + path: collect_logs.tpl + + serviceAccountName: ramble + containers: + - name: ramble-controller + image: ubuntu:latest + + volumeMounts: + - name: config + mountPath: /opt/configs/ + readOnly: true + + command: + - bash + - -c + - | + export DEBIAN_FRONTEND=noninteractive + + set -e + printf "Installing system dependencies\n" + apt update -qq -y > /dev/null + apt install -qq -y build-essential python3-venv jq git curl > /dev/null + + printf "Installing kubectl\n" + curl -s -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Use current unix timestamp as a unique tag + # for jobs submitted + TAG=$(date +%s) + TEST_DIR=/workspace/nccl-tests-"${TAG}" + SOFTWARE_INSTALL=/opt + + mkdir -p ${SOFTWARE_INSTALL} ${TEST_DIR} + + printf "Cloning ramble and cluster-toolkit\n" + git clone --depth 1 -c feature.manyFiles=true https://github.com/GoogleCloudPlatform/ramble.git "${SOFTWARE_INSTALL}"/ramble + + printf "Setting up ramble python 
environment, and installing requirements\n" + python3 -m venv "${SOFTWARE_INSTALL}"/ramble/env || true + source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate + pip install -q -r "${SOFTWARE_INSTALL}"/ramble/requirements.txt + + # Activate ramble + . ${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh + + ramble workspace create -a -d "${TEST_DIR}" + + cp /opt/configs/* ${RAMBLE_WORKSPACE}/configs/ + + cd ${RAMBLE_WORKSPACE} + + # Set up SSH + printf "Creating ssh keypair for MPI workloads\n" + ssh-keygen -b 2048 -f mpi_id -N "" + kubectl create secret generic mpi-ssh-nccl --from-file=ssh-privatekey=./mpi_id --from-file=ssh-publickey=./mpi_id.pub || true + + # Get number of GPUs / nodes available in this cluster from Kueue: + AVAILABLE_GPUS=$( + kubectl get clusterqueues.kueue.x-k8s.io -o json | + jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") | + .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota' + ) + + N_NODES=$((AVAILABLE_GPUS / 8)) + + printf "\n--- Available Benchmarks on %s nodes --\n" ${N_NODES} + ramble workspace info --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------- Setting up Benchmarks -------\n" + ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" + + printf "\n----------- Running Benchmarks --------\n" + ramble on --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Collecting benchmark logs -----\n" + ramble on --executor "{experiment_run_dir}/collect_logs" --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Analyzing benchmark logs ------\n" + ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + + printf "\n---- SUMMARY for >1GB Message Sizes ----\n" + jq -r '["workload","n_nodes","msg_size","busbw"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | + { + experiment_name: $exp.name, + workload: $exp.workload_name, + n_nodes: $exp.n_nodes, + Context: $context.name + } + + ($context.foms | from_entries ) + | select(.Size | tonumber > 1000000000) + | [.workload, .n_nodes, .Size, ."Out of Place Bus Bandwidth"]) + | @tsv' results.latest.json + printf "\n-------- Benchmarking Complete -------\n" + + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml new file mode 100644 index 0000000000..fe5642b4f1 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml @@ -0,0 +1,646 @@ +# Copyright 2025 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
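+
+# NeMo pretraining benchmark harness: as with the NCCL and HPL runners,
+# the manifests below create the `ramble` namespace and RBAC, a ConfigMap
+# of Ramble templates, and a controller Job ("ramble-nemo-runner") that
+# launches Mixtral 8x7B and Llama 3.1 70B mock-data pretraining JobSets
+# and reports train-step timing.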
+ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ramble + namespace: ramble +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "ramble" + name: "a3u" +spec: + clusterQueue: "a3u" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ramble + namespace: ramble +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ramble + name: ramble-editor +rules: +- apiGroups: ["", "batch", "jobset.x-k8s.io", "kueue.x-k8s.io"] # "" indicates the core API group + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kueue-reader +rules: +- apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ramble-editor + namespace: ramble +subjects: +- kind: ServiceAccount + name: ramble + apiGroup: "" +roleRef: + kind: Role + name: ramble-editor + apiGroup: "" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ramble-kueue-reader +subjects: +- kind: ServiceAccount + name: ramble + namespace: ramble + apiGroup: "" +roleRef: + kind: ClusterRole + name: kueue-reader + apiGroup: "" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ramble-nemo-configs + namespace: ramble +data: + execute_nemo.tpl: | + #!/bin/bash + set -e + cd "{experiment_run_dir}" + kubectl delete -n {gke_namespace} configmap {experiment_name} || true + kubectl create -n {gke_namespace} configmap {experiment_name} --from-file={nemo_generated_config_path}/{nemo_generated_config_name} + printf "Submitting {experiment_name}\n" + kubectl create -f jobset 2>&1 | tee klog + + collect_logs.tpl: | + #!/bin/bash + set -e + jobname=$(head -n 1 {experiment_run_dir}/klog | awk -F " |/" '{print $2}') + printf "Waiting for up to a day for ${jobname} to complete.\n" + kubectl wait --timeout=86400s jobs/${jobname}-w-0 --for=condition=complete + kubectl logs --tail=-1 -f -l batch.kubernetes.io/job-completion-index=0,job-name=${jobname}-w-0 | tee {log_file} + + ramble.yaml: | + ramble: + variables: + ssh_port: 22 + batch_submit: '{execute_nemo}' + mpi_command: >- + mpirun + -n {n_ranks} + -N {processes_per_node} + --bind-to none + --hostfile /tmp/hostfile + --mca btl self,tcp + --mca btl_tcp_if_include eth0 + --mca orte_keep_fqdn_hostnames 1 + --mca plm_rsh_no_tree_spawn 1 + -x {mpi_env_vars} + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p {ssh_port}" + mpi_env_vars: >- + $(echo + ${!NCCL*} + ${!OMPI*} + LD_LIBRARY_PATH + ${!CUDA*} + ${!GLOO*} + ${!NVIDIA*} + ${!NVTE*} + ${!OMP*} + ${!TORCH*} + ${!TQDM*} + TRANSFORMERS_OFFLINE + PYTHONPATH + | sed 's/ / -x /g') + + container_name: nemo + container_uri: nvcr.io/nvidia/nemo:{nemo_version} + gke_container_name: nemo + gke_namespace: ramble + gpus_per_node: 8 + n_threads: 12 + nemo_launcher_tag: 24.07 + nemo_version: 24.07 + processes_per_node: 8 + + # Potentially need to be modified + gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool + sysnet_subnet_prefix: a3u-gke-gcs-sub + gpu_subnet_prefix: a3u-gke-gcs-rdma-sub + cluster_queue: a3u + env_vars: + set: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + OMP_NUM_THREADS: '{n_threads}' + TRANSFORMERS_OFFLINE: 0 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NCCL_NVLS_ENABLE: 0 + GLOO_SOCKET_IFNAME: eth0,eth1 + # SM_MARGIN environment vars prevent send-receive stalling execution + # (results in reduced step 
time) + NVTE_FWD_LAYERNORM_SM_MARGIN: 8 + NVTE_BWD_LAYERNORM_SM_MARGIN: 8 + NCCL_NET: gIB + prepend: + - paths: + LD_LIBRARY_PATH: /usr/local/gib/lib64 + + + applications: + py-nemo: + workloads: + pretraining: + experiments: + mixtral-{n_nodes}-nodes: + variables: + n_nodes: [8,16,32] + jobset_name: 'm8x7b-{n_nodes}' + + nemo_stage: training + nemo_model: mixtral + nemo_config_name: mixtral_8x7b + + max_steps: 10 + trainer.max_steps: '{max_steps}' + trainer.val_check_interval: 50 + trainer.log_every_n_steps: 1 + trainer.enable_model_summary: false + + model.tokenizer.library: megatron + model.tokenizer.type: GPT2BPETokenizer + model.tokenizer.model: null + model.tokenizer.delimiter : null + model.tokenizer.vocab_file: gpt2-vocab.json + model.tokenizer.merge_file: gpt2-merges.txt + + exp_manager.exp_dir: '{experiment_run_dir}' + + run.time_limit: 01:00:00 + + model.data.data_impl: mock + model.data.data_prefix: [] + model.data.splits_string: 90,8,2 + model.data.num_workers: 4 + + model.optim.contiguous_grad_buffer: true + model.optim.contiguous_param_buffer: true + + model.global_batch_size: 1024 + model.micro_batch_size: 2 + model.virtual_pipeline_model_parallel_size: null + model.pipeline_model_parallel_size: 1 + model.gc_interval: 100 + model.fp8_params: true + + model.nsys_profile.ranks: [0, 8] + model.nsys_profile.start_step: 27 + model.nsys_profile.end_step: 29 + + # Checkpoint saving & logging + exp_manager.resume_if_exists: false + exp_manager.create_checkpoint_callback: false + + llama3-{n_nodes}-nodes: + variables: + n_nodes: [8,16,32] + + jobset_name: 'llama-{n_nodes}' + nemo_stage: training + nemo_model: llama + nemo_config_name: llama3_1_70b + + max_steps: 10 + trainer.max_steps: '{max_steps}' + trainer.val_check_interval: 200 + trainer.log_every_n_steps: 1 + trainer.limit_val_batches: 5 + trainer.limit_test_batches: 5 + + model.tokenizer.library: megatron + model.tokenizer.type: GPT2BPETokenizer + model.tokenizer.model: null + model.tokenizer.delimiter : null + model.tokenizer.vocab_file: gpt2-vocab.json + model.tokenizer.merge_file: gpt2-merges.txt + + exp_manager.exp_dir: '{experiment_run_dir}' + exp_manager.checkpoint_callback_params.model_parallel_size: ${multiply:$\{model.tensor_model_parallel_size}, $\{model.pipeline_model_parallel_size}} + run.time_limit: 0-03:30:00 + + model.data.data_impl: mock + model.data.data_prefix: [] + model.data.splits_string: 90,8,2 + + model.optim.grad_sync_dtype: bf16 + + model.global_batch_size: 1024 + model.virtual_pipeline_model_parallel_size: 20 + model.tensor_model_parallel_size: 2 + model.context_parallel_size: 1 + + model.fp8: true + model.fp8_e4m3: true + model.fp8_hybrid: true + model.fp8_params: true + + model.ub_tp_comm_overlap: false + + model.nsys_profile.ranks: [0, 8] + model.nsys_profile.start_step: 17 + model.nsys_profile.end_step: 19 + + # Checkpoint saving & logging + exp_manager.resume_if_exists: false + exp_manager.create_checkpoint_callback: false + exp_manager.create_dllogger_logger: false + + internals: + custom_executables: + mpi_head_node: + template: + - echo "Downloading GPT vocabulary files" + - wget -P /opt/NeMo/ https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json + - wget -P /opt/NeMo/ https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt + - source /usr/local/gib/scripts/set_nccl_env.sh + - if [[ "${NODE_RANK}" -eq "0" ]]; then + redirect: '' + log_file: '' + wait_worker_nodes: + template: + - else + - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + - sleep 5 + - done + - 
fi + redirect: '' + log_file: '' + tail_log: + template: + - tail -f {log_file} & + - export TAIL_PID=$! + redirect: '' + log_file: '' + kill_tail: + template: + - kill -9 $TAIL_PID + redirect: '' + log_file: '' + executable_injection: + - name: mpi_head_node + order: before + - name: wait_worker_nodes + order: after + - name: tail_log + order: before + - name: kill_tail + order: after + formatted_executables: + yaml_command: + indentation: 18 + join_separator: \n + commands: + - mkdir -p {experiment_run_dir} + - ulimit -l unlimited + - cp /configs/nemo.yaml {experiment_run_dir}/ + - '{unformatted_command}' + + jobset.tpl: | + apiVersion: jobset.x-k8s.io/v1alpha2 + kind: JobSet + metadata: + generateName: {jobset_name}- + namespace: {gke_namespace} + labels: + kueue.x-k8s.io/queue-name: {cluster_queue} + spec: + ttlSecondsAfterFinished: 86400 + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: w + template: + spec: + parallelism: {n_nodes} + completions: {n_nodes} + template: + metadata: + annotations: + kueue.x-k8s.io/podset-preferred-topology: "kubernetes.io/hostname" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + \{"interfaceName":"eth0","network":"default"\}, + \{"interfaceName":"eth1","network":"{sysnet_subnet_prefix}-1"\}, + \{"interfaceName":"eth2","network":"{gpu_subnet_prefix}-0"\}, + \{"interfaceName":"eth3","network":"{gpu_subnet_prefix}-1"\}, + \{"interfaceName":"eth4","network":"{gpu_subnet_prefix}-2"\}, + \{"interfaceName":"eth5","network":"{gpu_subnet_prefix}-3"\}, + \{"interfaceName":"eth6","network":"{gpu_subnet_prefix}-4"\}, + \{"interfaceName":"eth7","network":"{gpu_subnet_prefix}-5"\}, + \{"interfaceName":"eth8","network":"{gpu_subnet_prefix}-6"\}, + \{"interfaceName":"eth9","network":"{gpu_subnet_prefix}-7"\} + ] + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: {gke_nodepool} + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + setHostnameAsFQDN: true + volumes: + - name: mpi-id + secret: + secretName: mpi-ssh-nemo + items: + - key: ssh-privatekey + path: "id_rsa" + - key: ssh-publickey + path: "id_rsa.pub" + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + - name: nemo-config + configMap: + name: {experiment_name} + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU 
index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + containers: + - name: {gke_container_name} + stdin: true + tty: true + image: {container_uri} + env: + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - bash + - -c + - | + set -x + + # Setup SSH + export DEBIAN_FRONTEND=noninteractive + + apt update -qq -y + apt install -qq -y iputils-ping openssh-server + + mkdir -p /run/sshd ~/.ssh + chmod 700 ~/.ssh + cp /secrets/ssh/* ~/.ssh/ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/* + mkdir -p /run/sshd + /sbin/sshd + + # Load all the cuda libs + /sbin/ldconfig + + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + # For every host, get the entity and add to hostfile + for i in `seq 0 $(({n_nodes}-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p {ssh_port} -o StrictHostKeyChecking=no $OTHER hostname; + do + echo ... + sleep 10 + done + echo ${OTHER} port={ssh_port} slots={processes_per_node} | tee -a /tmp/hostfile; + done + cat /tmp/hostfile + + export MASTER_ADDR=${WORKERS_BASENAME}-0.${POSTFIX} + export MASTER_PORT=5678 + + {yaml_command} + + exit 0 + + volumeMounts: + - name: mpi-id + mountPath: "/secrets/ssh" + readOnly: true + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + - name: local-ssd + mountPath: /ssd + - name: nemo-config + mountPath: /configs + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + + restartPolicy: Never + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ramble-nemo-runner + namespace: ramble +spec: + template: + spec: + volumes: + - name: config + configMap: + name: ramble-nemo-configs + items: + - key: jobset.tpl + path: jobset.tpl + - key: ramble.yaml + path: ramble.yaml + - key: execute_nemo.tpl + path: execute_nemo.tpl + - key: collect_logs.tpl + path: collect_logs.tpl + + serviceAccountName: ramble + containers: + - name: ramble-controller + image: ubuntu:latest + + volumeMounts: + - name: config + mountPath: /opt/configs/ + readOnly: true + + command: + - bash + - -c + - | + export DEBIAN_FRONTEND=noninteractive + + set -e + printf "Installing system dependencies\n" + apt update -qq -y > /dev/null + apt install -qq -y build-essential python3-venv jq git curl > /dev/null + + printf "Installing kubectl\n" + curl -s -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Use current unix timestamp as a unique tag + # for jobs submitted + TAG=$(date +%s) + TEST_DIR=/workspace/nemo-tests-"${TAG}" + SOFTWARE_INSTALL=/opt + + mkdir -p ${SOFTWARE_INSTALL} ${TEST_DIR} + + printf "Cloning ramble and cluster-toolkit\n" + git clone --depth 1 -c feature.manyFiles=true https://github.com/GoogleCloudPlatform/ramble.git "${SOFTWARE_INSTALL}"/ramble + + printf "Setting up ramble python environment, and installing requirements\n" + python3 -m venv "${SOFTWARE_INSTALL}"/ramble/env || 
true + source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate + pip install -q -r "${SOFTWARE_INSTALL}"/ramble/requirements.txt + + # Activate ramble + . ${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh + + ramble workspace create -a -d "${TEST_DIR}" + + cp /opt/configs/* ${RAMBLE_WORKSPACE}/configs/ + + cd ${RAMBLE_WORKSPACE} + + # Set up SSH + printf "Creating ssh keypair for MPI workloads\n" + ssh-keygen -b 2048 -f mpi_id -N "" + kubectl create secret generic mpi-ssh-nemo --from-file=ssh-privatekey=./mpi_id --from-file=ssh-publickey=./mpi_id.pub || true + + # Get number of GPUs / nodes available in this cluster from Kueue: + AVAILABLE_GPUS=$( + kubectl get clusterqueues.kueue.x-k8s.io -o json | + jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") | + .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota' + ) + + N_NODES=$((AVAILABLE_GPUS / 8)) + + printf "\n--- Available Benchmarks on %s nodes --\n" ${N_NODES} + ramble workspace info --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------- Setting up Benchmarks -------\n" + ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" + + printf "\n----------- Running Benchmarks --------\n" + ramble on --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Collecting benchmark logs -----\n" + ramble on --executor "{experiment_run_dir}/collect_logs" --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Analyzing benchmark logs ------\n" + ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------------- SUMMARY ---------------\n" + jq -r '["nemo_config","n_nodes","step","train_step_timing"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | + { + name: $exp.RAMBLE_VARIABLES.nemo_config_name, + workload: $exp.workload_name, + n_nodes: $exp.n_nodes, + Context: $context.name + } + + ($context.foms | from_entries ) + | select (.Context == "0-5/5") + | [.name, .n_nodes, .Context, .train_step_timing]) + | @tsv' results.latest.json + printf "\n-------- Benchmarking Complete -------\n" + + restartPolicy: Never + backoffLimit: 4 From ee88546422c96f510a969cebaa5c40b501cb8c47 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Fri, 17 Jan 2025 05:29:42 +0000 Subject: [PATCH 10/15] Update README for benchmarks --- .../a3u-gke-gcs/system_benchmarks/README.md | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md index 63bce8c2db..ade615a8f0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md @@ -49,19 +49,30 @@ For any of the above, the following will be created: series of `Jobset` objects for each individual benchmark. Once created, this will first create a K8s job called -"ramble-{nccl,hpl,nemo}-runner". This controller job orchestrates the running -and analysis of the benchmarks. It installs everything it needs within a -self-contained pod, creates an ssh keypair for multi-node communication, and -uses Ramble to create JobSet's for each benchmark. Once those benchmarks -are complete, it provides a summary of the results. Full benchmark logs can -otherwise be found in the logs for each of the created JobSet/Job/Pod's +"ramble-{nccl,hpl,nemo}-runner" in the ramble workspace. This controller job +orchestrates the running and analysis of the benchmarks. 
It installs everything
+it needs within a self-contained pod, creates an ssh keypair for multi-node
+communication, and uses Ramble to create JobSets for each benchmark. Once those
+benchmarks are complete, it provides a summary of the results. Full benchmark
+logs can otherwise be found in the logs of each of the created JobSets/Jobs/Pods
 themselves.
 
+If you were to run all of the above commands, you would initially see something
+like this:
+
+  ```bash
+  $ kubectl -n ramble get jobs
+  NAME                 STATUS    COMPLETIONS   DURATION   AGE
+  ramble-hpl-runner    Running   0/1           30s        30s
+  ramble-nccl-runner   Running   0/1           43s        43s
+  ramble-nemo-runner   Running   0/1           22s        22s
+  ```
+
 For each benchmark, multiple node scales will be submitted, up to the maximum
 node scale of your cluster. This can be controlled with the `n_nodes` variable
 in the `ramble.yaml` configMap.
 
-Note: The following depends on several tightly coupled settings, in particular
+Note: The benchmarks depend on several tightly coupled settings, in particular
 making sure that the subnet names in your GKE cluster match those defined in
 the "ramble.yaml" config file. If you modify the names of your subnets
 (including by changing the "deployment" name), then you will need to modify
@@ -74,8 +85,8 @@ ramble-{nccl,hpl,nemo}.yaml files:
     gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
     cluster_queue: a3u
 
-Expected Results
-----------------
+Viewing the Results
+-------------------
 
 For ramble-nccl.yaml, at the end of the logs of the created
 `ramble-nccl-runner` job, you should see something like:

From 5758a2695b381cd07e3a857737c9c46c64f21e68 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Sat, 18 Jan 2025 23:13:30 +0000
Subject: [PATCH 11/15] Simplify/improve system benchmarks

Simplify nemo examples, add instructions to get ramble workspace.
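
The printed instructions can be used along these lines once a runner pod
has finished (a sketch; the pod name, test directory, and file names are
illustrative and vary per run):

```bash
# Copy the analyzed results and the archived Ramble workspace out of a
# (hypothetical) finished runner pod.
kubectl -n ramble cp ramble-hpl-runner-x8k2v:/workspace/hpl-tests-1737244410/results.2025-01-18_23.13.30.json ./results.json
kubectl -n ramble cp ramble-hpl-runner-x8k2v:/workspace/hpl-tests-1737244410/archive/archive.2025-01-18_23.13.30.tar.gz ./archive.tar.gz
```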
--- .../system_benchmarks/ramble-hpl.yaml | 28 +++- .../system_benchmarks/ramble-nccl.yaml | 29 +++- .../system_benchmarks/ramble-nemo.yaml | 141 ++++++++---------- 3 files changed, 117 insertions(+), 81 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml index e5f5e4d2a9..bfeeecfbdb 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml @@ -129,7 +129,6 @@ data: ${!UCX*} | sed 's/ / -x /g') - container_name: hpl container_uri: "nvcr.io/nvidia/hpc-benchmarks:24.09" gke_container_name: hpl @@ -142,6 +141,7 @@ data: sysnet_subnet_prefix: a3u-gke-gcs-sub gpu_subnet_prefix: a3u-gke-gcs-rdma-sub cluster_queue: a3u + env_vars: set: CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 @@ -158,7 +158,6 @@ data: - paths: LD_LIBRARY_PATH: /usr/local/gib/lib64 - applications: nvidia-hpl: workloads: @@ -536,6 +535,9 @@ spec: printf "\n------- Analyzing benchmark logs ------\n" ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + printf "\n------- Archiving ramble workspace ------\n" + ramble workspace archive -t --where '{n_nodes} <= '"${N_NODES}" + printf "\n--------------- SUMMARY ---------------\n" jq -r '["workload","n_nodes","GFlop/s ","GFlops/s/GPU"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | { @@ -549,5 +551,27 @@ spec: | @tsv' results.latest.json printf "\n-------- Benchmarking Complete -------\n" + ARCHIVE_TAR=$(readlink archive/archive.latest.tar.gz) + ARCHIVE_PATH=${RAMBLE_WORKSPACE}/archive/${ARCHIVE_TAR} + RESULTS_FILE=$(basename $(readlink results.latest.json)) + RESULTS_PATH=${RAMBLE_WORKSPACE}/${RESULTS_FILE} + + printf "\n# To copy the full results from container:\n" + printf "kubectl cp %s:%s %s\n" $(hostname) ${RESULTS_PATH} ${RESULTS_FILE} + printf "\n# To copy the ramble workspace archive from container:\n" + printf "kubectl cp %s:%s ./%s\n" $(hostname) ${ARCHIVE_PATH} ${ARCHIVE_TAR} + + printf "\n# To re-activate ramble workspace, first access runner:\n" + printf "kubectl exec -it %s -- /bin/bash\n" $(hostname) + printf "# Then run:\n" + printf "cd ${RAMBLE_WORKSPACE}\n" + printf "source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate\n" + printf ". 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh\n" + printf "ramble workspace activate .\n" + + printf "\n- Sleeping for 1 day to allow introspection -\n" + sleep 86400 + + restartPolicy: Never backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml index ae7753516e..e28e2ad1d9 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml @@ -90,7 +90,7 @@ data: #!/bin/bash set -e cd "{experiment_run_dir}" - printf "Submitting ${experiment_name}\n" + printf "Submitting {experiment_name}\n" kubectl create -f jobset 2>&1 | tee klog collect_logs.tpl: | @@ -138,6 +138,7 @@ data: sysnet_subnet_prefix: a3u-gke-gcs-sub gpu_subnet_prefix: a3u-gke-gcs-rdma-sub cluster_queue: a3u + env_vars: set: CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 @@ -146,7 +147,6 @@ data: - paths: LD_LIBRARY_PATH: /usr/local/gib/lib64 - applications: nccl-tests: workloads: @@ -508,6 +508,9 @@ spec: printf "\n------- Analyzing benchmark logs ------\n" ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + printf "\n------- Archiving ramble workspace ------\n" + ramble workspace archive -t --where '{n_nodes} <= '"${N_NODES}" + printf "\n---- SUMMARY for >1GB Message Sizes ----\n" jq -r '["workload","n_nodes","msg_size","busbw"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | { @@ -522,5 +525,27 @@ spec: | @tsv' results.latest.json printf "\n-------- Benchmarking Complete -------\n" + ARCHIVE_TAR=$(readlink archive/archive.latest.tar.gz) + ARCHIVE_PATH=${RAMBLE_WORKSPACE}/archive/${ARCHIVE_TAR} + RESULTS_FILE=$(basename $(readlink results.latest.json)) + RESULTS_PATH=${RAMBLE_WORKSPACE}/${RESULTS_FILE} + + printf "\n# To copy the full results from container:\n" + printf "kubectl cp %s:%s %s\n" $(hostname) ${RESULTS_PATH} ${RESULTS_FILE} + printf "\n# To copy the ramble workspace archive from container:\n" + printf "kubectl cp %s:%s ./%s\n" $(hostname) ${ARCHIVE_PATH} ${ARCHIVE_TAR} + + printf "\n# To re-activate ramble workspace, first access runner:\n" + printf "kubectl exec -it %s -- /bin/bash\n" $(hostname) + printf "# Then run:\n" + printf "cd ${RAMBLE_WORKSPACE}\n" + printf "source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate\n" + printf ". 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh\n" + printf "ramble workspace activate .\n" + + printf "\n- Sleeping for 1 day to allow introspection -\n" + sleep 86400 + + restartPolicy: Never backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml index fe5642b4f1..90ac428240 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml @@ -146,11 +146,36 @@ data: nemo_version: 24.07 processes_per_node: 8 + # Shared NeMo Configurations + trainer.max_steps: 10 + trainer.val_check_interval: null + trainer.limit_val_batches: 0.0 + trainer.log_every_n_steps: 1 + trainer.enable_model_summary: false + + model.tokenizer.library: megatron + model.tokenizer.type: GPT2BPETokenizer + model.tokenizer.model: null + model.tokenizer.delimiter : null + model.tokenizer.vocab_file: gpt2-vocab.json + model.tokenizer.merge_file: gpt2-merges.txt + model.data.data_impl: mock + model.data.data_prefix: [] + model.data.splits_string: 98,1,1 + + exp_manager.resume_if_exists: false + exp_manager.create_checkpoint_callback: false + exp_manager.create_dllogger_logger: false + exp_manager.checkpoint_callback_params.save_top_k: 1 + exp_manager.checkpoint_callback_params.model_parallel_size: ${multiply:$\{model.tensor_model_parallel_size}, $\{model.pipeline_model_parallel_size}} + exp_manager.exp_dir: '{experiment_run_dir}' + # Potentially need to be modified gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool sysnet_subnet_prefix: a3u-gke-gcs-sub gpu_subnet_prefix: a3u-gke-gcs-rdma-sub cluster_queue: a3u + env_vars: set: CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 @@ -168,7 +193,6 @@ data: - paths: LD_LIBRARY_PATH: /usr/local/gib/lib64 - applications: py-nemo: workloads: @@ -176,106 +200,44 @@ data: experiments: mixtral-{n_nodes}-nodes: variables: - n_nodes: [8,16,32] + n_nodes: [8, 16, 32] jobset_name: 'm8x7b-{n_nodes}' nemo_stage: training nemo_model: mixtral nemo_config_name: mixtral_8x7b - max_steps: 10 - trainer.max_steps: '{max_steps}' - trainer.val_check_interval: 50 - trainer.log_every_n_steps: 1 - trainer.enable_model_summary: false - - model.tokenizer.library: megatron - model.tokenizer.type: GPT2BPETokenizer - model.tokenizer.model: null - model.tokenizer.delimiter : null - model.tokenizer.vocab_file: gpt2-vocab.json - model.tokenizer.merge_file: gpt2-merges.txt - - exp_manager.exp_dir: '{experiment_run_dir}' - - run.time_limit: 01:00:00 - - model.data.data_impl: mock - model.data.data_prefix: [] - model.data.splits_string: 90,8,2 model.data.num_workers: 4 - - model.optim.contiguous_grad_buffer: true - model.optim.contiguous_param_buffer: true - + model.fp8_params: true + model.gc_interval: 0 model.global_batch_size: 1024 model.micro_batch_size: 2 - model.virtual_pipeline_model_parallel_size: null + model.moe_grouped_gemm: false + model.optim.contiguous_grad_buffer: true + model.optim.contiguous_param_buffer: true model.pipeline_model_parallel_size: 1 - model.gc_interval: 100 - model.fp8_params: true - - model.nsys_profile.ranks: [0, 8] - model.nsys_profile.start_step: 27 - model.nsys_profile.end_step: 29 - - # Checkpoint saving & logging - exp_manager.resume_if_exists: false - exp_manager.create_checkpoint_callback: false + model.virtual_pipeline_model_parallel_size: null llama3-{n_nodes}-nodes: variables: - n_nodes: [8,16,32] + n_nodes: [8, 16, 32] jobset_name: 
'llama-{n_nodes}' nemo_stage: training nemo_model: llama nemo_config_name: llama3_1_70b - max_steps: 10 - trainer.max_steps: '{max_steps}' - trainer.val_check_interval: 200 - trainer.log_every_n_steps: 1 - trainer.limit_val_batches: 5 - trainer.limit_test_batches: 5 - - model.tokenizer.library: megatron - model.tokenizer.type: GPT2BPETokenizer - model.tokenizer.model: null - model.tokenizer.delimiter : null - model.tokenizer.vocab_file: gpt2-vocab.json - model.tokenizer.merge_file: gpt2-merges.txt - - exp_manager.exp_dir: '{experiment_run_dir}' - exp_manager.checkpoint_callback_params.model_parallel_size: ${multiply:$\{model.tensor_model_parallel_size}, $\{model.pipeline_model_parallel_size}} - run.time_limit: 0-03:30:00 - - model.data.data_impl: mock - model.data.data_prefix: [] - model.data.splits_string: 90,8,2 - - model.optim.grad_sync_dtype: bf16 - - model.global_batch_size: 1024 - model.virtual_pipeline_model_parallel_size: 20 - model.tensor_model_parallel_size: 2 + model.data.num_workers: 2 model.context_parallel_size: 1 - model.fp8: true model.fp8_e4m3: true model.fp8_hybrid: true model.fp8_params: true - + model.global_batch_size: 1024 + model.optim.grad_sync_dtype: bf16 + model.tensor_model_parallel_size: 2 model.ub_tp_comm_overlap: false - - model.nsys_profile.ranks: [0, 8] - model.nsys_profile.start_step: 17 - model.nsys_profile.end_step: 19 - - # Checkpoint saving & logging - exp_manager.resume_if_exists: false - exp_manager.create_checkpoint_callback: false - exp_manager.create_dllogger_logger: false + model.virtual_pipeline_model_parallel_size: 20 internals: custom_executables: @@ -628,6 +590,9 @@ spec: printf "\n------- Analyzing benchmark logs ------\n" ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + printf "\n------- Archiving ramble workspace ------\n" + ramble workspace archive -t --where '{n_nodes} <= '"${N_NODES}" + printf "\n--------------- SUMMARY ---------------\n" jq -r '["nemo_config","n_nodes","step","train_step_timing"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | { @@ -637,10 +602,32 @@ spec: Context: $context.name } + ($context.foms | from_entries ) - | select (.Context == "0-5/5") + | select (.Context == "0-10/10") | [.name, .n_nodes, .Context, .train_step_timing]) | @tsv' results.latest.json printf "\n-------- Benchmarking Complete -------\n" + ARCHIVE_TAR=$(readlink archive/archive.latest.tar.gz) + ARCHIVE_PATH=${RAMBLE_WORKSPACE}/archive/${ARCHIVE_TAR} + RESULTS_FILE=$(basename $(readlink results.latest.json)) + RESULTS_PATH=${RAMBLE_WORKSPACE}/${RESULTS_FILE} + + printf "\n# To copy the full results from container:\n" + printf "kubectl cp %s:%s %s\n" $(hostname) ${RESULTS_PATH} ${RESULTS_FILE} + printf "\n# To copy the ramble workspace archive from container:\n" + printf "kubectl cp %s:%s ./%s\n" $(hostname) ${ARCHIVE_PATH} ${ARCHIVE_TAR} + + printf "\n# To re-activate ramble workspace, first access runner:\n" + printf "kubectl exec -it %s -- /bin/bash\n" $(hostname) + printf "# Then run:\n" + printf "cd ${RAMBLE_WORKSPACE}\n" + printf "source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate\n" + printf ". 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh\n" + printf "ramble workspace activate .\n" + + printf "\n- Sleeping for 1 day to allow introspection -\n" + sleep 86400 + + restartPolicy: Never backoffLimit: 4 From 8e823412e0a017bc7d716116239d34750feb64a2 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 18 Jan 2025 23:25:12 +0000 Subject: [PATCH 12/15] Update kueue config to best practices --- .../a3u-gke-gcs/a3u-gke-gcs.yaml | 3 ++- .../kueue-configuration.yaml.tftpl | 24 ++++++++++++++----- .../system_benchmarks/ramble-hpl.yaml | 3 ++- .../system_benchmarks/ramble-nccl.yaml | 3 ++- .../system_benchmarks/ramble-nemo.yaml | 3 ++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index ce6e3a34f0..7bc4e79d26 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -186,7 +186,8 @@ deployment_groups: version: v0.10.0 config_path: $(ghpc_stage("kueue-configuration.yaml.tftpl")) config_template_vars: - num_gpus: $(vars.num_gpus) + num_gpus: $(a3-ultragpu-pool.static_gpu_count) + node_pool_name: $(a3-ultragpu-pool.node_pool_name) jobset: install: true version: v0.7.2 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl index 97cbaede33..1beffe206d 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl +++ b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl @@ -26,26 +26,38 @@ spec: kind: ResourceFlavor apiVersion: kueue.x-k8s.io/v1beta1 metadata: - name: "a3u" + name: "a3-ultra-tas" spec: nodeLabels: - cloud.google.com/gke-nodepool: "a3-ultragpu-8g-a3-ultragpu-pool" + cloud.google.com/gke-nodepool: ${node_pool_name} topologyName: "gke-default" tolerations: - key: "nvidia.com/gpu" operator: "Exists" effect: NoSchedule --- +kind: ResourceFlavor +apiVersion: kueue.x-k8s.io/v1beta1 +metadata: + name: "a3-ultra" +spec: + nodeLabels: + cloud.google.com/gke-nodepool: ${node_pool_name} +--- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: "a3u" + name: "a3-ultra" spec: namespaceSelector: {} # match all. 
   resourceGroups:
   - coveredResources: ["nvidia.com/gpu"]
     flavors:
-    - name: "a3u"
+    - name: "a3-ultra"
+      resources:
+      - name: "nvidia.com/gpu"
+        nominalQuota: ${num_gpus}
+    - name: "a3-ultra-tas"
       resources:
       - name: "nvidia.com/gpu"
         nominalQuota: ${num_gpus}
@@ -54,6 +66,6 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "default"
-  name: "a3u"
+  name: "a3-ultra"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
index bfeeecfbdb..93e1d57462 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -140,7 +140,7 @@ data:
       gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool
       sysnet_subnet_prefix: a3u-gke-gcs-sub
       gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
-      cluster_queue: a3u
+      cluster_queue: a3-ultra-tas

       env_vars:
         set:
@@ -278,6 +278,7 @@ data:
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: {gke_nodepool}
+            cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
           tolerations:
           - key: cloud.google.com/gke-queued
             effect: NoSchedule
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
index e28e2ad1d9..9abc8afc60 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
@@ -137,7 +137,7 @@ data:
       gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool
       sysnet_subnet_prefix: a3u-gke-gcs-sub
       gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
-      cluster_queue: a3u
+      cluster_queue: a3-ultra-tas

       env_vars:
         set:
@@ -256,6 +256,7 @@ data:
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: {gke_nodepool}
+            cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
           tolerations:
           - key: cloud.google.com/gke-queued
             effect: NoSchedule
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
index 90ac428240..3ea6338ed0 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
@@ -174,7 +174,7 @@ data:
      gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool
       sysnet_subnet_prefix: a3u-gke-gcs-sub
       gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
-      cluster_queue: a3u
+      cluster_queue: a3-ultra-tas

       env_vars:
         set:
@@ -330,6 +330,7 @@ data:
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: {gke_nodepool}
+            cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
           tolerations:
           - key: cloud.google.com/gke-queued
             effect: NoSchedule

From 642239f45794300d40c74989dd201f664b2299c0 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Sat, 18 Jan 2025 23:40:48 +0000
Subject: [PATCH 13/15] Remove manual num_gpus

---
 examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 4 ----
 examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml  | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
index 7bc4e79d26..a38a901f71 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
@@ -32,10 +32,6 @@ vars:
   # The number of nodes to be created
   static_node_count:

-  # Number of H200 GPUs (for later use by Kueue), which
-  # This should be 8 x static_node_count.
-  num_gpus:
-
   # Cidr block containing the IP of the machine calling terraform.
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
   # To allow only your IP address, use /32
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
index 48b8b200d8..a47ac61692 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
@@ -35,10 +35,6 @@ vars:
   # The number of nodes to be created
   static_node_count:

-  # Number of H200 GPUs (for later use by Kueue), which
-  # This should be 8 x static_node_count.
-  num_gpus:
-
   # Cidr block containing the IP of the machine calling terraform.
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
   # To allow only your IP address, use /32

From deaab229f0b1f9c061cdceb9ac2b9bbb0aed00a8 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Sun, 19 Jan 2025 00:29:38 +0000
Subject: [PATCH 14/15] Update reference workloads to new kueue config.

---
 .../a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml  | 4 ++--
 .../a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml | 6 +++---
 .../a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
index 93e1d57462..7ecff65571 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -23,9 +23,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "ramble"
-  name: "a3u"
+  name: "a3-ultra-tas"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
 ---
 apiVersion: v1
 kind: ServiceAccount
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
index 9abc8afc60..22cdd510ca 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
@@ -23,9 +23,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "ramble"
-  name: "a3u"
+  name: "a3-ultra-tas"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
 ---
 apiVersion: v1
 kind: ServiceAccount
@@ -488,7 +488,7 @@ spec:
           # Get number of GPUs / nodes available in this cluster from Kueue:
           AVAILABLE_GPUS=$(
             kubectl get clusterqueues.kueue.x-k8s.io -o json |
-              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") |
+              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name=="a3-ultra-tas") |
                 .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota'
           )

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
index 3ea6338ed0..a0d4cf0f9b 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
@@ -23,9 +23,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "ramble"
-  name: "a3u"
+  name: "a3-ultra-tas"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
 ---
 apiVersion: v1
 kind: ServiceAccount

From 456678c567ae0ca37c1bc9b1da063f606bb99a13 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Mon, 20 Jan 2025 14:52:38 +0000
Subject: [PATCH 15/15] One more fix for new kueue config

---
 .../a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml  | 2 +-
 .../a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
index 7ecff65571..0c2911abd5 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -515,7 +515,7 @@ spec:
           # Get number of GPUs / nodes available in this cluster from Kueue:
           AVAILABLE_GPUS=$(
             kubectl get clusterqueues.kueue.x-k8s.io -o json |
-              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") |
+              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name=="a3-ultra-tas") |
                 .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota'
           )
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
index a0d4cf0f9b..caf807d9b5 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
@@ -570,7 +570,7 @@ spec:
           # Get number of GPUs / nodes available in this cluster from Kueue:
           AVAILABLE_GPUS=$(
             kubectl get clusterqueues.kueue.x-k8s.io -o json |
-              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") |
+              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name=="a3-ultra-tas") |
                 .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota'
           )
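
Note on the resulting Kueue objects: after this series, the ClusterQueue is `a3-ultra` with two flavors (`a3-ultra`, plus the topology-aware `a3-ultra-tas`), the `default`-namespace LocalQueue is `a3-ultra`, and the benchmark (`ramble`-namespace) LocalQueue is `a3-ultra-tas`. The sketch below is a hand-run verification, not part of the patches; it assumes kubectl access to the deployed cluster, and it uses jq's strict `==` comparison on both `select` calls. A bare `=` in jq is an assignment that `select` treats as always true; the benchmark scripts get the right answer with `=` on the resource name only because each flavor lists exactly one resource.

```bash
# Confirm the renamed Kueue objects exist (names per patches 12-14).
kubectl get clusterqueue a3-ultra
kubectl get localqueue -n default a3-ultra
kubectl get localqueue -n ramble a3-ultra-tas  # only after a benchmark manifest is applied

# Read the nominal GPU quota of the topology-aware flavor, mirroring the
# AVAILABLE_GPUS query in the benchmark scripts but with '==' throughout.
kubectl get clusterqueues.kueue.x-k8s.io -o json |
  jq -r '.items[].spec.resourceGroups[].flavors[]
         | select(.name == "a3-ultra-tas")
         | .resources[]
         | select(.name == "nvidia.com/gpu")
         | .nominalQuota'
```

Since both flavors take their `nominalQuota` from the node pool's `static_gpu_count`, the last command should print 8 x static_node_count for the deployed pool.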