From 24b965c2fd87c77ed768581505a5405b3a1bf509 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 21 Dec 2024 05:45:43 +0000 Subject: [PATCH 01/15] Initial a3u-gke-gcs example --- .../a3u-gke-gcs/README.md | 109 +++++++ .../a3u-gke-gcs/a3u-gke-gcs.yaml | 282 ++++++++++++++++++ .../a3u-gke-gcs/deployment.yaml | 37 +++ .../kueue-configuration.yaml.tftpl | 28 ++ .../a3u-gke-gcs/nccl-rdma-installer.yaml | 96 ++++++ 5 files changed, 552 insertions(+) create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/README.md create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md new file mode 100644 index 0000000000..3a21f46e13 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -0,0 +1,109 @@ +# A3-Ultra GKE + GCS Reference Design + +This reference design provides a high-performance and scalable architecture for +deploying AI/ML workloads on Google Kubernetes Engine (GKE) with Google Cloud +Storage (GCS). + +## Key Features + +* **Multi-VPC Design:** Utilizes three VPCs: two for GKE nodes and one dedicated + for GPU RDMA networks. +* **Cloud Storage Fuse Integration:** Enables seamless access to GCS buckets + from within your containers using the Cloud Storage Fuse CSI Driver. Cloud + Storage Fuse is configured to utilize the 12 TB of Local SSD +* **Hierarchical Namespace Buckets:** Leverages GCS buckets with Hierarchical + Namespace enabled, optimizing performance for checkpointing and restarting + workloads. +* **Kueue for Workload Scheduling:** Provides a robust and flexible system for + managing your AI/ML training jobs. +* **Jobset API for Tightly Coupled Workloads:** Facilitates running tightly + coupled AI/ML training jobs efficiently. + +## Deployment Steps + +1. **Build the Cluster Toolkit `gcluster` binary:** + + Follow the instructions [here](https://cloud.google.com/cluster-toolkit/docs/setup/configure-environment). + +2. **(Optional) Create a GCS Bucket for Terraform State:** + + This step is recommended for storing your Terraform state. Use the + following commands, replacing placeholders with your project details: + + ```bash + BUCKET_NAME= + PROJECT_ID= + REGION= + + gcloud storage buckets create gs://${BUCKET_NAME} \ + --project=${PROJECT_ID} \ + --default-storage-class=STANDARD \ + --location=${REGION} \ + --uniform-bucket-level-access + + gcloud storage buckets update gs://${BUCKET_NAME} --versioning + ``` + +3. 
**Create and Configure GCS Buckets:**
+
+   * Create separate GCS buckets for training data and checkpoint/restart data:
+
+   ```bash
+   PROJECT_ID=
+   REGION=
+   TRAINING_BUCKET_NAME=
+   CHECKPOINT_BUCKET_NAME=
+   PROJECT_NUMBER=
+
+   gcloud storage buckets create gs://${TRAINING_BUCKET_NAME} \
+     --location=${REGION} \
+     --uniform-bucket-level-access \
+     --enable-hierarchical-namespace
+
+   gcloud storage buckets create gs://${CHECKPOINT_BUCKET_NAME} \
+     --location=${REGION} \
+     --uniform-bucket-level-access \
+     --enable-hierarchical-namespace
+   ```
+
+   * Grant workload identity service accounts (WI SAs) access to the buckets:
+
+   ```bash
+
+   gcloud storage buckets add-iam-policy-binding gs://${TRAINING_BUCKET_NAME} \
+     --member "principal://iam.googleapis.com/projects/${PROJECT_NUMBER}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/default/sa/default" \
+     --role roles/storage.objectUser
+
+   gcloud storage buckets add-iam-policy-binding gs://${CHECKPOINT_BUCKET_NAME} \
+     --member "principal://iam.googleapis.com/projects/${PROJECT_NUMBER}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/default/sa/default" \
+     --role roles/storage.objectUser
+   ```
+
+4. **Customize Deployment Configuration:**
+
+   Modify the `deployment.yaml` file to suit your needs. This includes the
+   region/zone, node pool sizes, reservation name, and checkpoint/training
+   bucket names.
+
+5. **Deploy the Cluster:**
+
+   Use the `gcluster` tool to deploy your GKE cluster with the desired configuration:
+
+   ```bash
+   gcluster deploy -d deployment.yaml a3u-gke-gcs.yaml
+   ```
+
+## Example Workload Job
+
+Once the cluster has been deployed, `gcluster` will print instructions on how
+to get credentials for the cluster, as well as how to deploy an example
+workload. This example workload uses [fio](https://github.com/axboe/fio) to
+run a series of benchmarks against the LocalSSD and GCSFuse mounts.
+
+The instructions will look something like:
+
+```bash
+Use the following commands to:
+Submit your job:
+  kubectl create -f /primary/my-job-.yaml
+```
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
new file mode 100644
index 0000000000..c5b88ab9ae
--- /dev/null
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
@@ -0,0 +1,282 @@
+# Copyright 2024 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+blueprint_name: a3u-gke-gcs
+
+vars:
+  project_id: # Insert GCP project
+  deployment_name: # Unique name of this cluster, like a3u-gke-gcs
+  region: # Region, e.g. europe-west1
+  zone: # Zone, e.g. europe-west1-b
+
+  # Cidr block containing the IP of the machine calling terraform and kubectl
+  # The value can be more specific if the IPs are known which will run kubectl
+  # e.g. the local system running Terraform or a remote node
+  authorized_cidr: 0.0.0.0/0
+  extended_reservation: # Reservation name, e.g. 
//reservationBlocks/ + + nccl_installer_path: $(ghpc_stage("./nccl-rdma-installer.yaml")) + mtu_size: 8896 + static_node_count: # Number of A3-Ultra nodes, e.g. 2 + # Number of H200 GPUs (for later use by Kueue), which + # should be 8 x `static_node_count` + num_gpus: + training_bucket_name: # Name of bucket that holds training data + checkpoint_bucket_name: # Name of bucket used for checkpoints + system_node_pool_disk_size_gb: 200 + a3ultra_node_pool_disk_size_gb: 100 + +deployment_groups: +- group: primary + modules: + - id: gke-a3-ultra-net-0 + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-net-0 + mtu: 8896 + subnetworks: + - subnet_name: $(vars.deployment_name)-sub-0 + subnet_region: $(vars.region) + subnet_ip: 192.168.0.0/18 + secondary_ranges_list: + - subnetwork_name: $(vars.deployment_name)-sub-0 + ranges: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke-a3-ultra-net-1 + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-net-1 + mtu: $(vars.mtu_size) + subnetworks: + - subnet_name: gke-a3u-gcs-sub-1 + subnet_region: $(vars.region) + subnet_ip: 192.168.64.0/18 + + - id: gke-a3-ultra-rdma-net + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe + settings: + network_name: $(vars.deployment_name)-rdma-net + mtu: $(vars.mtu_size) + network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce + network_routing_mode: REGIONAL + subnetworks_template: + name_prefix: $(vars.deployment_name)-rdma-sub + count: 8 + ip_range: 192.168.128.0/18 + region: $(vars.region) + + - id: a3-ultragpu-cluster + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b + use: [gke-a3-ultra-net-0] + settings: + release_channel: RAPID + system_node_pool_machine_type: "e2-standard-16" + system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb) + system_node_pool_taints: [] + enable_dcgm_monitoring: true + enable_gcsfuse_csi: true + enable_private_endpoint: false # Allows access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine to run the kubectl command. Required for multi network setup. 
+ display_name: "kubectl-access-network" + maintenance_exclusions: + - name: no-minor-or-node-upgrades-indefinite + start_time: "2024-12-01T00:00:00Z" + end_time: "2025-12-22T00:00:00Z" + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES + additional_networks: + $(concat( + [{ + network=gke-a3-ultra-net-1.network_name, + subnetwork=gke-a3-ultra-net-1.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="GVNIC", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }], + gke-a3-ultra-rdma-net.subnetwork_interfaces_gke + )) + outputs: [instructions] + + - id: a3-ultragpu-pool + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b + use: [a3-ultragpu-cluster] + settings: + machine_type: a3-ultragpu-8g + auto_upgrade: true + zones: [$(vars.zone)] + disk_type: hyperdisk-balanced + disk_size_gb: $(vars.a3ultra_node_pool_disk_size_gb) + static_node_count: $(vars.static_node_count) + guest_accelerator: + - type: nvidia-h200-141gb + count: 8 + gpu_driver_installation_config: + gpu_driver_version: "LATEST" + reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: $(vars.extended_reservation) + additional_networks: + $(concat( + [{ + network=gke-a3-ultra-net-1.network_name, + subnetwork=gke-a3-ultra-net-1.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="GVNIC", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }], + gke-a3-ultra-rdma-net.subnetwork_interfaces_gke + )) + outputs: [instructions] + + - id: topology-aware-scheduler-install + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b + use: [a3-ultragpu-cluster] + + # Install Kueue, Jobset, and NCCL installer + - id: workload-manager-install + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b + use: [a3-ultragpu-cluster] + settings: + kueue: + install: true + version: v0.9.1 + config_path: $(ghpc_stage("kueue-configuration.yaml.tftpl")) + config_template_vars: + num_gpus: $(vars.num_gpus) + jobset: + install: true + version: v0.7.1 + apply_manifests: + - source: $(vars.nccl_installer_path) + + # Create a remote mount of $(vars.training_bucket_name) + # using mount options optimized for reading training + # data. + - id: gcs-training + source: modules/file-system/pre-existing-network-storage + settings: + remote_mount: $(vars.training_bucket_name) + local_mount: /training-data + fs_type: gcsfuse + mount_options: "implicit-dirs, metadata-cache:ttl-secs:-1, metadata-cache:stat-cache-max-size-mb:-1, metadata-cache:type-cache-max-size-mb:-1, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:-1" + + # Create a remote mount of $(vars.checkpoint_bucket_name) + # using mount options optimized for writing and reading + # checkpoint data. 
+ - id: gcs-checkpointing + source: modules/file-system/pre-existing-network-storage + settings: + remote_mount: $(vars.checkpoint_bucket_name) + local_mount: /checkpoint-data + fs_type: gcsfuse + mount_options: "implicit-dirs, metadata-cache:ttl-secs:0, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:0, file-cache:enable-parallel-downloads:true, rename-dir-limit=200000" + + # Persistent Volume for training data + - id: training-pv + source: modules/file-system/gke-persistent-volume + use: [gcs-training, a3-ultragpu-cluster] + settings: + gcs_bucket_name: $(vars.training_bucket_name) + capacity_gb: 1000000 + + # Persistent Volume for checkpoint data + - id: checkpointing-pv + source: modules/file-system/gke-persistent-volume + use: [gcs-checkpointing, a3-ultragpu-cluster] + settings: + gcs_bucket_name: $(vars.checkpoint_bucket_name) + capacity_gb: 1000000 + + # This is an example job that will install and run an `fio` + # benchmark against the training and checkpointing buckets. + - id: fio-bench-job-template + source: modules/compute/gke-job-template + use: [checkpointing-pv, training-pv, a3-ultragpu-pool] + settings: + ephemeral_volumes: + - type: local-ssd + mount_path: /scratch-data + size_gb: 1000 # Use 1 out of 12 TB for local scratch + + k8s_service_account_name: default + image: ubuntu:latest + + command: + - bash + - -c + - | + + set -eux + export DEBIAN_FRONTEND=noninteractive + + # Install fio + apt update -y && apt install -y fio + + # Use a tag to create a unique path for tests + TAG=`date +%s` + + # Verify mountpoints + df -h + mountpoint /scratch-data + mountpoint /checkpoint-data + mountpoint /training-data + + # Create temporary directory for fio benchmarks + mkdir -p /{scratch,training,checkpoint}-data/fio-benchmarks-${TAG} + + # The following will take roughly 10 minutes to complete + + # Perform scratch data write performance test + fio --ioengine=libaio --filesize=10G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/scratch-data/fio-benchmarks-${TAG} \ + --name=scratch --blocksize=100m --iodepth=64 --readwrite=write + + # Perform training data reading performance test + fio --ioengine=libaio --filesize=1G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/training-data/fio-benchmarks-${TAG} \ + --name=training --blocksize=1m --iodepth=64 --readwrite=randread + + # Perform checkpoint data writing performance test + fio --ioengine=libaio --filesize=10G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/checkpoint-data/fio-benchmarks-${TAG} \ + --name=checkpoint --blocksize=100m --iodepth=64 --readwrite=write + + # Perform checkpoint data reading performance test + fio --ioengine=libaio --filesize=10G --ramp_time=2s --runtime=1m \ + --numjobs=32 --create_serialize=0 --direct=1 --verify=0 \ + --randrepeat=0 --group_reporting --directory=/checkpoint-data/fio-benchmarks-${TAG} \ + --name=checkpoint --blocksize=100m --iodepth=64 --readwrite=read + + # Clean up temporary directories for fio benchmarks + rm -rf /{scratch-training,checkpoint}-data/fio-benchmarks-${TAG} + + outputs: [instructions] diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml new file mode 100644 index 
0000000000..0d9966b3e0 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml @@ -0,0 +1,37 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform_backend_defaults: + type: gcs + configuration: + bucket: + +vars: + project_id: + # This should be unique across all of your Cluster + # Toolkit Deployments. + deployment_name: a3u-gke-gcs + region: + zone: + static_node_count: + # This should be 8 x static_node_count. + num_gpus: + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + # e.g. the local system running Terraform or a remote node + # To allow all (IAM restrictions still enforced), use 0.0.0.0/0 + authorized_cidr: + extended_reservation: + training_bucket_name: + checkpoint_bucket_name: diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl new file mode 100644 index 0000000000..97ae9d91f6 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl @@ -0,0 +1,28 @@ +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: "default-flavor" +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: "cluster-queue" +spec: + namespaceSelector: {} + resourceGroups: + - coveredResources: ["nvidia.com/gpu"] + flavors: + - name: "default-flavor" + resources: + - name: "nvidia.com/gpu" + nominalQuota: ${num_gpus} + +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "default" + name: "local-queue" +spec: + clusterQueue: "cluster-queue" diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml new file mode 100644 index 0000000000..486255755a --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml @@ -0,0 +1,96 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
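+
+# A note on what this DaemonSet does: on every node carrying H200 GPUs it
+# copies the gIB NCCL plugin and RDMA libraries from the installer image onto
+# the host (under /home/kubernetes/bin), where workload pods can mount them,
+# and an init container quiets log_martians on the RDMA interfaces.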
+ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nccl-rdma-installer + namespace: kube-system + labels: + k8s-app: nccl-rdma-installer +spec: + selector: + matchLabels: + k8s-app: nccl-rdma-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nccl-rdma-installer + k8s-app: nccl-rdma-installer + spec: + priorityClassName: system-node-critical + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: In + values: + - nvidia-h200-141gb + tolerations: + - operator: "Exists" + hostNetwork: true + hostPID: true + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia/lib64 + type: DirectoryOrCreate + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + initContainers: + - name: disable-log-martian + image: alpine:latest + command: ["/bin/sh"] + securityContext: + privileged: true + args: + - -c + - | + sysctl -w net.ipv4.conf.eth2.log_martians=0 + sysctl -w net.ipv4.conf.eth3.log_martians=0 + sysctl -w net.ipv4.conf.eth4.log_martians=0 + sysctl -w net.ipv4.conf.eth5.log_martians=0 + sysctl -w net.ipv4.conf.eth6.log_martians=0 + sysctl -w net.ipv4.conf.eth7.log_martians=0 + sysctl -w net.ipv4.conf.eth8.log_martians=0 + sysctl -w net.ipv4.conf.eth9.log_martians=0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: nccl-rdma-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64 + - name: gib + mountPath: /usr/local/home/kubernetes/bin/gib + command: ["/bin/sh", "-c"] + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 + cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib + ibv_devinfo || exit 1 + echo "installation finishes" + containers: + - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" + name: pause From 437027a50fde0c1f8b5936f7df291fdd3d765868 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 21 Dec 2024 07:35:22 +0000 Subject: [PATCH 02/15] Pass mount_options through to GKE PV --- modules/file-system/gke-persistent-volume/main.tf | 11 +++++++---- .../gke-persistent-volume/templates/gcs-pv.yaml.tftpl | 6 +++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index d12c5d6d39..34602b32aa 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -35,6 +35,8 @@ locals { pv_name = "${local.base_name}-pv" pvc_name = "${local.base_name}-pvc" + list_mount_options = split(",", var.network_storage.mount_options) + filestore_pv_contents = templatefile( "${path.module}/templates/filestore-pv.yaml.tftpl", { @@ -61,10 +63,11 @@ locals { gcs_pv_contents = templatefile( "${path.module}/templates/gcs-pv.yaml.tftpl", { - pv_name = local.pv_name - capacity = "${var.capacity_gb}Gi" - labels = local.labels - bucket_name = local.is_gcs ? var.gcs_bucket_name : "" + pv_name = local.pv_name + capacity = "${var.capacity_gb}Gi" + labels = local.labels + mount_options = local.is_gcs ? local.list_mount_options : null + bucket_name = local.is_gcs ? 
var.gcs_bucket_name : "" } ) diff --git a/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl b/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl index 5a1fde209e..aa0e570a8b 100644 --- a/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl +++ b/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl @@ -13,8 +13,12 @@ spec: storage: ${capacity} accessModes: - ReadWriteMany + %{~ if mount_options != null ~} mountOptions: - - implicit-dirs + %{~ for key in mount_options ~} + - ${key} + %{~ endfor ~} + %{~ endif ~} csi: driver: gcsfuse.csi.storage.gke.io volumeHandle: ${bucket_name} From 2b82d68ec934e0ac046934814cac70bdee4f900a Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 21 Dec 2024 07:44:37 +0000 Subject: [PATCH 03/15] Use folded style for mount options --- .../a3u-gke-gcs/a3u-gke-gcs.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index c5b88ab9ae..131b6590bf 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -184,7 +184,14 @@ deployment_groups: remote_mount: $(vars.training_bucket_name) local_mount: /training-data fs_type: gcsfuse - mount_options: "implicit-dirs, metadata-cache:ttl-secs:-1, metadata-cache:stat-cache-max-size-mb:-1, metadata-cache:type-cache-max-size-mb:-1, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:-1" + mount_options: >- + implicit-dirs, + metadata-cache:ttl-secs:-1, + metadata-cache:stat-cache-max-size-mb:-1, + metadata-cache:type-cache-max-size-mb:-1, + file-cache:max-size-mb:-1, + file-cache:cache-file-for-range-read:true, + file-system:kernel-list-cache-ttl-secs:-1 # Create a remote mount of $(vars.checkpoint_bucket_name) # using mount options optimized for writing and reading @@ -195,7 +202,14 @@ deployment_groups: remote_mount: $(vars.checkpoint_bucket_name) local_mount: /checkpoint-data fs_type: gcsfuse - mount_options: "implicit-dirs, metadata-cache:ttl-secs:0, file-cache:max-size-mb:-1, file-cache:cache-file-for-range-read:true, file-system:kernel-list-cache-ttl-secs:0, file-cache:enable-parallel-downloads:true, rename-dir-limit=200000" + mount_options: >- + implicit-dirs, + metadata-cache:ttl-secs:0, + file-cache:max-size-mb:-1, + file-cache:cache-file-for-range-read:true, + file-system:kernel-list-cache-ttl-secs:0, + file-cache:enable-parallel-downloads:true, + rename-dir-limit=200000 # Persistent Volume for training data - id: training-pv From f0b877be55e16677d372b3de06f096bdef25478c Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Mon, 6 Jan 2025 22:52:27 +0000 Subject: [PATCH 04/15] Address comments/defaults, update nccl-plugin version --- .../a3u-gke-gcs/README.md | 2 +- .../a3u-gke-gcs/a3u-gke-gcs.yaml | 67 +++++++++++++------ .../a3u-gke-gcs/deployment.yaml | 22 +++++- .../kueue-configuration.yaml.tftpl | 47 ++++++++++--- .../a3u-gke-gcs/nccl-rdma-installer.yaml | 2 +- 5 files changed, 106 insertions(+), 34 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md index 3a21f46e13..7669903cf0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -13,7 +13,7 @@ Storage (GCS). 
Storage Fuse is configured to utilize the 12 TB of Local SSD
 * **Hierarchical Namespace Buckets:** Leverages GCS buckets with Hierarchical
   Namespace enabled, optimizing performance for checkpointing and restarting
-  workloads.
+  workloads. (Requires GKE 1.31 or later).
 * **Kueue for Workload Scheduling:** Provides a robust and flexible system for
   managing your AI/ML training jobs.
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
index 131b6590bf..8c844dfd5b 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
@@ -15,28 +15,46 @@
 blueprint_name: a3u-gke-gcs
 
 vars:
-  project_id: # Insert GCP project
-  deployment_name: # Unique name of this cluster, like a3u-gke-gcs
-  region: # Region, e.g. europe-west1
-  zone: # Zone, e.g. europe-west1-b
+  # The following variables should be over-written in the deployment.yaml file.
+  # Your GCP Project ID
+  project_id:
 
-  # Cidr block containing the IP of the machine calling terraform and kubectl
-  # The value can be more specific if the IPs are known which will run kubectl
-  # e.g. the local system running Terraform or a remote node
-  authorized_cidr: 0.0.0.0/0
-  extended_reservation: # Reservation name, e.g. //reservationBlocks/
+  # This should be unique across all of your Cluster
+  # Toolkit Deployments.
+  deployment_name: a3u-gke-gcs
+
+  # The GCP Region used for this deployment.
+  region:
+
+  # The GCP Zone used for this deployment.
+  zone:
+
+  # The number of nodes to be created
+  static_node_count:
 
-  nccl_installer_path: $(ghpc_stage("./nccl-rdma-installer.yaml"))
-  mtu_size: 8896
-  static_node_count: # Number of A3-Ultra nodes, e.g. 2
   # Number of H200 GPUs (for later use by Kueue), which
-  # should be 8 x `static_node_count`
+  # should be 8 x static_node_count.
   num_gpus:
-  training_bucket_name: # Name of bucket that holds training data
-  checkpoint_bucket_name: # Name of bucket used for checkpoints
+
+  # Cidr block containing the IP of the machine calling terraform. 
+ # To allow all (IAM restrictions still enforced), use 0.0.0.0/0 + # To allow only your IP address, use /32 + authorized_cidr: + + # The name of the compute engine reservation of A3-Ultra nodes in the form of + # //reservationBlocks/ + extended_reservation: + + # The name of the GCS bucket used for training data + training_bucket_name: + + # The following variables do not need to be modified + nccl_installer_path: $(ghpc_stage("./nccl-rdma-installer.yaml")) + mtu_size: 8896 system_node_pool_disk_size_gb: 200 a3ultra_node_pool_disk_size_gb: 100 + deployment_groups: - group: primary modules: @@ -68,7 +86,7 @@ deployment_groups: subnet_ip: 192.168.64.0/18 - id: gke-a3-ultra-rdma-net - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc settings: network_name: $(vars.deployment_name)-rdma-net mtu: $(vars.mtu_size) @@ -81,7 +99,7 @@ deployment_groups: region: $(vars.region) - id: a3-ultragpu-cluster - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster use: [gke-a3-ultra-net-0] settings: release_channel: RAPID @@ -118,7 +136,7 @@ deployment_groups: outputs: [instructions] - id: a3-ultragpu-pool - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool use: [a3-ultragpu-cluster] settings: machine_type: a3-ultragpu-8g @@ -155,17 +173,17 @@ deployment_groups: outputs: [instructions] - id: topology-aware-scheduler-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler use: [a3-ultragpu-cluster] # Install Kueue, Jobset, and NCCL installer - id: workload-manager-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply use: [a3-ultragpu-cluster] settings: kueue: install: true - version: v0.9.1 + version: v0.10.0 config_path: $(ghpc_stage("kueue-configuration.yaml.tftpl")) config_template_vars: num_gpus: $(vars.num_gpus) @@ -233,6 +251,11 @@ deployment_groups: source: modules/compute/gke-job-template use: [checkpointing-pv, training-pv, a3-ultragpu-pool] settings: + + # By adding an ephemeral volume, this will ensure that the job adds: + # nodeSelector: + # cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + # which is the best practice for using local-ssd for ephemeral storage. 
ephemeral_volumes:
       - type: local-ssd
         mount_path: /scratch-data
         size_gb: 1000 # Use 1 out of 12 TB for local scratch
@@ -291,6 +314,6 @@ deployment_groups:
           --name=checkpoint --blocksize=100m --iodepth=64 --readwrite=read
 
           # Clean up temporary directories for fio benchmarks
-          rm -rf /{scratch-training,checkpoint}-data/fio-benchmarks-${TAG}
+          rm -rf /{scratch,training,checkpoint}-data/fio-benchmarks-${TAG}
 
   outputs: [instructions]
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
index 0d9966b3e0..48b8b200d8 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
@@ -15,23 +15,41 @@
 terraform_backend_defaults:
   type: gcs
   configuration:
+    # The GCS bucket used for storing terraform state
     bucket:
 
 vars:
+  # Your GCP Project ID
   project_id:
+
   # This should be unique across all of your Cluster
   # Toolkit Deployments.
   deployment_name: a3u-gke-gcs
+
+  # The GCP Region used for this deployment.
   region:
+
+  # The GCP Zone used for this deployment.
   zone:
+
+  # The number of nodes to be created
   static_node_count:
+
+  # Number of H200 GPUs (for later use by Kueue).
   # This should be 8 x static_node_count.
   num_gpus:
+
   # Cidr block containing the IP of the machine calling terraform.
-  # The following line must be updated for this example to work.
-  # e.g. the local system running Terraform or a remote node
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
+  # To allow only your IP address, use /32
   authorized_cidr:
+
+  # The name of the compute engine reservation of A3-Ultra nodes in the form of
+  # //reservationBlocks/
   extended_reservation:
+
+  # The name of the GCS bucket used for training data
   training_bucket_name:
+
+  # The name of the GCS bucket used for checkpoint/restart data.
   checkpoint_bucket_name:
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl
index 97ae9d91f6..97cbaede33 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl
@@ -1,28 +1,59 @@
+# Copyright 2024 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
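+
+# The Topology resource below uses Kueue's (alpha) Topology Aware Scheduling
+# API: each level maps a node label to one tier of the physical network
+# hierarchy (block > sub-block > host > node), letting Kueue place the pods
+# of a job on topologically close nodes.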
+ +apiVersion: kueue.x-k8s.io/v1alpha1 +kind: Topology +metadata: + name: "gke-default" +spec: + levels: + - nodeLabel: "cloud.google.com/gce-topology-block" + - nodeLabel: "cloud.google.com/gce-topology-subblock" + - nodeLabel: "cloud.google.com/gce-topology-host" + - nodeLabel: "kubernetes.io/hostname" --- -apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor +apiVersion: kueue.x-k8s.io/v1beta1 metadata: - name: "default-flavor" + name: "a3u" +spec: + nodeLabels: + cloud.google.com/gke-nodepool: "a3-ultragpu-8g-a3-ultragpu-pool" + topologyName: "gke-default" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: NoSchedule --- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: "cluster-queue" + name: "a3u" spec: - namespaceSelector: {} + namespaceSelector: {} # match all. resourceGroups: - coveredResources: ["nvidia.com/gpu"] flavors: - - name: "default-flavor" + - name: "a3u" resources: - name: "nvidia.com/gpu" nominalQuota: ${num_gpus} - --- apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: namespace: "default" - name: "local-queue" + name: "a3u" spec: - clusterQueue: "cluster-queue" + clusterQueue: "a3u" diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml index 486255755a..092ba1baf3 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml @@ -70,7 +70,7 @@ spec: sysctl -w net.ipv4.conf.eth7.log_martians=0 sysctl -w net.ipv4.conf.eth8.log_martians=0 sysctl -w net.ipv4.conf.eth9.log_martians=0 - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3 name: nccl-rdma-installer resources: requests: From 02068fae5144ac6851a3016481f2f9bc7d59de06 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Tue, 7 Jan 2025 18:30:48 +0000 Subject: [PATCH 05/15] Fix sources for modules --- .../hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index 8c844dfd5b..a6c3489c89 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -86,7 +86,7 @@ deployment_groups: subnet_ip: 192.168.64.0/18 - id: gke-a3-ultra-rdma-net - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc + source: modules/network/gpu-rdma-vpc settings: network_name: $(vars.deployment_name)-rdma-net mtu: $(vars.mtu_size) @@ -99,7 +99,7 @@ deployment_groups: region: $(vars.region) - id: a3-ultragpu-cluster - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [gke-a3-ultra-net-0] settings: release_channel: RAPID @@ -136,7 +136,7 @@ deployment_groups: outputs: [instructions] - id: a3-ultragpu-pool - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [a3-ultragpu-cluster] settings: machine_type: a3-ultragpu-8g @@ -173,12 +173,12 @@ deployment_groups: outputs: [instructions] - id: topology-aware-scheduler-install - source: 
github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler + source: community/modules/compute/gke-topology-scheduler use: [a3-ultragpu-cluster] # Install Kueue, Jobset, and NCCL installer - id: workload-manager-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply + source: modules/management/kubectl-apply use: [a3-ultragpu-cluster] settings: kueue: From 3671f74764376ab5255da9cd8e3dc2eab4e76292 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 8 Jan 2025 00:03:31 +0000 Subject: [PATCH 06/15] Add link to CSI docs --- examples/hypercompute_clusters/a3u-gke-gcs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md index 7669903cf0..4040151504 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -9,8 +9,8 @@ Storage (GCS). * **Multi-VPC Design:** Utilizes three VPCs: two for GKE nodes and one dedicated for GPU RDMA networks. * **Cloud Storage Fuse Integration:** Enables seamless access to GCS buckets - from within your containers using the Cloud Storage Fuse CSI Driver. Cloud - Storage Fuse is configured to utilize the 12 TB of Local SSD + from within your containers using the [Cloud Storage Fuse CSI Driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver). + Cloud Storage Fuse is configured to utilize the 12 TB of Local SSD * **Hierarchical Namespace Buckets:** Leverages GCS buckets with Hierarchical Namespace enabled, optimizing performance for checkpointing and restarting workloads. (Requires GKE 1.31 or later). 
From 9e465d2951734cda4c1c400e2d816009c5464b13 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 8 Jan 2025 00:11:47 +0000 Subject: [PATCH 07/15] Add note about parallel downloads --- examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index a6c3489c89..0020e31c7e 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -202,6 +202,9 @@ deployment_groups: remote_mount: $(vars.training_bucket_name) local_mount: /training-data fs_type: gcsfuse + # In addition to the mount options below, if the dataset is a large + # compressed files, `file-cache:enable-parallel-downloads:true` can boost + # perf (at the cost of each node pulling the file to each node's lssd) mount_options: >- implicit-dirs, metadata-cache:ttl-secs:-1, From dfde71987bf69b2dace91b66fb9b4a5d4e21fa79 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 8 Jan 2025 00:14:58 +0000 Subject: [PATCH 08/15] Remove parallel download note --- examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index 0020e31c7e..a6c3489c89 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -202,9 +202,6 @@ deployment_groups: remote_mount: $(vars.training_bucket_name) local_mount: /training-data fs_type: gcsfuse - # In addition to the mount options below, if the dataset is a large - # compressed files, `file-cache:enable-parallel-downloads:true` can boost - # perf (at the cost of each node pulling the file to each node's lssd) mount_options: >- implicit-dirs, metadata-cache:ttl-secs:-1, From d9e3f7b9d06687e2c0b673068659aff7ed5cd933 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Thu, 16 Jan 2025 21:30:08 +0000 Subject: [PATCH 09/15] Adding Ramble based system benchmarks Also adopt latest nccl-rdma installer. 
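
A sketch of the intended flow, using the runner job names introduced in the
README added below:

    kubectl apply -f ramble-nccl.yaml                 # submit the NCCL suite
    kubectl -n ramble logs -f job/ramble-nccl-runner  # follow the controller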
--- .../a3u-gke-gcs/README.md | 5 + .../a3u-gke-gcs/a3u-gke-gcs.yaml | 4 +- .../a3u-gke-gcs/nccl-rdma-installer.yaml | 17 +- .../a3u-gke-gcs/system_benchmarks/README.md | 132 ++++ .../system_benchmarks/ramble-hpl.yaml | 553 +++++++++++++++ .../system_benchmarks/ramble-nccl.yaml | 526 ++++++++++++++ .../system_benchmarks/ramble-nemo.yaml | 646 ++++++++++++++++++ 7 files changed, 1872 insertions(+), 11 deletions(-) create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml create mode 100644 examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/README.md index 4040151504..37ca752ee0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/README.md @@ -107,3 +107,8 @@ Use the following commands to: Submit your job: kubectl create -f /primary/my-job-.yaml ``` + +## Running System Benchmarks with Ramble + +To run a series of NCCL, HPL, and NeMo test benchmarks on your cluster, see +`system_benchmarks/README.md`. diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index a6c3489c89..ce6e3a34f0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -81,7 +81,7 @@ deployment_groups: network_name: $(vars.deployment_name)-net-1 mtu: $(vars.mtu_size) subnetworks: - - subnet_name: gke-a3u-gcs-sub-1 + - subnet_name: $(vars.deployment_name)-sub-1 subnet_region: $(vars.region) subnet_ip: 192.168.64.0/18 @@ -189,7 +189,7 @@ deployment_groups: num_gpus: $(vars.num_gpus) jobset: install: true - version: v0.7.1 + version: v0.7.2 apply_manifests: - source: $(vars.nccl_installer_path) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml index 092ba1baf3..1186759a7b 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/nccl-rdma-installer.yaml @@ -62,14 +62,14 @@ spec: args: - -c - | - sysctl -w net.ipv4.conf.eth2.log_martians=0 - sysctl -w net.ipv4.conf.eth3.log_martians=0 - sysctl -w net.ipv4.conf.eth4.log_martians=0 - sysctl -w net.ipv4.conf.eth5.log_martians=0 - sysctl -w net.ipv4.conf.eth6.log_martians=0 - sysctl -w net.ipv4.conf.eth7.log_martians=0 - sysctl -w net.ipv4.conf.eth8.log_martians=0 - sysctl -w net.ipv4.conf.eth9.log_martians=0 + sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0 + sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0 - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3 name: nccl-rdma-installer resources: @@ -89,7 +89,6 @@ spec: /scripts/container_entry.sh install --install-nccl cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 cp -r /var/lib/gib/. 
/usr/local/home/kubernetes/bin/gib
-          ibv_devinfo || exit 1
           echo "installation finishes"
       containers:
       - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
         name: pause
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md
new file mode 100644
index 0000000000..63bce8c2db
--- /dev/null
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md
@@ -0,0 +1,132 @@
+Running System Benchmarks with Ramble
+=====================================
+
+[Ramble](https://github.com/GoogleCloudPlatform/ramble) is an open source
+multi-platform experimentation framework written in Python. It can be used
+to easily reproduce benchmark results across systems, and here
+we will use it to run a series of system benchmarks.
+
+Currently the following benchmarks are supported:
+
+* NCCL tests (all-gather, all-reduce, reduce-scatter)
+* HPL-NVIDIA
+* Mixtral 8x7b and Llama 3.1 70B via NeMo
+
+All benchmarks use Kueue for topology aware scheduling, and use JobSet
+to orchestrate multi-node workloads.
+
+For NCCL tests, run:
+
+  ```bash
+  kubectl apply -f ramble-nccl.yaml
+  ```
+
+For HPL tests, run:
+
+  ```bash
+  kubectl apply -f ramble-hpl.yaml
+  ```
+
+For NeMo tests, run:
+
+  ```bash
+  kubectl apply -f ramble-nemo.yaml
+  ```
+
+Where applicable, the NeMo workload configurations have been chosen to
+reproduce those found in
+[AI-Hypercomputer/gpu-recipes](https://github.com/AI-Hypercomputer/gpu-recipes).
+
+For any of the above, the following will be created:
+
+* A `ramble` namespace in your K8s cluster
+* A Kueue `LocalQueue` in the `ramble` namespace.
+* A "ramble" service account (and associated RBAC configs) that has access to
+  the core, batch, jobset, and kueue apis in the `ramble` namespace, as well as
+  read access to the kueue "clusterqueues" resources across the cluster.
+* ConfigMaps containing various scripts/configurations.
+* A K8s `Job` that works as the ramble controller process, which creates a
+  series of `JobSet` objects for each individual benchmark.
+
+Applying one of these manifests first creates a K8s job called
+"ramble-{nccl,hpl,nemo}-runner". This controller job orchestrates the running
+and analysis of the benchmarks. It installs everything it needs within a
+self-contained pod, creates an ssh keypair for multi-node communication, and
+uses Ramble to create JobSets for each benchmark. Once those benchmarks
+are complete, it provides a summary of the results. Full benchmark logs can
+otherwise be found in the logs of the created JobSets/Jobs/Pods
+themselves.
+
+For each benchmark, multiple node scales will be submitted, up to the maximum
+node count of your cluster. This can be controlled with the `n_nodes` variable
+in the `ramble.yaml` configMap.
+
+Note: The following depends on several tightly coupled settings, in particular
+making sure that the subnet names in your GKE cluster match those defined in
+the "ramble.yaml" config file. If you modify the names of your subnets
+(including by changing the "deployment" name), then you will need to modify
+the K8s yaml files. 
Specifically, the following variables may need to be
+modified in the `ramble.yaml` configmap in each of the
+ramble-{nccl,hpl,nemo}.yaml files:
+
+    gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool # The nodepool name
+    sysnet_subnet_prefix: a3u-gke-gcs-sub
+    gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
+    cluster_queue: a3u
+
+Expected Results
+----------------
+
+For ramble-nccl.yaml, at the end of the logs of the created `ramble-nccl-runner`
+job, you should see something like:
+
+  ```bash
+  kubectl -n ramble logs job/ramble-nccl-runner
+  ...
+  ---- SUMMARY for >1GB Message Sizes ----
+  workload        n_nodes  msg_size     busbw
+  all-gather      2        1073741824   XXX.XX
+  all-gather      2        2147483648   XXX.XX
+  all-gather      2        4294967296   XXX.XX
+  all-gather      2        8589934592   XXX.XX
+  ...
+  all-reduce      2        1073741824   XXX.XX
+  ...
+  reduce-scatter  2        1073741824   XXX.XX
+  ...
+
+  -------- Benchmarking Complete -------
+  ```
+
+Similarly, for ramble-hpl.yaml:
+
+  ```bash
+  kubectl -n ramble logs job/ramble-hpl-runner
+  ...
+  --------------- SUMMARY ---------------
+  workload    n_nodes  GFlop/s    GFlops/s/GPU
+  calculator  1        X.XXXe+05  X.XXXe+04
+  calculator  2        X.XXXe+05  X.XXXe+04
+  calculator  4        X.XXXe+06  X.XXXe+04
+  calculator  8        X.XXXe+06  X.XXXe+04
+
+  -------- Benchmarking Complete -------
+  ```
+
+And for ramble-nemo.yaml:
+
+  ```bash
+  kubectl -n ramble logs job/ramble-nemo-runner
+  ...
+  --------------- SUMMARY ---------------
+  nemo_config   n_nodes  step     train_step_timing
+  mixtral_8x7b  8        0-10/10  XX.XX
+  llama3_1_70b  8        0-10/10  XX.XX
+
+  -------- Benchmarking Complete -------
+  ```
+
+Cleaning Up
+-----------
+
+To remove all resources created by these benchmarks, you can run:
+
+    kubectl delete -f ramble-nccl.yaml
+    kubectl delete -f ramble-hpl.yaml
+    kubectl delete -f ramble-nemo.yaml
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
new file mode 100644
index 0000000000..e5f5e4d2a9
--- /dev/null
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -0,0 +1,553 @@
+# Copyright 2025 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
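+
+# The resources below form a self-contained benchmark harness: a "ramble"
+# namespace and LocalQueue, RBAC for the runner's service account, ConfigMaps
+# holding the Ramble templates, and a runner Job that uses Ramble to generate
+# a JobSet per benchmark scale.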
+ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ramble + namespace: ramble +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "ramble" + name: "a3u" +spec: + clusterQueue: "a3u" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ramble + namespace: ramble +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ramble + name: ramble-editor +rules: +- apiGroups: ["", "batch", "jobset.x-k8s.io", "kueue.x-k8s.io"] # "" indicates the core API group + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kueue-reader +rules: +- apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ramble-editor + namespace: ramble +subjects: +- kind: ServiceAccount + name: ramble + apiGroup: "" +roleRef: + kind: Role + name: ramble-editor + apiGroup: "" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ramble-kueue-reader +subjects: +- kind: ServiceAccount + name: ramble + namespace: ramble + apiGroup: "" +roleRef: + kind: ClusterRole + name: kueue-reader + apiGroup: "" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ramble-hpl-configs + namespace: ramble +data: + execute_hpl.tpl: | + #!/bin/bash + set -e + cd "{experiment_run_dir}" + kubectl delete -n {gke_namespace} configmap {experiment_name} || true + kubectl create -n {gke_namespace} configmap {experiment_name} --from-file={experiment_run_dir}/HPL.dat + printf "Submitting {experiment_name}\n" + kubectl create -f jobset 2>&1 | tee klog + + collect_logs.tpl: | + #!/bin/bash + set -e + jobname=$(head -n 1 {experiment_run_dir}/klog | awk -F " |/" '{print $2}') + printf "Waiting for up to a day for ${jobname} to complete.\n" + kubectl wait --timeout=86400s jobs/${jobname}-w-0 --for=condition=complete + kubectl logs --tail=-1 -f -l batch.kubernetes.io/job-completion-index=0,job-name=${jobname}-w-0 | tee {log_file} + + ramble.yaml: | + ramble: + variables: + ssh_port: 22 + batch_submit: '{execute_hpl}' + mpi_command: >- + mpirun + -n {n_ranks} + -N {processes_per_node} + --bind-to none + --hostfile /tmp/hostfile + --mca btl self,tcp + --mca btl_tcp_if_include eth0 + --mca orte_keep_fqdn_hostnames 1 + --mca plm_rsh_no_tree_spawn 1 + -x {mpi_env_vars} + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p {ssh_port}" + mpi_env_vars: >- + $(echo + ${!NCCL*} + ${!OMPI*} + LD_LIBRARY_PATH + ${!HPL*} + ${!UCX*} + | sed 's/ / -x /g') + + + container_name: hpl + container_uri: "nvcr.io/nvidia/hpc-benchmarks:24.09" + gke_container_name: hpl + gke_namespace: ramble + jobset_name: "hpl-{n_nodes}" + processes_per_node: 8 + + # Potentially need to be modified + gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool + sysnet_subnet_prefix: a3u-gke-gcs-sub + gpu_subnet_prefix: a3u-gke-gcs-rdma-sub + cluster_queue: a3u + env_vars: + set: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + HPL_FCT_COMM_POLICY: 1 + HPL_P2P_AS_BCAST: '{hpl_p2p_as_bcast}' + HPL_USE_NVSHMEM: 0 + NVSHMEM_DISABLE_CUDA_VMM: 1 + OMPI_MCA_btl: openib + OMPI_MCA_pml: "^ucx" + UCX_MAX_RNDV_RAILS: 4 + UCX_NET_DEVICES: "mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1" + NCCL_NET: gIB + prepend: + - paths: + LD_LIBRARY_PATH: /usr/local/gib/lib64 + + + applications: + nvidia-hpl: + workloads: + calculator: + experiments: 
+ hpl-{n_nodes}: + variables: + n_nodes: [1,2,4,8,16,24,32] + + # 0 = ncclBcast, 1 = ncclSend/Recv + hpl_p2p_as_bcast: '0' + + # Percent of memory to use (default 85) + percent_mem: 85 + + # Memory per node in GB + memory_per_node: '1200' + + # Other Recommended Settings + block_size: '1024' + PMAP: 1 + SWAP: 1 + swapping_threshold: 192 + L1: 1 + U: 0 + Equilibration: 0 + pfact: 0 + nbmin: 2 + rfact: 0 + bcast: 3 + depth: 1 + + internals: + custom_executables: + mpi_head_node: + template: + - source /usr/local/gib/scripts/set_nccl_env.sh + - if [[ "${NODE_RANK}" -eq "0" ]]; then + redirect: '' + log_file: '' + wait_worker_nodes: + template: + - else + - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + - sleep 5 + - done + - fi + redirect: '' + log_file: '' + tail_log: + template: + - tail -f {log_file} & + - export TAIL_PID=$! + redirect: '' + log_file: '' + kill_tail: + template: + - kill -9 $TAIL_PID + redirect: '' + log_file: '' + executable_injection: + - name: mpi_head_node + order: before + - name: wait_worker_nodes + order: after + - name: tail_log + order: before + - name: kill_tail + order: after + formatted_executables: + yaml_command: + indentation: 18 + join_separator: \n + commands: + - mkdir -p {experiment_run_dir} + - ulimit -l unlimited + - cp /configs/HPL.dat {experiment_run_dir}/ + - '{unformatted_command}' + + jobset.tpl: | + apiVersion: jobset.x-k8s.io/v1alpha2 + kind: JobSet + metadata: + generateName: {jobset_name}- + namespace: {gke_namespace} + labels: + kueue.x-k8s.io/queue-name: {cluster_queue} + spec: + ttlSecondsAfterFinished: 86400 + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: w + template: + spec: + parallelism: {n_nodes} + completions: {n_nodes} + template: + metadata: + annotations: + kueue.x-k8s.io/podset-preferred-topology: "kubernetes.io/hostname" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + \{"interfaceName":"eth0","network":"default"\}, + \{"interfaceName":"eth1","network":"{sysnet_subnet_prefix}-1"\}, + \{"interfaceName":"eth2","network":"{gpu_subnet_prefix}-0"\}, + \{"interfaceName":"eth3","network":"{gpu_subnet_prefix}-1"\}, + \{"interfaceName":"eth4","network":"{gpu_subnet_prefix}-2"\}, + \{"interfaceName":"eth5","network":"{gpu_subnet_prefix}-3"\}, + \{"interfaceName":"eth6","network":"{gpu_subnet_prefix}-4"\}, + \{"interfaceName":"eth7","network":"{gpu_subnet_prefix}-5"\}, + \{"interfaceName":"eth8","network":"{gpu_subnet_prefix}-6"\}, + \{"interfaceName":"eth9","network":"{gpu_subnet_prefix}-7"\} + ] + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: {gke_nodepool} + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + setHostnameAsFQDN: true + volumes: + - name: mpi-id + secret: + secretName: mpi-ssh-hpl + items: + - key: ssh-privatekey + path: "id_rsa" + - key: ssh-publickey + path: "id_rsa.pub" + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + - name: hpl-config + configMap: + name: {experiment_name} + initContainers: + - name: gpu-healthcheck + image: 
alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + containers: + - name: {gke_container_name} + stdin: true + tty: true + image: {container_uri} + env: + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - bash + - -c + - | + set -x + + # Setup SSH + export DEBIAN_FRONTEND=noninteractive + + apt update -qq -y + apt install -qq -y iputils-ping openssh-server + + mkdir -p /run/sshd ~/.ssh + chmod 700 ~/.ssh + cp /secrets/ssh/* ~/.ssh/ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/* + mkdir -p /run/sshd + /sbin/sshd + + # Load all the cuda libs + /sbin/ldconfig + + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + # For every host, get the entity and add to hostfile + for i in `seq 0 $(({n_nodes}-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p {ssh_port} -o StrictHostKeyChecking=no $OTHER hostname; + do + echo ... 
+ sleep 10 + done + echo ${OTHER} port={ssh_port} slots={processes_per_node} | tee -a /tmp/hostfile; + done + cat /tmp/hostfile + + {yaml_command} + + exit 0 + + volumeMounts: + - name: mpi-id + mountPath: "/secrets/ssh" + readOnly: true + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + - name: local-ssd + mountPath: /ssd + - name: hpl-config + mountPath: /configs + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + + restartPolicy: Never + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ramble-hpl-runner + namespace: ramble +spec: + template: + spec: + volumes: + - name: config + configMap: + name: ramble-hpl-configs + items: + - key: jobset.tpl + path: jobset.tpl + - key: ramble.yaml + path: ramble.yaml + - key: execute_hpl.tpl + path: execute_hpl.tpl + - key: collect_logs.tpl + path: collect_logs.tpl + + serviceAccountName: ramble + containers: + - name: ramble-controller + image: ubuntu:latest + + volumeMounts: + - name: config + mountPath: /opt/configs/ + readOnly: true + + command: + - bash + - -c + - | + export DEBIAN_FRONTEND=noninteractive + + set -e + printf "Installing system dependencies\n" + apt update -qq -y > /dev/null + apt install -qq -y build-essential python3-venv jq git curl > /dev/null + + printf "Installing kubectl\n" + curl -s -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Use current unix timestamp as a unique tag + # for jobs submitted + TAG=$(date +%s) + TEST_DIR=/workspace/hpl-tests-"${TAG}" + SOFTWARE_INSTALL=/opt + + mkdir -p ${SOFTWARE_INSTALL} ${TEST_DIR} + + printf "Cloning ramble and cluster-toolkit\n" + git clone --depth 1 -c feature.manyFiles=true https://github.com/GoogleCloudPlatform/ramble.git "${SOFTWARE_INSTALL}"/ramble + + printf "Setting up ramble python environment, and installing requirements\n" + python3 -m venv "${SOFTWARE_INSTALL}"/ramble/env || true + source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate + pip install -q -r "${SOFTWARE_INSTALL}"/ramble/requirements.txt + + # Activate ramble + . 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh + + ramble workspace create -a -d "${TEST_DIR}" + + cp /opt/configs/* ${RAMBLE_WORKSPACE}/configs/ + + cd ${RAMBLE_WORKSPACE} + + # Set up SSH + printf "Creating ssh keypair for MPI workloads\n" + ssh-keygen -b 2048 -f mpi_id -N "" + kubectl create secret generic mpi-ssh-hpl --from-file=ssh-privatekey=./mpi_id --from-file=ssh-publickey=./mpi_id.pub || true + + # Get number of GPUs / nodes available in this cluster from Kueue: + AVAILABLE_GPUS=$( + kubectl get clusterqueues.kueue.x-k8s.io -o json | + jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") | + .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota' + ) + + N_NODES=$((AVAILABLE_GPUS / 8)) + + printf "\n--- Available Benchmarks on %s nodes --\n" ${N_NODES} + ramble workspace info --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------- Setting up Benchmarks -------\n" + ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" + + printf "\n----------- Running Benchmarks --------\n" + ramble on --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Collecting benchmark logs -----\n" + ramble on --executor "{experiment_run_dir}/collect_logs" --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Analyzing benchmark logs ------\n" + ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------------- SUMMARY ---------------\n" + jq -r '["workload","n_nodes","GFlop/s ","GFlops/s/GPU"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | + { + experiment_name: $exp.name, + workload: $exp.workload_name, + n_nodes: $exp.n_nodes, + Context: $context.name + } + + ($context.foms | from_entries ) + | [.workload, .n_nodes, .GFlops, ."Per GPU GFlops"]) + | @tsv' results.latest.json + printf "\n-------- Benchmarking Complete -------\n" + + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml new file mode 100644 index 0000000000..ae7753516e --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml @@ -0,0 +1,526 @@ +# Copyright 2025 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
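+
+# Self-contained NCCL benchmark harness for the A3-Ultra cluster: the
+# manifests below create the `ramble` namespace and RBAC, a ConfigMap of
+# Ramble templates, and a controller Job ("ramble-nccl-runner") that
+# generates one JobSet per collective (all-gather, all-reduce,
+# reduce-scatter) and node count, then summarizes bus bandwidth.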
+ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ramble + namespace: ramble +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "ramble" + name: "a3u" +spec: + clusterQueue: "a3u" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ramble + namespace: ramble +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ramble + name: ramble-editor +rules: +- apiGroups: ["", "batch", "jobset.x-k8s.io", "kueue.x-k8s.io"] # "" indicates the core API group + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kueue-reader +rules: +- apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ramble-editor + namespace: ramble +subjects: +- kind: ServiceAccount + name: ramble + apiGroup: "" +roleRef: + kind: Role + name: ramble-editor + apiGroup: "" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ramble-kueue-reader +subjects: +- kind: ServiceAccount + name: ramble + namespace: ramble + apiGroup: "" +roleRef: + kind: ClusterRole + name: kueue-reader + apiGroup: "" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ramble-nccl-configs + namespace: ramble +data: + execute_nccl.tpl: | + #!/bin/bash + set -e + cd "{experiment_run_dir}" + printf "Submitting ${experiment_name}\n" + kubectl create -f jobset 2>&1 | tee klog + + collect_logs.tpl: | + #!/bin/bash + set -e + jobname=$(head -n 1 {experiment_run_dir}/klog | awk -F " |/" '{print $2}') + printf "Waiting for up to a day for ${jobname} to complete.\n" + kubectl wait --timeout=86400s jobs/${jobname}-w-0 --for=condition=complete + kubectl logs --tail=-1 -f -l batch.kubernetes.io/job-completion-index=0,job-name=${jobname}-w-0 | tee {log_file} + + ramble.yaml: | + ramble: + variables: + ssh_port: 222 + batch_submit: '{execute_nccl}' + mpi_command: >- + mpirun + -n {n_ranks} + -N {processes_per_node} + --bind-to none + --hostfile /tmp/hostfile + --mca btl self,tcp + --mca btl_tcp_if_include eth0 + --mca orte_keep_fqdn_hostnames 1 + --mca plm_rsh_no_tree_spawn 1 + -x {mpi_env_vars} + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p {ssh_port}" + mpi_env_vars: >- + $(echo + ${!NCCL*} + ${!OMPI*} + LD_LIBRARY_PATH + | sed 's/ / -x /g') + + container_name: nccl-tests + container_uri: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.3 + gke_container_name: nccl + gke_namespace: ramble + gpus_per_node: 8 + nccl-tests_path: /third_party/nccl-tests-mpi + processes_per_node: 8 + + # Potentially need to be modified + gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool + sysnet_subnet_prefix: a3u-gke-gcs-sub + gpu_subnet_prefix: a3u-gke-gcs-rdma-sub + cluster_queue: a3u + env_vars: + set: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + NCCL_NET: gIB + prepend: + - paths: + LD_LIBRARY_PATH: /usr/local/gib/lib64 + + + applications: + nccl-tests: + workloads: + '{workload}': + experiments: + '{workload}-{n_nodes}': + variables: + n_nodes: [2,4,8,16,32] + + jobset_name: ['ag-{n_nodes}', 'ar-{n_nodes}', 'rs-{n_nodes}'] + workload: [all-gather, all-reduce, reduce-scatter] + binary: [all_gather_perf, all_reduce_perf, reduce_scatter_perf] + zips: + bench: + - jobset_name + - workload + - binary + matrix: + - bench + - n_nodes + + internals: + custom_executables: + 
mpi_head_node: + template: + - cd /third_party/nccl-tests/build/ + - source /usr/local/gib/scripts/set_nccl_env.sh + - if [[ "${NODE_RANK}" -eq "0" ]]; then + redirect: '' + log_file: '' + wait_worker_nodes: + template: + - else + - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + - sleep 5 + - done + - fi + redirect: '' + log_file: '' + tail_log: + template: + - tail -f {log_file} & + - export TAIL_PID=$! + redirect: '' + log_file: '' + kill_tail: + template: + - kill -9 $TAIL_PID + redirect: '' + log_file: '' + executable_injection: + - name: mpi_head_node + order: before + - name: wait_worker_nodes + order: after + - name: tail_log + order: before + - name: kill_tail + order: after + formatted_executables: + yaml_command: + indentation: 18 + join_separator: \n + commands: + - mkdir -p {experiment_run_dir} + - ulimit -l unlimited + - '{unformatted_command}' + + jobset.tpl: | + apiVersion: jobset.x-k8s.io/v1alpha2 + kind: JobSet + metadata: + generateName: {jobset_name}- + namespace: {gke_namespace} + labels: + kueue.x-k8s.io/queue-name: {cluster_queue} + spec: + ttlSecondsAfterFinished: 86400 + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: w + template: + spec: + parallelism: {n_nodes} + completions: {n_nodes} + template: + metadata: + annotations: + kueue.x-k8s.io/podset-preferred-topology: "kubernetes.io/hostname" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + \{"interfaceName":"eth0","network":"default"\}, + \{"interfaceName":"eth1","network":"{sysnet_subnet_prefix}-1"\}, + \{"interfaceName":"eth2","network":"{gpu_subnet_prefix}-0"\}, + \{"interfaceName":"eth3","network":"{gpu_subnet_prefix}-1"\}, + \{"interfaceName":"eth4","network":"{gpu_subnet_prefix}-2"\}, + \{"interfaceName":"eth5","network":"{gpu_subnet_prefix}-3"\}, + \{"interfaceName":"eth6","network":"{gpu_subnet_prefix}-4"\}, + \{"interfaceName":"eth7","network":"{gpu_subnet_prefix}-5"\}, + \{"interfaceName":"eth8","network":"{gpu_subnet_prefix}-6"\}, + \{"interfaceName":"eth9","network":"{gpu_subnet_prefix}-7"\} + ] + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: {gke_nodepool} + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + setHostnameAsFQDN: true + volumes: + - name: mpi-id + secret: + secretName: mpi-ssh-nccl + items: + - key: ssh-privatekey + path: "id_rsa" + - key: ssh-publickey + path: "id_rsa.pub" + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ 
\${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + containers: + - name: {gke_container_name} + stdin: true + tty: true + image: {container_uri} + env: + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - bash + - -c + - | + set -x + + # Setup SSH + export DEBIAN_FRONTEND=noninteractive + + apt update -qq -y + apt install -qq -y iputils-ping openssh-server + + mkdir -p /run/sshd ~/.ssh + chmod 700 ~/.ssh + cp /secrets/ssh/* ~/.ssh/ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/* + mkdir -p /run/sshd + /sbin/sshd + + # Load all the cuda libs + /sbin/ldconfig + + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + # For every host, get the entity and add to hostfile + for i in `seq 0 $(({n_nodes}-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p {ssh_port} -o StrictHostKeyChecking=no $OTHER hostname; + do + echo ... + sleep 10 + done + echo ${OTHER} port={ssh_port} slots={processes_per_node} | tee -a /tmp/hostfile; + done + cat /tmp/hostfile + + {yaml_command} + + exit 0 + + volumeMounts: + - name: mpi-id + mountPath: "/secrets/ssh" + readOnly: true + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + - name: local-ssd + mountPath: /ssd + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + + restartPolicy: Never + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ramble-nccl-runner + namespace: ramble +spec: + template: + spec: + volumes: + - name: config + configMap: + name: ramble-nccl-configs + items: + - key: jobset.tpl + path: jobset.tpl + - key: ramble.yaml + path: ramble.yaml + - key: execute_nccl.tpl + path: execute_nccl.tpl + - key: collect_logs.tpl + path: collect_logs.tpl + + serviceAccountName: ramble + containers: + - name: ramble-controller + image: ubuntu:latest + + volumeMounts: + - name: config + mountPath: /opt/configs/ + readOnly: true + + command: + - bash + - -c + - | + export DEBIAN_FRONTEND=noninteractive + + set -e + printf "Installing system dependencies\n" + apt update -qq -y > /dev/null + apt install -qq -y build-essential python3-venv jq git curl > /dev/null + + printf "Installing kubectl\n" + curl -s -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Use current unix timestamp as a unique tag + # for jobs submitted + TAG=$(date +%s) + TEST_DIR=/workspace/nccl-tests-"${TAG}" + SOFTWARE_INSTALL=/opt + + mkdir -p ${SOFTWARE_INSTALL} ${TEST_DIR} + + printf "Cloning ramble and cluster-toolkit\n" + git clone --depth 1 -c feature.manyFiles=true https://github.com/GoogleCloudPlatform/ramble.git "${SOFTWARE_INSTALL}"/ramble + + printf "Setting up ramble python 
environment, and installing requirements\n" + python3 -m venv "${SOFTWARE_INSTALL}"/ramble/env || true + source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate + pip install -q -r "${SOFTWARE_INSTALL}"/ramble/requirements.txt + + # Activate ramble + . ${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh + + ramble workspace create -a -d "${TEST_DIR}" + + cp /opt/configs/* ${RAMBLE_WORKSPACE}/configs/ + + cd ${RAMBLE_WORKSPACE} + + # Set up SSH + printf "Creating ssh keypair for MPI workloads\n" + ssh-keygen -b 2048 -f mpi_id -N "" + kubectl create secret generic mpi-ssh-nccl --from-file=ssh-privatekey=./mpi_id --from-file=ssh-publickey=./mpi_id.pub || true + + # Get number of GPUs / nodes available in this cluster from Kueue: + AVAILABLE_GPUS=$( + kubectl get clusterqueues.kueue.x-k8s.io -o json | + jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") | + .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota' + ) + + N_NODES=$((AVAILABLE_GPUS / 8)) + + printf "\n--- Available Benchmarks on %s nodes --\n" ${N_NODES} + ramble workspace info --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------- Setting up Benchmarks -------\n" + ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" + + printf "\n----------- Running Benchmarks --------\n" + ramble on --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Collecting benchmark logs -----\n" + ramble on --executor "{experiment_run_dir}/collect_logs" --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Analyzing benchmark logs ------\n" + ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + + printf "\n---- SUMMARY for >1GB Message Sizes ----\n" + jq -r '["workload","n_nodes","msg_size","busbw"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | + { + experiment_name: $exp.name, + workload: $exp.workload_name, + n_nodes: $exp.n_nodes, + Context: $context.name + } + + ($context.foms | from_entries ) + | select(.Size | tonumber > 1000000000) + | [.workload, .n_nodes, .Size, ."Out of Place Bus Bandwidth"]) + | @tsv' results.latest.json + printf "\n-------- Benchmarking Complete -------\n" + + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml new file mode 100644 index 0000000000..fe5642b4f1 --- /dev/null +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml @@ -0,0 +1,646 @@ +# Copyright 2025 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
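+
+# NeMo pretraining benchmark harness: as with the NCCL and HPL runners,
+# the manifests below create the `ramble` namespace and RBAC, a ConfigMap
+# of Ramble templates, and a controller Job ("ramble-nemo-runner") that
+# launches Mixtral 8x7B and Llama 3.1 70B mock-data pretraining JobSets
+# and reports train-step timing.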
+ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ramble + namespace: ramble +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "ramble" + name: "a3u" +spec: + clusterQueue: "a3u" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ramble + namespace: ramble +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: ramble + name: ramble-editor +rules: +- apiGroups: ["", "batch", "jobset.x-k8s.io", "kueue.x-k8s.io"] # "" indicates the core API group + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kueue-reader +rules: +- apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ramble-editor + namespace: ramble +subjects: +- kind: ServiceAccount + name: ramble + apiGroup: "" +roleRef: + kind: Role + name: ramble-editor + apiGroup: "" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ramble-kueue-reader +subjects: +- kind: ServiceAccount + name: ramble + namespace: ramble + apiGroup: "" +roleRef: + kind: ClusterRole + name: kueue-reader + apiGroup: "" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ramble-nemo-configs + namespace: ramble +data: + execute_nemo.tpl: | + #!/bin/bash + set -e + cd "{experiment_run_dir}" + kubectl delete -n {gke_namespace} configmap {experiment_name} || true + kubectl create -n {gke_namespace} configmap {experiment_name} --from-file={nemo_generated_config_path}/{nemo_generated_config_name} + printf "Submitting {experiment_name}\n" + kubectl create -f jobset 2>&1 | tee klog + + collect_logs.tpl: | + #!/bin/bash + set -e + jobname=$(head -n 1 {experiment_run_dir}/klog | awk -F " |/" '{print $2}') + printf "Waiting for up to a day for ${jobname} to complete.\n" + kubectl wait --timeout=86400s jobs/${jobname}-w-0 --for=condition=complete + kubectl logs --tail=-1 -f -l batch.kubernetes.io/job-completion-index=0,job-name=${jobname}-w-0 | tee {log_file} + + ramble.yaml: | + ramble: + variables: + ssh_port: 22 + batch_submit: '{execute_nemo}' + mpi_command: >- + mpirun + -n {n_ranks} + -N {processes_per_node} + --bind-to none + --hostfile /tmp/hostfile + --mca btl self,tcp + --mca btl_tcp_if_include eth0 + --mca orte_keep_fqdn_hostnames 1 + --mca plm_rsh_no_tree_spawn 1 + -x {mpi_env_vars} + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p {ssh_port}" + mpi_env_vars: >- + $(echo + ${!NCCL*} + ${!OMPI*} + LD_LIBRARY_PATH + ${!CUDA*} + ${!GLOO*} + ${!NVIDIA*} + ${!NVTE*} + ${!OMP*} + ${!TORCH*} + ${!TQDM*} + TRANSFORMERS_OFFLINE + PYTHONPATH + | sed 's/ / -x /g') + + container_name: nemo + container_uri: nvcr.io/nvidia/nemo:{nemo_version} + gke_container_name: nemo + gke_namespace: ramble + gpus_per_node: 8 + n_threads: 12 + nemo_launcher_tag: 24.07 + nemo_version: 24.07 + processes_per_node: 8 + + # Potentially need to be modified + gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool + sysnet_subnet_prefix: a3u-gke-gcs-sub + gpu_subnet_prefix: a3u-gke-gcs-rdma-sub + cluster_queue: a3u + env_vars: + set: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + OMP_NUM_THREADS: '{n_threads}' + TRANSFORMERS_OFFLINE: 0 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NCCL_NVLS_ENABLE: 0 + GLOO_SOCKET_IFNAME: eth0,eth1 + # SM_MARGIN environment vars prevent send-receive stalling execution + # (results in reduced step 
time) + NVTE_FWD_LAYERNORM_SM_MARGIN: 8 + NVTE_BWD_LAYERNORM_SM_MARGIN: 8 + NCCL_NET: gIB + prepend: + - paths: + LD_LIBRARY_PATH: /usr/local/gib/lib64 + + + applications: + py-nemo: + workloads: + pretraining: + experiments: + mixtral-{n_nodes}-nodes: + variables: + n_nodes: [8,16,32] + jobset_name: 'm8x7b-{n_nodes}' + + nemo_stage: training + nemo_model: mixtral + nemo_config_name: mixtral_8x7b + + max_steps: 10 + trainer.max_steps: '{max_steps}' + trainer.val_check_interval: 50 + trainer.log_every_n_steps: 1 + trainer.enable_model_summary: false + + model.tokenizer.library: megatron + model.tokenizer.type: GPT2BPETokenizer + model.tokenizer.model: null + model.tokenizer.delimiter : null + model.tokenizer.vocab_file: gpt2-vocab.json + model.tokenizer.merge_file: gpt2-merges.txt + + exp_manager.exp_dir: '{experiment_run_dir}' + + run.time_limit: 01:00:00 + + model.data.data_impl: mock + model.data.data_prefix: [] + model.data.splits_string: 90,8,2 + model.data.num_workers: 4 + + model.optim.contiguous_grad_buffer: true + model.optim.contiguous_param_buffer: true + + model.global_batch_size: 1024 + model.micro_batch_size: 2 + model.virtual_pipeline_model_parallel_size: null + model.pipeline_model_parallel_size: 1 + model.gc_interval: 100 + model.fp8_params: true + + model.nsys_profile.ranks: [0, 8] + model.nsys_profile.start_step: 27 + model.nsys_profile.end_step: 29 + + # Checkpoint saving & logging + exp_manager.resume_if_exists: false + exp_manager.create_checkpoint_callback: false + + llama3-{n_nodes}-nodes: + variables: + n_nodes: [8,16,32] + + jobset_name: 'llama-{n_nodes}' + nemo_stage: training + nemo_model: llama + nemo_config_name: llama3_1_70b + + max_steps: 10 + trainer.max_steps: '{max_steps}' + trainer.val_check_interval: 200 + trainer.log_every_n_steps: 1 + trainer.limit_val_batches: 5 + trainer.limit_test_batches: 5 + + model.tokenizer.library: megatron + model.tokenizer.type: GPT2BPETokenizer + model.tokenizer.model: null + model.tokenizer.delimiter : null + model.tokenizer.vocab_file: gpt2-vocab.json + model.tokenizer.merge_file: gpt2-merges.txt + + exp_manager.exp_dir: '{experiment_run_dir}' + exp_manager.checkpoint_callback_params.model_parallel_size: ${multiply:$\{model.tensor_model_parallel_size}, $\{model.pipeline_model_parallel_size}} + run.time_limit: 0-03:30:00 + + model.data.data_impl: mock + model.data.data_prefix: [] + model.data.splits_string: 90,8,2 + + model.optim.grad_sync_dtype: bf16 + + model.global_batch_size: 1024 + model.virtual_pipeline_model_parallel_size: 20 + model.tensor_model_parallel_size: 2 + model.context_parallel_size: 1 + + model.fp8: true + model.fp8_e4m3: true + model.fp8_hybrid: true + model.fp8_params: true + + model.ub_tp_comm_overlap: false + + model.nsys_profile.ranks: [0, 8] + model.nsys_profile.start_step: 17 + model.nsys_profile.end_step: 19 + + # Checkpoint saving & logging + exp_manager.resume_if_exists: false + exp_manager.create_checkpoint_callback: false + exp_manager.create_dllogger_logger: false + + internals: + custom_executables: + mpi_head_node: + template: + - echo "Downloading GPT vocabulary files" + - wget -P /opt/NeMo/ https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json + - wget -P /opt/NeMo/ https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt + - source /usr/local/gib/scripts/set_nccl_env.sh + - if [[ "${NODE_RANK}" -eq "0" ]]; then + redirect: '' + log_file: '' + wait_worker_nodes: + template: + - else + - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + - sleep 5 + - done + - 
fi + redirect: '' + log_file: '' + tail_log: + template: + - tail -f {log_file} & + - export TAIL_PID=$! + redirect: '' + log_file: '' + kill_tail: + template: + - kill -9 $TAIL_PID + redirect: '' + log_file: '' + executable_injection: + - name: mpi_head_node + order: before + - name: wait_worker_nodes + order: after + - name: tail_log + order: before + - name: kill_tail + order: after + formatted_executables: + yaml_command: + indentation: 18 + join_separator: \n + commands: + - mkdir -p {experiment_run_dir} + - ulimit -l unlimited + - cp /configs/nemo.yaml {experiment_run_dir}/ + - '{unformatted_command}' + + jobset.tpl: | + apiVersion: jobset.x-k8s.io/v1alpha2 + kind: JobSet + metadata: + generateName: {jobset_name}- + namespace: {gke_namespace} + labels: + kueue.x-k8s.io/queue-name: {cluster_queue} + spec: + ttlSecondsAfterFinished: 86400 + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: w + template: + spec: + parallelism: {n_nodes} + completions: {n_nodes} + template: + metadata: + annotations: + kueue.x-k8s.io/podset-preferred-topology: "kubernetes.io/hostname" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + \{"interfaceName":"eth0","network":"default"\}, + \{"interfaceName":"eth1","network":"{sysnet_subnet_prefix}-1"\}, + \{"interfaceName":"eth2","network":"{gpu_subnet_prefix}-0"\}, + \{"interfaceName":"eth3","network":"{gpu_subnet_prefix}-1"\}, + \{"interfaceName":"eth4","network":"{gpu_subnet_prefix}-2"\}, + \{"interfaceName":"eth5","network":"{gpu_subnet_prefix}-3"\}, + \{"interfaceName":"eth6","network":"{gpu_subnet_prefix}-4"\}, + \{"interfaceName":"eth7","network":"{gpu_subnet_prefix}-5"\}, + \{"interfaceName":"eth8","network":"{gpu_subnet_prefix}-6"\}, + \{"interfaceName":"eth9","network":"{gpu_subnet_prefix}-7"\} + ] + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: {gke_nodepool} + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + setHostnameAsFQDN: true + volumes: + - name: mpi-id + secret: + secretName: mpi-ssh-nemo + items: + - key: ssh-privatekey + path: "id_rsa" + - key: ssh-publickey + path: "id_rsa.pub" + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + - name: nemo-config + configMap: + name: {experiment_name} + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU 
index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + containers: + - name: {gke_container_name} + stdin: true + tty: true + image: {container_uri} + env: + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - bash + - -c + - | + set -x + + # Setup SSH + export DEBIAN_FRONTEND=noninteractive + + apt update -qq -y + apt install -qq -y iputils-ping openssh-server + + mkdir -p /run/sshd ~/.ssh + chmod 700 ~/.ssh + cp /secrets/ssh/* ~/.ssh/ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/* + mkdir -p /run/sshd + /sbin/sshd + + # Load all the cuda libs + /sbin/ldconfig + + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + # For every host, get the entity and add to hostfile + for i in `seq 0 $(({n_nodes}-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p {ssh_port} -o StrictHostKeyChecking=no $OTHER hostname; + do + echo ... + sleep 10 + done + echo ${OTHER} port={ssh_port} slots={processes_per_node} | tee -a /tmp/hostfile; + done + cat /tmp/hostfile + + export MASTER_ADDR=${WORKERS_BASENAME}-0.${POSTFIX} + export MASTER_PORT=5678 + + {yaml_command} + + exit 0 + + volumeMounts: + - name: mpi-id + mountPath: "/secrets/ssh" + readOnly: true + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + - name: local-ssd + mountPath: /ssd + - name: nemo-config + mountPath: /configs + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + + restartPolicy: Never + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ramble-nemo-runner + namespace: ramble +spec: + template: + spec: + volumes: + - name: config + configMap: + name: ramble-nemo-configs + items: + - key: jobset.tpl + path: jobset.tpl + - key: ramble.yaml + path: ramble.yaml + - key: execute_nemo.tpl + path: execute_nemo.tpl + - key: collect_logs.tpl + path: collect_logs.tpl + + serviceAccountName: ramble + containers: + - name: ramble-controller + image: ubuntu:latest + + volumeMounts: + - name: config + mountPath: /opt/configs/ + readOnly: true + + command: + - bash + - -c + - | + export DEBIAN_FRONTEND=noninteractive + + set -e + printf "Installing system dependencies\n" + apt update -qq -y > /dev/null + apt install -qq -y build-essential python3-venv jq git curl > /dev/null + + printf "Installing kubectl\n" + curl -s -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Use current unix timestamp as a unique tag + # for jobs submitted + TAG=$(date +%s) + TEST_DIR=/workspace/nemo-tests-"${TAG}" + SOFTWARE_INSTALL=/opt + + mkdir -p ${SOFTWARE_INSTALL} ${TEST_DIR} + + printf "Cloning ramble and cluster-toolkit\n" + git clone --depth 1 -c feature.manyFiles=true https://github.com/GoogleCloudPlatform/ramble.git "${SOFTWARE_INSTALL}"/ramble + + printf "Setting up ramble python environment, and installing requirements\n" + python3 -m venv "${SOFTWARE_INSTALL}"/ramble/env || 
true + source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate + pip install -q -r "${SOFTWARE_INSTALL}"/ramble/requirements.txt + + # Activate ramble + . ${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh + + ramble workspace create -a -d "${TEST_DIR}" + + cp /opt/configs/* ${RAMBLE_WORKSPACE}/configs/ + + cd ${RAMBLE_WORKSPACE} + + # Set up SSH + printf "Creating ssh keypair for MPI workloads\n" + ssh-keygen -b 2048 -f mpi_id -N "" + kubectl create secret generic mpi-ssh-nemo --from-file=ssh-privatekey=./mpi_id --from-file=ssh-publickey=./mpi_id.pub || true + + # Get number of GPUs / nodes available in this cluster from Kueue: + AVAILABLE_GPUS=$( + kubectl get clusterqueues.kueue.x-k8s.io -o json | + jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") | + .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota' + ) + + N_NODES=$((AVAILABLE_GPUS / 8)) + + printf "\n--- Available Benchmarks on %s nodes --\n" ${N_NODES} + ramble workspace info --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------- Setting up Benchmarks -------\n" + ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" + + printf "\n----------- Running Benchmarks --------\n" + ramble on --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Collecting benchmark logs -----\n" + ramble on --executor "{experiment_run_dir}/collect_logs" --where '{n_nodes} <= '"${N_NODES}" + + printf "\n------- Analyzing benchmark logs ------\n" + ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + + printf "\n--------------- SUMMARY ---------------\n" + jq -r '["nemo_config","n_nodes","step","train_step_timing"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | + { + name: $exp.RAMBLE_VARIABLES.nemo_config_name, + workload: $exp.workload_name, + n_nodes: $exp.n_nodes, + Context: $context.name + } + + ($context.foms | from_entries ) + | select (.Context == "0-5/5") + | [.name, .n_nodes, .Context, .train_step_timing]) + | @tsv' results.latest.json + printf "\n-------- Benchmarking Complete -------\n" + + restartPolicy: Never + backoffLimit: 4 From ee88546422c96f510a969cebaa5c40b501cb8c47 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Fri, 17 Jan 2025 05:29:42 +0000 Subject: [PATCH 10/15] Update README for benchmarks --- .../a3u-gke-gcs/system_benchmarks/README.md | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md index 63bce8c2db..ade615a8f0 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/README.md @@ -49,19 +49,30 @@ For any of the above, the following will be created: series of `Jobset` objects for each individual benchmark. Once created, this will first create a K8s job called -"ramble-{nccl,hpl,nemo}-runner". This controller job orchestrates the running -and analysis of the benchmarks. It installs everything it needs within a -self-contained pod, creates an ssh keypair for multi-node communication, and -uses Ramble to create JobSet's for each benchmark. Once those benchmarks -are complete, it provides a summary of the results. Full benchmark logs can -otherwise be found in the logs for each of the created JobSet/Job/Pod's +"ramble-{nccl,hpl,nemo}-runner" in the ramble workspace. This controller job +orchestrates the running and analysis of the benchmarks. 
It installs everything
+it needs within a self-contained pod, creates an ssh keypair for multi-node
+communication, and uses Ramble to create JobSets for each benchmark. Once those
+benchmarks are complete, it provides a summary of the results. Full benchmark
+logs can otherwise be found in the logs of each of the created JobSets/Jobs/Pods
 themselves.
 
+If you were to run all of the above commands, you would initially see something
+like this:
+
+  ```bash
+  $ kubectl -n ramble get jobs
+  NAME                 STATUS    COMPLETIONS   DURATION   AGE
+  ramble-hpl-runner    Running   0/1           30s        30s
+  ramble-nccl-runner   Running   0/1           43s        43s
+  ramble-nemo-runner   Running   0/1           22s        22s
+  ```
+
 For each benchmark, multiple node scales will be submitted, up to the maximum
 node scale of your cluster. This can be controlled with the `n_nodes` variable
 in the `ramble.yaml` configMap.
 
-Note: The following depends on several tightly coupled settings, in particular
+Note: The benchmarks depend on several tightly coupled settings, in particular
 making sure that the subnet names in your GKE cluster match those defined in
 the "ramble.yaml" config file. If you modify the names of your subnets
 (including by changing the "deployment" name), then you will need to modify
@@ -74,8 +85,8 @@ ramble-{nccl,hpl,nemo}.yaml files:
     gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
     cluster_queue: a3u
 
-Expected Results
-----------------
+Viewing the Results
+-------------------
 
 For ramble-nccl.yaml, at the end of the logs of the created
 `ramble-nccl-runner` job, you should see something like:

From 5758a2695b381cd07e3a857737c9c46c64f21e68 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Sat, 18 Jan 2025 23:13:30 +0000
Subject: [PATCH 11/15] Simplify/improve system benchmarks

Simplify nemo examples, add instructions to get ramble workspace.
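
The printed instructions can be used along these lines once a runner pod
has finished (a sketch; the pod name, test directory, and file names are
illustrative and vary per run):

```bash
# Copy the analyzed results and the archived Ramble workspace out of a
# (hypothetical) finished runner pod.
kubectl -n ramble cp ramble-hpl-runner-x8k2v:/workspace/hpl-tests-1737244410/results.2025-01-18_23.13.30.json ./results.json
kubectl -n ramble cp ramble-hpl-runner-x8k2v:/workspace/hpl-tests-1737244410/archive/archive.2025-01-18_23.13.30.tar.gz ./archive.tar.gz
```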
--- .../system_benchmarks/ramble-hpl.yaml | 28 +++- .../system_benchmarks/ramble-nccl.yaml | 29 +++- .../system_benchmarks/ramble-nemo.yaml | 141 ++++++++---------- 3 files changed, 117 insertions(+), 81 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml index e5f5e4d2a9..bfeeecfbdb 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml @@ -129,7 +129,6 @@ data: ${!UCX*} | sed 's/ / -x /g') - container_name: hpl container_uri: "nvcr.io/nvidia/hpc-benchmarks:24.09" gke_container_name: hpl @@ -142,6 +141,7 @@ data: sysnet_subnet_prefix: a3u-gke-gcs-sub gpu_subnet_prefix: a3u-gke-gcs-rdma-sub cluster_queue: a3u + env_vars: set: CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 @@ -158,7 +158,6 @@ data: - paths: LD_LIBRARY_PATH: /usr/local/gib/lib64 - applications: nvidia-hpl: workloads: @@ -536,6 +535,9 @@ spec: printf "\n------- Analyzing benchmark logs ------\n" ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + printf "\n------- Archiving ramble workspace ------\n" + ramble workspace archive -t --where '{n_nodes} <= '"${N_NODES}" + printf "\n--------------- SUMMARY ---------------\n" jq -r '["workload","n_nodes","GFlop/s ","GFlops/s/GPU"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | { @@ -549,5 +551,27 @@ spec: | @tsv' results.latest.json printf "\n-------- Benchmarking Complete -------\n" + ARCHIVE_TAR=$(readlink archive/archive.latest.tar.gz) + ARCHIVE_PATH=${RAMBLE_WORKSPACE}/archive/${ARCHIVE_TAR} + RESULTS_FILE=$(basename $(readlink results.latest.json)) + RESULTS_PATH=${RAMBLE_WORKSPACE}/${RESULTS_FILE} + + printf "\n# To copy the full results from container:\n" + printf "kubectl cp %s:%s %s\n" $(hostname) ${RESULTS_PATH} ${RESULTS_FILE} + printf "\n# To copy the ramble workspace archive from container:\n" + printf "kubectl cp %s:%s ./%s\n" $(hostname) ${ARCHIVE_PATH} ${ARCHIVE_TAR} + + printf "\n# To re-activate ramble workspace, first access runner:\n" + printf "kubectl exec -it %s -- /bin/bash\n" $(hostname) + printf "# Then run:\n" + printf "cd ${RAMBLE_WORKSPACE}\n" + printf "source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate\n" + printf ". 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh\n" + printf "ramble workspace activate .\n" + + printf "\n- Sleeping for 1 day to allow introspection -\n" + sleep 86400 + + restartPolicy: Never backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml index ae7753516e..e28e2ad1d9 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml @@ -90,7 +90,7 @@ data: #!/bin/bash set -e cd "{experiment_run_dir}" - printf "Submitting ${experiment_name}\n" + printf "Submitting {experiment_name}\n" kubectl create -f jobset 2>&1 | tee klog collect_logs.tpl: | @@ -138,6 +138,7 @@ data: sysnet_subnet_prefix: a3u-gke-gcs-sub gpu_subnet_prefix: a3u-gke-gcs-rdma-sub cluster_queue: a3u + env_vars: set: CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 @@ -146,7 +147,6 @@ data: - paths: LD_LIBRARY_PATH: /usr/local/gib/lib64 - applications: nccl-tests: workloads: @@ -508,6 +508,9 @@ spec: printf "\n------- Analyzing benchmark logs ------\n" ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + printf "\n------- Archiving ramble workspace ------\n" + ramble workspace archive -t --where '{n_nodes} <= '"${N_NODES}" + printf "\n---- SUMMARY for >1GB Message Sizes ----\n" jq -r '["workload","n_nodes","msg_size","busbw"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | { @@ -522,5 +525,27 @@ spec: | @tsv' results.latest.json printf "\n-------- Benchmarking Complete -------\n" + ARCHIVE_TAR=$(readlink archive/archive.latest.tar.gz) + ARCHIVE_PATH=${RAMBLE_WORKSPACE}/archive/${ARCHIVE_TAR} + RESULTS_FILE=$(basename $(readlink results.latest.json)) + RESULTS_PATH=${RAMBLE_WORKSPACE}/${RESULTS_FILE} + + printf "\n# To copy the full results from container:\n" + printf "kubectl cp %s:%s %s\n" $(hostname) ${RESULTS_PATH} ${RESULTS_FILE} + printf "\n# To copy the ramble workspace archive from container:\n" + printf "kubectl cp %s:%s ./%s\n" $(hostname) ${ARCHIVE_PATH} ${ARCHIVE_TAR} + + printf "\n# To re-activate ramble workspace, first access runner:\n" + printf "kubectl exec -it %s -- /bin/bash\n" $(hostname) + printf "# Then run:\n" + printf "cd ${RAMBLE_WORKSPACE}\n" + printf "source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate\n" + printf ". 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh\n" + printf "ramble workspace activate .\n" + + printf "\n- Sleeping for 1 day to allow introspection -\n" + sleep 86400 + + restartPolicy: Never backoffLimit: 4 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml index fe5642b4f1..90ac428240 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml @@ -146,11 +146,36 @@ data: nemo_version: 24.07 processes_per_node: 8 + # Shared NeMo Configurations + trainer.max_steps: 10 + trainer.val_check_interval: null + trainer.limit_val_batches: 0.0 + trainer.log_every_n_steps: 1 + trainer.enable_model_summary: false + + model.tokenizer.library: megatron + model.tokenizer.type: GPT2BPETokenizer + model.tokenizer.model: null + model.tokenizer.delimiter : null + model.tokenizer.vocab_file: gpt2-vocab.json + model.tokenizer.merge_file: gpt2-merges.txt + model.data.data_impl: mock + model.data.data_prefix: [] + model.data.splits_string: 98,1,1 + + exp_manager.resume_if_exists: false + exp_manager.create_checkpoint_callback: false + exp_manager.create_dllogger_logger: false + exp_manager.checkpoint_callback_params.save_top_k: 1 + exp_manager.checkpoint_callback_params.model_parallel_size: ${multiply:$\{model.tensor_model_parallel_size}, $\{model.pipeline_model_parallel_size}} + exp_manager.exp_dir: '{experiment_run_dir}' + # Potentially need to be modified gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool sysnet_subnet_prefix: a3u-gke-gcs-sub gpu_subnet_prefix: a3u-gke-gcs-rdma-sub cluster_queue: a3u + env_vars: set: CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 @@ -168,7 +193,6 @@ data: - paths: LD_LIBRARY_PATH: /usr/local/gib/lib64 - applications: py-nemo: workloads: @@ -176,106 +200,44 @@ data: experiments: mixtral-{n_nodes}-nodes: variables: - n_nodes: [8,16,32] + n_nodes: [8, 16, 32] jobset_name: 'm8x7b-{n_nodes}' nemo_stage: training nemo_model: mixtral nemo_config_name: mixtral_8x7b - max_steps: 10 - trainer.max_steps: '{max_steps}' - trainer.val_check_interval: 50 - trainer.log_every_n_steps: 1 - trainer.enable_model_summary: false - - model.tokenizer.library: megatron - model.tokenizer.type: GPT2BPETokenizer - model.tokenizer.model: null - model.tokenizer.delimiter : null - model.tokenizer.vocab_file: gpt2-vocab.json - model.tokenizer.merge_file: gpt2-merges.txt - - exp_manager.exp_dir: '{experiment_run_dir}' - - run.time_limit: 01:00:00 - - model.data.data_impl: mock - model.data.data_prefix: [] - model.data.splits_string: 90,8,2 model.data.num_workers: 4 - - model.optim.contiguous_grad_buffer: true - model.optim.contiguous_param_buffer: true - + model.fp8_params: true + model.gc_interval: 0 model.global_batch_size: 1024 model.micro_batch_size: 2 - model.virtual_pipeline_model_parallel_size: null + model.moe_grouped_gemm: false + model.optim.contiguous_grad_buffer: true + model.optim.contiguous_param_buffer: true model.pipeline_model_parallel_size: 1 - model.gc_interval: 100 - model.fp8_params: true - - model.nsys_profile.ranks: [0, 8] - model.nsys_profile.start_step: 27 - model.nsys_profile.end_step: 29 - - # Checkpoint saving & logging - exp_manager.resume_if_exists: false - exp_manager.create_checkpoint_callback: false + model.virtual_pipeline_model_parallel_size: null llama3-{n_nodes}-nodes: variables: - n_nodes: [8,16,32] + n_nodes: [8, 16, 32] jobset_name: 
'llama-{n_nodes}' nemo_stage: training nemo_model: llama nemo_config_name: llama3_1_70b - max_steps: 10 - trainer.max_steps: '{max_steps}' - trainer.val_check_interval: 200 - trainer.log_every_n_steps: 1 - trainer.limit_val_batches: 5 - trainer.limit_test_batches: 5 - - model.tokenizer.library: megatron - model.tokenizer.type: GPT2BPETokenizer - model.tokenizer.model: null - model.tokenizer.delimiter : null - model.tokenizer.vocab_file: gpt2-vocab.json - model.tokenizer.merge_file: gpt2-merges.txt - - exp_manager.exp_dir: '{experiment_run_dir}' - exp_manager.checkpoint_callback_params.model_parallel_size: ${multiply:$\{model.tensor_model_parallel_size}, $\{model.pipeline_model_parallel_size}} - run.time_limit: 0-03:30:00 - - model.data.data_impl: mock - model.data.data_prefix: [] - model.data.splits_string: 90,8,2 - - model.optim.grad_sync_dtype: bf16 - - model.global_batch_size: 1024 - model.virtual_pipeline_model_parallel_size: 20 - model.tensor_model_parallel_size: 2 + model.data.num_workers: 2 model.context_parallel_size: 1 - model.fp8: true model.fp8_e4m3: true model.fp8_hybrid: true model.fp8_params: true - + model.global_batch_size: 1024 + model.optim.grad_sync_dtype: bf16 + model.tensor_model_parallel_size: 2 model.ub_tp_comm_overlap: false - - model.nsys_profile.ranks: [0, 8] - model.nsys_profile.start_step: 17 - model.nsys_profile.end_step: 19 - - # Checkpoint saving & logging - exp_manager.resume_if_exists: false - exp_manager.create_checkpoint_callback: false - exp_manager.create_dllogger_logger: false + model.virtual_pipeline_model_parallel_size: 20 internals: custom_executables: @@ -628,6 +590,9 @@ spec: printf "\n------- Analyzing benchmark logs ------\n" ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" + printf "\n------- Archiving ramble workspace ------\n" + ramble workspace archive -t --where '{n_nodes} <= '"${N_NODES}" + printf "\n--------------- SUMMARY ---------------\n" jq -r '["nemo_config","n_nodes","step","train_step_timing"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | { @@ -637,10 +602,32 @@ spec: Context: $context.name } + ($context.foms | from_entries ) - | select (.Context == "0-5/5") + | select (.Context == "0-10/10") | [.name, .n_nodes, .Context, .train_step_timing]) | @tsv' results.latest.json printf "\n-------- Benchmarking Complete -------\n" + ARCHIVE_TAR=$(readlink archive/archive.latest.tar.gz) + ARCHIVE_PATH=${RAMBLE_WORKSPACE}/archive/${ARCHIVE_TAR} + RESULTS_FILE=$(basename $(readlink results.latest.json)) + RESULTS_PATH=${RAMBLE_WORKSPACE}/${RESULTS_FILE} + + printf "\n# To copy the full results from container:\n" + printf "kubectl cp %s:%s %s\n" $(hostname) ${RESULTS_PATH} ${RESULTS_FILE} + printf "\n# To copy the ramble workspace archive from container:\n" + printf "kubectl cp %s:%s ./%s\n" $(hostname) ${ARCHIVE_PATH} ${ARCHIVE_TAR} + + printf "\n# To re-activate ramble workspace, first access runner:\n" + printf "kubectl exec -it %s -- /bin/bash\n" $(hostname) + printf "# Then run:\n" + printf "cd ${RAMBLE_WORKSPACE}\n" + printf "source "${SOFTWARE_INSTALL}"/ramble/env/bin/activate\n" + printf ". 
${SOFTWARE_INSTALL}/ramble/share/ramble/setup-env.sh\n" + printf "ramble workspace activate .\n" + + printf "\n- Sleeping for 1 day to allow introspection -\n" + sleep 86400 + + restartPolicy: Never backoffLimit: 4 From 8e823412e0a017bc7d716116239d34750feb64a2 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sat, 18 Jan 2025 23:25:12 +0000 Subject: [PATCH 12/15] Update kueue config to best practices --- .../a3u-gke-gcs/a3u-gke-gcs.yaml | 3 ++- .../kueue-configuration.yaml.tftpl | 24 ++++++++++++++----- .../system_benchmarks/ramble-hpl.yaml | 3 ++- .../system_benchmarks/ramble-nccl.yaml | 3 ++- .../system_benchmarks/ramble-nemo.yaml | 3 ++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml index ce6e3a34f0..7bc4e79d26 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml +++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml @@ -186,7 +186,8 @@ deployment_groups: version: v0.10.0 config_path: $(ghpc_stage("kueue-configuration.yaml.tftpl")) config_template_vars: - num_gpus: $(vars.num_gpus) + num_gpus: $(a3-ultragpu-pool.static_gpu_count) + node_pool_name: $(a3-ultragpu-pool.node_pool_name) jobset: install: true version: v0.7.2 diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl index 97cbaede33..1beffe206d 100644 --- a/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl +++ b/examples/hypercompute_clusters/a3u-gke-gcs/kueue-configuration.yaml.tftpl @@ -26,26 +26,38 @@ spec: kind: ResourceFlavor apiVersion: kueue.x-k8s.io/v1beta1 metadata: - name: "a3u" + name: "a3-ultra-tas" spec: nodeLabels: - cloud.google.com/gke-nodepool: "a3-ultragpu-8g-a3-ultragpu-pool" + cloud.google.com/gke-nodepool: ${node_pool_name} topologyName: "gke-default" tolerations: - key: "nvidia.com/gpu" operator: "Exists" effect: NoSchedule --- +kind: ResourceFlavor +apiVersion: kueue.x-k8s.io/v1beta1 +metadata: + name: "a3-ultra" +spec: + nodeLabels: + cloud.google.com/gke-nodepool: ${node_pool_name} +--- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: "a3u" + name: "a3-ultra" spec: namespaceSelector: {} # match all. 
   resourceGroups:
   - coveredResources: ["nvidia.com/gpu"]
     flavors:
-    - name: "a3u"
+    - name: "a3-ultra"
+      resources:
+      - name: "nvidia.com/gpu"
+        nominalQuota: ${num_gpus}
+    - name: "a3-ultra-tas"
       resources:
       - name: "nvidia.com/gpu"
         nominalQuota: ${num_gpus}
@@ -54,6 +66,6 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "default"
-  name: "a3u"
+  name: "a3-ultra"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
index bfeeecfbdb..93e1d57462 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -140,7 +140,7 @@ data:
       gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool
       sysnet_subnet_prefix: a3u-gke-gcs-sub
       gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
-      cluster_queue: a3u
+      cluster_queue: a3-ultra-tas

       env_vars:
         set:
@@ -278,6 +278,7 @@ data:
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: {gke_nodepool}
+            cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
           tolerations:
           - key: cloud.google.com/gke-queued
             effect: NoSchedule
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
index e28e2ad1d9..9abc8afc60 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
@@ -137,7 +137,7 @@ data:
       gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool
       sysnet_subnet_prefix: a3u-gke-gcs-sub
       gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
-      cluster_queue: a3u
+      cluster_queue: a3-ultra-tas

       env_vars:
         set:
@@ -256,6 +256,7 @@ data:
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: {gke_nodepool}
+            cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
           tolerations:
           - key: cloud.google.com/gke-queued
             effect: NoSchedule
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
index 90ac428240..3ea6338ed0 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
@@ -174,7 +174,7 @@ data:
      gke_nodepool: a3-ultragpu-8g-a3-ultragpu-pool
       sysnet_subnet_prefix: a3u-gke-gcs-sub
       gpu_subnet_prefix: a3u-gke-gcs-rdma-sub
-      cluster_queue: a3u
+      cluster_queue: a3-ultra-tas

       env_vars:
         set:
@@ -330,6 +330,7 @@ data:
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: {gke_nodepool}
+            cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
           tolerations:
           - key: cloud.google.com/gke-queued
             effect: NoSchedule

From 642239f45794300d40c74989dd201f664b2299c0 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Sat, 18 Jan 2025 23:40:48 +0000
Subject: [PATCH 13/15] Remove manual num_gpus

---
 examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml | 4 ----
 examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml  | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
index 7bc4e79d26..a38a901f71 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/a3u-gke-gcs.yaml
@@ -32,10 +32,6 @@ vars:
   # The number of nodes to be created
   static_node_count:

-  # Number of H200 GPUs (for later use by Kueue), which
-  # This should be 8 x static_node_count.
-  num_gpus:
-
   # Cidr block containing the IP of the machine calling terraform.
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
   # To allow only your IP address, use /32
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
index 48b8b200d8..a47ac61692 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/deployment.yaml
@@ -35,10 +35,6 @@ vars:
   # The number of nodes to be created
   static_node_count:

-  # Number of H200 GPUs (for later use by Kueue), which
-  # This should be 8 x static_node_count.
-  num_gpus:
-
   # Cidr block containing the IP of the machine calling terraform.
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
   # To allow only your IP address, use /32

From deaab229f0b1f9c061cdceb9ac2b9bbb0aed00a8 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Sun, 19 Jan 2025 00:29:38 +0000
Subject: [PATCH 14/15] Update reference workloads to new kueue config.

---
 .../a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml  | 4 ++--
 .../a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml | 6 +++---
 .../a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
index 93e1d57462..7ecff65571 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -23,9 +23,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "ramble"
-  name: "a3u"
+  name: "a3-ultra-tas"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
 ---
 apiVersion: v1
 kind: ServiceAccount
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
index 9abc8afc60..22cdd510ca 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nccl.yaml
@@ -23,9 +23,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "ramble"
-  name: "a3u"
+  name: "a3-ultra-tas"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
 ---
 apiVersion: v1
 kind: ServiceAccount
@@ -488,7 +488,7 @@ spec:
           # Get number of GPUs / nodes available in this cluster from Kueue:
           AVAILABLE_GPUS=$(
             kubectl get clusterqueues.kueue.x-k8s.io -o json |
-              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") |
+              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name=="a3-ultra-tas") |
                 .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota'
           )

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
index 3ea6338ed0..a0d4cf0f9b 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
@@ -23,9 +23,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "ramble"
-  name: "a3u"
+  name: "a3-ultra-tas"
 spec:
-  clusterQueue: "a3u"
+  clusterQueue: "a3-ultra"
 ---
 apiVersion: v1
 kind: ServiceAccount

From 456678c567ae0ca37c1bc9b1da063f606bb99a13 Mon Sep 17 00:00:00 2001
From: Sam Skillman
Date: Mon, 20 Jan 2025 14:52:38 +0000
Subject: [PATCH 15/15] One more fix for new kueue config

---
 .../a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml  | 2 +-
 .../a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
index 7ecff65571..0c2911abd5 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-hpl.yaml
@@ -515,7 +515,7 @@ spec:
           # Get number of GPUs / nodes available in this cluster from Kueue:
           AVAILABLE_GPUS=$(
             kubectl get clusterqueues.kueue.x-k8s.io -o json |
-              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") |
+              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name=="a3-ultra-tas") |
                 .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota'
           )
diff --git a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
index a0d4cf0f9b..caf807d9b5 100644
--- a/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
+++ b/examples/hypercompute_clusters/a3u-gke-gcs/system_benchmarks/ramble-nemo.yaml
@@ -570,7 +570,7 @@ spec:
           # Get number of GPUs / nodes available in this cluster from Kueue:
           AVAILABLE_GPUS=$(
             kubectl get clusterqueues.kueue.x-k8s.io -o json |
-              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name="a3u") |
+              jq -r '.items[].spec.resourceGroups[].flavors[] | select (.name=="a3-ultra-tas") |
                 .resources[] | select (.name="nvidia.com/gpu") | .nominalQuota'
           )
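
Note on the resulting Kueue objects: after this series, the ClusterQueue is `a3-ultra` with two flavors (`a3-ultra`, plus the topology-aware `a3-ultra-tas`), the `default`-namespace LocalQueue is `a3-ultra`, and the benchmark (`ramble`-namespace) LocalQueue is `a3-ultra-tas`. The sketch below is a hand-run verification, not part of the patches; it assumes kubectl access to the deployed cluster, and it uses jq's strict `==` comparison on both `select` calls. A bare `=` in jq is an assignment that `select` treats as always true; the benchmark scripts get the right answer with `=` on the resource name only because each flavor lists exactly one resource.

```bash
# Confirm the renamed Kueue objects exist (names per patches 12-14).
kubectl get clusterqueue a3-ultra
kubectl get localqueue -n default a3-ultra
kubectl get localqueue -n ramble a3-ultra-tas  # only after a benchmark manifest is applied

# Read the nominal GPU quota of the topology-aware flavor, mirroring the
# AVAILABLE_GPUS query in the benchmark scripts but with '==' throughout.
kubectl get clusterqueues.kueue.x-k8s.io -o json |
  jq -r '.items[].spec.resourceGroups[].flavors[]
         | select(.name == "a3-ultra-tas")
         | .resources[]
         | select(.name == "nvidia.com/gpu")
         | .nominalQuota'
```

Since both flavors take their `nominalQuota` from the node pool's `static_gpu_count`, the last command should print 8 x static_node_count for the deployed pool.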