updates to HyperPod Observability Lifecycle scripts (#326)

* updates to install dcgm_exporter to resolve versions, make install efa exporter idempotent, add script to update prometheus config which is idempotent * Update install_efa_node_exporter.sh Fix file for install_efa_node_exporter.sh * Update install_efa_node_exporter.sh
aws-samples · May 14, 2024 · 3823e9a · 3823e9a
1 parent 6649635
commit 3823e9a
Show file tree

Hide file tree

Showing 3 changed files with 161 additions and 107 deletions.
diff --git a/...tectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh b/...tectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
@@ -2,119 +2,31 @@
 
 if nvidia-smi; then
     echo "NVIDIA GPU found. Proceeding with script..."
-
-    ### Create directory if it doesn't exist
-    sudo mkdir -p /opt/dcgm-exporter/
-    ### write a csv file to define dcgm metric collection (includes more metrics than default file)
 
-    sudo tee /opt/dcgm-exporter/dcgm-golden-metrics.csv > /dev/null <<EOF
-# Format
-# If line starts with a '#' it is considered a comment
-# DCGM FIELD, Prometheus metric type, help message
+    # Get the instance-type from EC2 instance metadata via IMDSv2 (session token first, then the query); -s keeps curl's progress meter out of the logs
+    TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+    INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)
 
-# Clocks
-DCGM_FI_DEV_SM_CLOCK,  gauge, SM clock frequency (in MHz).
-DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
-
-# Temperature
-DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
-DCGM_FI_DEV_GPU_TEMP,    gauge, GPU temperature (in C).
-
-# Power
-DCGM_FI_DEV_POWER_USAGE,              gauge, Power draw (in W).
-DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
-
-# PCIE
-# DCGM_FI_DEV_PCIE_TX_THROUGHPUT,  counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
-# DCGM_FI_DEV_PCIE_RX_THROUGHPUT,  counter, Total number of bytes received through PCIe RX (in KB) via NVML.
-DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
-
-# Utilization (the sample period varies depending on the product)
-DCGM_FI_DEV_GPU_UTIL,      gauge, GPU utilization (in %).
-DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
-DCGM_FI_DEV_ENC_UTIL,      gauge, Encoder utilization (in %).
-DCGM_FI_DEV_DEC_UTIL ,     gauge, Decoder utilization (in %).
-
-# Errors and violations
-DCGM_FI_DEV_XID_ERRORS,            gauge,   Value of the last XID error encountered.
-DCGM_FI_DEV_POWER_VIOLATION,       counter, Throttling duration due to power constraints (in us).
-DCGM_FI_DEV_THERMAL_VIOLATION,     counter, Throttling duration due to thermal constraints (in us).
-DCGM_FI_DEV_SYNC_BOOST_VIOLATION,  counter, Throttling duration due to sync-boost constraints (in us).
-DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
-DCGM_FI_DEV_LOW_UTIL_VIOLATION,    counter, Throttling duration due to low utilization (in us).
-DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
-
-# Memory usage
-DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
-DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
-
-# ECC
-DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
-DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
-DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
-DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
-
-# Retired pages
-DCGM_FI_DEV_RETIRED_SBE,     counter, Total number of retired pages due to single-bit errors.
-DCGM_FI_DEV_RETIRED_DBE,     counter, Total number of retired pages due to double-bit errors.
-DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
-
-# NVLink
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,   counter, Total number of NVLink retries.
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
-DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,            counter, Total number of NVLink bandwidth counters for all lanes.
-# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0,               counter, The number of bytes of active NVLink rx or tx data including both header and payload.
-
-# VGPU License status
-DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
-
-# Remapped rows
-DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
-DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS,   counter, Number of remapped rows for correctable errors
-DCGM_FI_DEV_ROW_REMAP_FAILURE,           gauge,   Whether remapping of rows has failed
-DCGM_FI_DEV_ROW_REMAP_PENDING,           gauge,   Whether remapping of rows is pending.
-
-# Static configuration information. These appear as labels on the other metrics
-DCGM_FI_DRIVER_VERSION,        label, Driver Version
-# DCGM_FI_NVML_VERSION,          label, NVML Version
-# DCGM_FI_DEV_BRAND,             label, Device Brand
-DCGM_FI_DEV_SERIAL,            label, Device Serial Number
-# DCGM_FI_DEV_OEM_INFOROM_VER,   label, OEM inforom version
-# DCGM_FI_DEV_ECC_INFOROM_VER,   label, ECC inforom version
-# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
-# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
-# DCGM_FI_DEV_VBIOS_VERSION,     label, VBIOS version of the device
-
-# DCP metrics
-DCGM_FI_PROF_GR_ENGINE_ACTIVE,   gauge, Ratio of time the graphics engine is active (in %).
-# DCGM_FI_PROF_SM_ACTIVE,          gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
-# DCGM_FI_PROF_SM_OCCUPANCY,       gauge, The ratio of number of warps resident on an SM (in %).
-DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
-DCGM_FI_PROF_DRAM_ACTIVE,        gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
-# DCGM_FI_PROF_PIPE_FP64_ACTIVE,   gauge, Ratio of cycles the fp64 pipes are active (in %).
-# DCGM_FI_PROF_PIPE_FP32_ACTIVE,   gauge, Ratio of cycles the fp32 pipes are active (in %).
-# DCGM_FI_PROF_PIPE_FP16_ACTIVE,   gauge, Ratio of cycles the fp16 pipes are active (in %).
-DCGM_FI_PROF_PCIE_TX_BYTES,      gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
-DCGM_FI_PROF_PCIE_RX_BYTES,      gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
-EOF
-
-    echo "NVIDIA GPU found. Proceeding with script..."
-    # Set DCGM Exporter version
-    DCGM_EXPORTER_VERSION=3.3.5-3.4.0-ubuntu22.04
+    # Set DCGM-Exporter-Version, for g5s, use older version (https://github.com/NVIDIA/dcgm-exporter/issues/319); assign first and log from the variable so the message always matches the tag that is pulled
+    if [[ $INSTANCE_TYPE == *"g5"* ]]; then
+        DCGM_EXPORTER_VERSION=2.1.4-2.3.1-ubuntu20.04
+        echo "Instance Type is recognized as $INSTANCE_TYPE setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"
+    else
+        DCGM_EXPORTER_VERSION=3.3.5-3.4.1-ubuntu22.04
+        echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"
+    fi
+    echo "DCGM_EXPORTER_VERSION = $DCGM_EXPORTER_VERSION"
 
     # Run the DCGM Exporter Docker container
     sudo docker run -d --restart always \
        --gpus all \
        --net host \
        --cap-add SYS_ADMIN \
-       -v /opt/dcgm-exporter/dcgm-golden-metrics.csv:/etc/dcgm-exporter/dcgm-golden-metrics.csv \
        nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION} \
-       -f /etc/dcgm-exporter/dcgm-golden-metrics.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
+       -f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
 
     echo "Running DCGM exporter in a Docker container on port 9400..."
 else
     echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
     exit 0
-fi
+fi
diff --git a/...ures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh b/...ures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
@@ -1,18 +1,27 @@
 #!/bin/bash
 
-# Clone the repository
-git clone https://github.com/aws-samples/awsome-distributed-training.git || { echo "Failed to clone the repository"; exit 1; }
+# Local checkout directory and the remote it is cloned from
+REPO_DIR="awsome-distributed-training"
+REPO_URL="https://github.com/aws-samples/awsome-distributed-training.git"
+
+# Clone only when the checkout directory is missing, so re-running the script does not fail on an existing clone
+if [ -d "$REPO_DIR" ]; then
+    echo "Repository already exists, skipping cloning."
+else
+    # Shallow clone (depth 1) into REPO_DIR explicitly, so the directory tested above is the one git creates
+    git clone --depth=1 "$REPO_URL" "$REPO_DIR" || { echo "Failed to clone the repository"; exit 1; }
+fi
+
 # Change directory to the desired location
-cd awsome-distributed-training/4.validation_and_observability/3.efa-node-exporter || { echo "Failed to change directory"; exit 1; }
+cd "$REPO_DIR/4.validation_and_observability/3.efa-node-exporter" || { echo "Failed to change directory"; exit 1; }
 
 # Build the Docker image explicitly
 sudo docker build -t node_exporter_efa:latest . || { echo "Failed to build Docker image"; exit 1; }
 
 # Run the Docker container with appropriate configurations
-sudo docker run -d --restart always\
+sudo docker run -d --restart always \
   --net="host" \
   --pid="host" \
   -v "/:/host:ro,rslave" \
   node_exporter_efa:latest \
   --path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
-
diff --git a/4.validation_and_observability/4.prometheus-grafana/update-prometheus.sh b/4.validation_and_observability/4.prometheus-grafana/update-prometheus.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+
+# Exactly one positional argument (the workspace ID) is required; otherwise print usage and stop
+if [[ "$#" -ne 1 ]]; then
+    printf '%s\n' "Usage: $0 <WORKSPACEID>"
+    exit 1
+fi
+
+WORKSPACEID=$1
+
+
+# Retrieve IMDSv2 Token to fetch region of current EC2 Instance (Head Node)
+echo "Retrieving IMDSv2 Token to fetch region of current EC2 Instance (Head Node)"
+TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -s)
+REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/placement/region)
+[ -n "$REGION" ] || { echo "Error: Failed to retrieve the region from instance metadata."; exit 1; }  # an empty region would produce a malformed remote-write URL
+
+# Build the AMP remote-write URL from the region and the workspace ID passed on the command line
+echo "Constructing AMPRemoteWriteURL from region and workspace ID"
+AMPREMOTEWRITEURL="https://aps-workspaces.$REGION.amazonaws.com/workspaces/$WORKSPACEID/api/v1/remote_write"
+
+# Retrieve compute nodes from scontrol: take the NodeAddr=<addr> field of every node and join the addresses with commas
+echo "Retrieving compute nodes from scontrol"
+export COMPUTENODES=$(scontrol show nodes | awk '/NodeAddr/ {print $1}' | cut -d '=' -f 2 | paste -sd "," -)
+
+# Print one indented YAML list item ("- '<node>:<port>'") per node; $1 = exporter port, $2 = comma-separated node addresses
+generate_targets() {
+    local port="$1" nodes="$2" node_ip
+    local -a nodes_array   # scratch array stays local instead of leaking into the global scope
+    IFS=',' read -r -a nodes_array <<< "$nodes"
+    for node_ip in "${nodes_array[@]}"; do
+        echo "          - '${node_ip}:${port}'"
+    done
+}
+
+if command -v prometheus &>/dev/null; then
+    echo "Prometheus is already installed. Skipping installation."
+else
+    # Retrieve the latest Prometheus version from GitHub releases (tag_name with the leading "v" stripped)
+    echo "Retrieving the latest Prometheus version..."
+    LATEST_VERSION=$(curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest | grep -oP '"tag_name": "\K(.*?)(?=")' | sed 's/^v//')
+
+    # Check if the latest version retrieval was successful
+    if [ -z "$LATEST_VERSION" ]; then
+        echo "Error: Failed to retrieve the latest Prometheus version."
+        exit 1
+    fi
+
+    echo "Latest Prometheus version: $LATEST_VERSION"
+
+    # Construct the download URL with the correct version format
+    DOWNLOAD_URL="https://github.com/prometheus/prometheus/releases/download/v$LATEST_VERSION/prometheus-$LATEST_VERSION.linux-amd64.tar.gz"
+
+    # Download the release tarball under its own file name (-O overwrites a stale copy); stop on failure instead of continuing without it
+    echo "Downloading Prometheus version $LATEST_VERSION from $DOWNLOAD_URL ..."
+    wget --progress=dot:giga -O "${DOWNLOAD_URL##*/}" "$DOWNLOAD_URL" || { echo "Error: Failed to download $DOWNLOAD_URL"; exit 1; }
+
+    # Extract exactly the tarball that was just downloaded (no glob, so leftover archives are ignored)
+    echo "Extracting Prometheus"
+    tar xvfz "${DOWNLOAD_URL##*/}" || { echo "Error: Failed to extract ${DOWNLOAD_URL##*/}"; exit 1; }
+
+    # Move to the extracted Prometheus directory; abort if it is missing so the mv commands below never run elsewhere
+    cd "prometheus-$LATEST_VERSION.linux-amd64" || { echo "Error: Failed to enter prometheus-$LATEST_VERSION.linux-amd64"; exit 1; }
+
+    # Move binaries to /usr/bin/
+    echo "Moving Prometheus binaries to /usr/bin/"
+    sudo mv prometheus /usr/bin/
+    sudo mv promtool /usr/bin/
+
+    # Create Prometheus config directory
+    echo "Creating Prometheus config directory"
+    sudo mkdir -p /etc/prometheus
+
+    # Move prometheus.yml to config directory
+    echo "Moving prometheus.yml to /etc/prometheus/"
+    sudo mv prometheus.yml /etc/prometheus/prometheus.yml
+fi
+
+# (Re)write /etc/prometheus/prometheus.yml in full; the unquoted heredoc expands the generate_targets calls (ports 9400 and 9100 for every node in COMPUTENODES) plus AMPREMOTEWRITEURL and REGION
+echo "Replacing placeholders in the Prometheus configuration template"
+sudo tee /etc/prometheus/prometheus.yml > /dev/null <<EOF
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  scrape_timeout: 15s
+
+scrape_configs:
+  - job_name: 'slurm_exporter'
+    static_configs:
+      - targets:
+          - 'localhost:8080'
+  - job_name: 'dcgm_exporter'
+    static_configs:
+      - targets:
+$(generate_targets 9400 "$COMPUTENODES")
+  - job_name: 'efa_node_exporter'
+    static_configs:
+      - targets:
+$(generate_targets 9100 "$COMPUTENODES")
+
+remote_write:
+  - url: ${AMPREMOTEWRITEURL}
+    queue_config:
+      max_samples_per_send: 1000
+      max_shards: 200
+      capacity: 2500
+    sigv4:
+      region: ${REGION}
+EOF
+
+# Create Prometheus systemd service file; PATH is spelled out in full because systemd does not expand $PATH inside Environment=
+echo "Creating Prometheus systemd service file"
+sudo tee /etc/systemd/system/prometheus.service > /dev/null <<EOF
+[Unit]
+Description=Prometheus Exporter
+
+[Service]
+Environment=PATH=/opt/slurm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --enable-feature=agent --storage.agent.path="/opt/prometheus/data-agent"
+Restart=on-failure
+RestartSec=15
+Type=simple
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+# Reload systemd, enable Prometheus so it starts at boot, then restart it so the regenerated config is loaded; report failure instead of claiming success
+echo "Reloading systemd and enabling Prometheus service"
+sudo systemctl daemon-reload
+sudo systemctl enable prometheus && sudo systemctl restart prometheus || { echo "Failed to enable or restart Prometheus service"; exit 1; }
+
+echo "Prometheus setup completed successfully"