diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py
index e26f6a5f..d02ff1c3 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py
@@ -151,10 +151,20 @@ def main(args):
     ExecuteBashScript("./start_slurm.sh").run(node_type, ",".join(controllers))
 
-    # Note: Uncomment the below lines to install docker and enroot
-    # ExecuteBashScript("./utils/install_docker.sh").run()
-    # ExecuteBashScript("./utils/install_enroot_pyxis.sh").run(node_type)
+    ## Note: Docker and Enroot are installed below. Comment out these lines to skip installation.
+    ExecuteBashScript("./utils/install_docker.sh").run()
+    ExecuteBashScript("./utils/install_enroot_pyxis.sh").run(node_type)
 
+    # # Note: Uncomment the below lines to install DCGM Exporter and EFA Node Exporter on Compute Nodes. (Docker must also be installed above)
+    # if node_type == SlurmNodeType.COMPUTE_NODE:
+    #     ExecuteBashScript("./utils/install_dcgm_exporter.sh").run()
+    #     ExecuteBashScript("./utils/install_efa_node_exporter.sh").run()
+
+    # # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node.
+    # if node_type == SlurmNodeType.HEAD_NODE:
+    #     ExecuteBashScript("./utils/install_slurm_exporter.sh").run()
+    #     ExecuteBashScript("./utils/install_prometheus.sh").run()
+
     print("[INFO]: Success: All provisioning scripts completed")
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
new file mode 100644
index 00000000..f3a842fa
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Check if an NVIDIA GPU is present
+if nvidia-smi; then
+  echo "NVIDIA GPU found. Proceeding with script..."
+  # Set DCGM Exporter version
+  DCGM_EXPORTER_VERSION=2.1.4-2.3.1
+
+  # Run the DCGM Exporter Docker container
+  sudo docker run -d --rm \
+     --gpus all \
+     --net host \
+     --cap-add SYS_ADMIN \
+     nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION}-ubuntu20.04 \
+     -f /etc/dcgm-exporter/dcp-metrics-included.csv || { echo "Failed to run DCGM Exporter Docker container"; exit 1; }
+
+  echo "Running DCGM exporter in a Docker container on port 9400..."
+else
+  echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
+  exit 0
+fi
\ No newline at end of file
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
new file mode 100644
index 00000000..48432a76
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Clone the repository
+git clone https://github.com/aws-samples/awsome-distributed-training.git || { echo "Failed to clone the repository"; exit 1; }
+# Change directory to the EFA node exporter
+cd awsome-distributed-training/4.validation_and_observability/3.efa-node-exporter || { echo "Failed to change directory"; exit 1; }
+
+# Build the Docker image explicitly
+sudo docker build -t node_exporter_efa:latest . || { echo "Failed to build Docker image"; exit 1; }
+
+# Run the Docker container with appropriate configurations
+sudo docker run -d \
+  --net="host" \
+  --pid="host" \
+  -v "/:/host:ro,rslave" \
+  node_exporter_efa:latest \
+  --path.rootfs=/host && { echo "Successfully started EFA Node Exporter on node"; exit 0; } || { echo "Failed to run Docker container"; exit 1; }
+
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_prometheus.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_prometheus.sh
new file mode 100644
index 00000000..6d5eb343
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_prometheus.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+
+# Retrieve IMDSv2 token to fetch the region of the current EC2 instance (head node)
+echo "Retrieving IMDSv2 Token to fetch region of current EC2 Instance (Head Node)"
+TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -s)
+REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/placement/region)
+
+# Retrieve AMPRemoteWriteURL from the Outputs tab of the CloudFormation stack
+echo "Retrieving AMPRemoteWriteURL from Output Tab of CloudFormation Stack"
+AMPREMOTEWRITEURL=$(aws cloudformation describe-stacks \
+--region $REGION \
+--query "Stacks[?Description != null && contains(Description, 'monitor sagemaker hyperpod')][].Outputs[?OutputKey=='AMPRemoteWriteURL'].OutputValue" \
+--output text | grep -v 'None')
+
+# Check if AMPREMOTEWRITEURL is empty
+if [ -z "$AMPREMOTEWRITEURL" ]; then
+    echo "Cluster may be in a different region than the monitoring stack. Unable to determine AMPRemoteWriteURL for Prometheus. You will need to manually edit the /etc/prometheus/prometheus.yml file on the head node and restart Prometheus to complete setup."
+fi
+
+# Retrieve compute nodes from scontrol
+echo "Retrieving compute nodes from scontrol"
+export COMPUTENODES=$(scontrol show nodes | awk '/NodeAddr/ {print $1}' | cut -d '=' -f 2 | paste -sd "," -)
+
+# Function to generate target lines for a job
+generate_targets() {
+    local port="$1"
+    local nodes="$2"
+    IFS=',' read -r -a nodes_array <<< "$nodes"
+    for node_ip in "${nodes_array[@]}"; do
+        echo "        - '${node_ip}:${port}'"
+    done
+}
+
+# Retrieve the latest Prometheus version from GitHub releases
+echo "Retrieving the latest Prometheus version..."
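+# (The grep -oP below extracts the "tag_name" value from the GitHub API JSON
+# response, e.g. "v2.45.0"; sed then strips the leading "v".)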
+LATEST_VERSION=$(curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest | grep -oP '"tag_name": "\K(.*?)(?=")' | sed 's/^v//')
+
+# Check if the latest version retrieval was successful
+if [ -z "$LATEST_VERSION" ]; then
+    echo "Error: Failed to retrieve the latest Prometheus version."
+    exit 1
+fi
+
+echo "Latest Prometheus version: $LATEST_VERSION"
+
+# Construct the download URL with the correct version format
+DOWNLOAD_URL="https://github.com/prometheus/prometheus/releases/download/v$LATEST_VERSION/prometheus-$LATEST_VERSION.linux-amd64.tar.gz"
+
+# Download the latest Prometheus release tarball
+echo "Downloading Prometheus version $LATEST_VERSION from $DOWNLOAD_URL ..."
+wget "$DOWNLOAD_URL"
+
+# Extract Prometheus
+echo "Extracting Prometheus"
+tar xvfz prometheus-*.tar.gz
+
+# Move to Prometheus directory
+cd prometheus-*-amd64
+
+# Move binaries to /usr/bin/
+echo "Moving Prometheus binaries to /usr/bin/"
+sudo mv prometheus /usr/bin/
+sudo mv promtool /usr/bin/
+
+# Create Prometheus config directory
+echo "Creating Prometheus config directory"
+sudo mkdir -p /etc/prometheus
+
+# Move prometheus.yml to config directory
+echo "Moving prometheus.yml to /etc/prometheus/"
+sudo mv prometheus.yml /etc/prometheus/prometheus.yml
+
+# Replace placeholders in the configuration template: remote write URL, region, and compute node targets
+echo "Replacing placeholders in the Prometheus configuration template"
+sudo tee /etc/prometheus/prometheus.yml > /dev/null <<EOF
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  scrape_timeout: 15s
+
+scrape_configs:
+  - job_name: 'slurm_exporter'
+    static_configs:
+      - targets:
+        - 'localhost:8080'
+  - job_name: 'dcgm_exporter'
+    static_configs:
+      - targets:
+$(generate_targets 9400 "$COMPUTENODES")
+  - job_name: 'efa_node_exporter'
+    static_configs:
+      - targets:
+$(generate_targets 9100 "$COMPUTENODES")
+
+remote_write:
+  - url: ${AMPREMOTEWRITEURL}
+    queue_config:
+      max_samples_per_send: 1000
+      max_shards: 200
+      capacity: 2500
+    sigv4:
+      region: ${REGION}
+EOF
+
+# Create the Prometheus systemd service
+sudo tee /etc/systemd/system/prometheus.service > /dev/null <<EOF
+[Unit]
+Description=Prometheus Exporter
+
+[Service]
+ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml
+Restart=on-failure
+RestartSec=15
+Type=simple
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+# Reload systemd and start Prometheus
+sudo systemctl daemon-reload
+sudo systemctl enable --now prometheus
+
+echo "Prometheus setup completed successfully"
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_slurm_exporter.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_slurm_exporter.sh
new file mode 100644
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_slurm_exporter.sh
+#!/bin/bash
+
+# Install the SLURM exporter only where slurmctld is running (the controller node)
+if systemctl is-active --quiet slurmctld; then
+  # Check if Go is installed
+  if ! command -v go &> /dev/null; then
+    echo "Go is not installed. Installing Go..."
+    sudo apt install -y golang
+  else
+    echo "Go is already installed."
+  fi
+  echo "This was identified as the controller node because Slurmctld is running. Beginning SLURM Exporter installation"
+  git clone -b 0.20 https://github.com/vpenso/prometheus-slurm-exporter.git
+  cd prometheus-slurm-exporter
+  sudo make && sudo cp bin/prometheus-slurm-exporter /usr/bin/
+  sudo tee /etc/systemd/system/prometheus-slurm-exporter.service > /dev/null <<EOF
+[Unit]
+Description=Prometheus SLURM Exporter
+
+[Service]
+Environment=PATH=/opt/slurm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ExecStart=/usr/bin/prometheus-slurm-exporter
+Restart=on-failure
+RestartSec=15
+Type=simple
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  # Reload systemd and start the SLURM exporter on its default port (8080)
+  sudo systemctl daemon-reload
+  sudo systemctl enable --now prometheus-slurm-exporter
+else
+  echo "Slurmctld is not running. SLURM Exporter was not installed. If this is a compute node, you can safely ignore this warning."
+fi
diff --git a/4.validation_and_observability/4.prometheus-grafana/README.md b/4.validation_and_observability/4.prometheus-grafana/README.md
new file mode 100644
--- /dev/null
+++ b/4.validation_and_observability/4.prometheus-grafana/README.md
+# Monitor SageMaker HyperPod Clusters with Amazon Managed Prometheus and Amazon Managed Grafana
+
+This repository provides a comprehensive guide for deploying an observability stack tailored to enhance monitoring capabilities for your SageMaker HyperPod cluster. It demonstrates how to export both cluster metrics (SLURM-exporter) and node metrics (DCGM-exporter, EFA-node-exporter) to a Prometheus/Grafana monitoring stack. This setup enables your administrators, ML-ops teams, and model developers to access real-time metrics, offering valuable insights into your cluster's performance.
+
+To get started, you will initiate the provisioning of an AWS CloudFormation stack within your AWS account. You can find the complete stack template in [cluster-observability.yaml](./cluster-observability.yaml).
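+
+If you prefer the AWS CLI to the console-based 1-Click deploy offered later in this guide, a minimal deployment sketch (assuming you have cloned this repository and hold credentials for the target account):
+
+```bash
+# Deploy the observability stack from a local copy of the template.
+# CAPABILITY_NAMED_IAM is required because the template creates a named IAM role.
+aws cloudformation deploy \
+  --template-file cluster-observability.yaml \
+  --stack-name Cluster-Observability \
+  --capabilities CAPABILITY_NAMED_IAM \
+  --region <region>   # use the same region as your HyperPod cluster
+```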
+This CloudFormation stack will orchestrate the deployment of the following resources dedicated to cluster monitoring in your AWS environment:
+
+ * [Amazon Managed Prometheus Workspace](https://aws.amazon.com/prometheus/)
+ * [Amazon Managed Grafana Workspace](https://aws.amazon.com/grafana/)
+ * Associated IAM roles and permissions
+
+
+![observability_architecture](./assets/observability_architecture.png)
+
+
+The solution uses SageMaker HyperPod [Lifecycle Scripts](https://github.com/aws-samples/awsome-distributed-training/tree/main/1.architectures/5.sagemaker-hyperpod#31-lifecycle-scripts) to bootstrap your cluster with the following open-source exporter services:
+
+| Name | Script Deployment Target | Metrics Description |
+| ------------------------------------------------------------------ | -------- | --------------------------------------------------- |
+| [`0.Prometheus Slurm Exporter`](https://github.com/vpenso/prometheus-slurm-exporter) | controller-node | SLURM accounting metrics (sinfo, sacct) |
+| [`1.EFA-Node-Exporter`](https://github.com/aws-samples/awsome-distributed-training/tree/main/4.validation_and_observability/3.efa-node-exporter) | cluster-nodes | Fork of Node Exporter that adds metrics emitted from EFA |
+| [`2.NVIDIA-DCGM-Exporter`](https://github.com/NVIDIA/dcgm-exporter) | cluster-nodes | NVIDIA DCGM metrics for NVIDIA GPUs |
+
+### Prerequisites
+
+> [!IMPORTANT]
+> To enable these exporter services, uncomment lines 154-165 of the [lifecycle_script.py](https://github.com/aws-samples/awsome-distributed-training/blob/c52a69393f4ecdaba7de8af802174d075eca3a3b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py#L154) file used when deploying your cluster. Uncommenting these lines will install and configure the necessary exporter services to export cluster metrics to the Amazon Managed Prometheus workspace. Save this file, and [upload it to the S3 bucket path](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/01-cluster/03-s3) referenced in your [`cluster-config.json`](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/01-cluster/04-create-cluster#create-cluster) file.
+
+
+> [!IMPORTANT]
+>Before proceeding, you will need to add the following AWS managed IAM policies to your AmazonSagemakerClusterExecutionRole:
+>* [AmazonPrometheusRemoteWriteAccess](https://us-east-1.console.aws.amazon.com/iam/home?/policies/details/arn%3Aaws%3Aiam%3A%3Aaws%3Apolicy%2FAmazonPrometheusRemoteWriteAccess?section=permissions#/policies/details/arn%3Aaws%3Aiam%3A%3Aaws%3Apolicy%2FAmazonPrometheusRemoteWriteAccess?section=permissions): *this will give the controller node access to write cluster metrics to the Amazon Managed Prometheus workspace you will create.*
+>* [AWSCloudFormationReadOnlyAccess](https://us-east-1.console.aws.amazon.com/iam/home?policies/details/arn%3Aaws%3Aiam%3A%3Aaws%3Apolicy%2FAWSCloudFormationReadOnlyAccess?section=permissions#/policies/details/arn%3Aaws%3Aiam%3A%3Aaws%3Apolicy%2FAWSCloudFormationReadOnlyAccess?section=permissions): *this will give the `install_prometheus.sh` script permission to read stack outputs (AMPRemoteWriteURL, Region) from your CloudFormation stack.*
+
+### Deploy the CloudFormation Stack
+
+[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/cluster-observability.yaml&stackName=Cluster-Observability)
+
+>[!IMPORTANT]
+> It is strongly recommended you deploy this stack into the same region and same account as your SageMaker HyperPod cluster. This will ensure successful execution of the Lifecycle Scripts, specifically `install_prometheus.sh`, which relies on AWS CLI commands that assume the same account and region.
+
+### Connect to the cluster
+Connect to the controller node of your cluster via SSM:
+>[!NOTE]
+>You can find the Cluster ID, Worker Group, and Instance ID of your controller node in the SageMaker console or via the AWS CLI.
+
+```bash
+aws ssm start-session --target sagemaker-cluster:<cluster_id>_<worker-group>-<instance_id>
+```
+
+Verify that the Prometheus config and service created by `install_prometheus.sh` are running on the controller node:
+```bash
+sudo systemctl status prometheus
+```
+The output should show active (running):
+![prometheus_running](./assets/prometheus_running.png)
+
+You can validate the Prometheus configuration file with:
+```bash
+cat /etc/prometheus/prometheus.yml
+```
+
+Your file should look similar to the following:
+```yaml
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  scrape_timeout: 15s
+
+scrape_configs:
+  - job_name: 'slurm_exporter'
+    static_configs:
+      - targets:
+        - 'localhost:8080'
+  - job_name: 'dcgm_exporter'
+    static_configs:
+      - targets:
+        - '<compute-node-ip>:9400'
+        - '<compute-node-ip>:9400'
+  - job_name: 'efa_node_exporter'
+    static_configs:
+      - targets:
+        - '<compute-node-ip>:9100'
+        - '<compute-node-ip>:9100'
+
+remote_write:
+  - url: <AMPRemoteWriteURL>
+    queue_config:
+      max_samples_per_send: 1000
+      max_shards: 200
+      capacity: 2500
+    sigv4:
+      region: <region>
+```
+
+You can curl for relevant Prometheus metrics on the controller node using:
+```bash
+curl -s http://localhost:9090/metrics | grep -E 'slurm|dcgm|efa'
+```
+
+With node and cluster metrics now being exported to the Amazon Managed Prometheus workspace via Prometheus remote write from the controller node, next you will set up the Amazon Managed Grafana workspace.
+
+### Setup the Grafana Workspace
+>[!IMPORTANT]
+>Before proceeding, ensure your AWS account has been set up with [AWS IAM Identity Center](https://docs.aws.amazon.com/singlesignon/latest/userguide/get-set-up-for-idc.html). It will be used to authenticate to the Amazon Managed Grafana workspace in the final steps.
+
+Navigate to [Amazon Managed Grafana](https://console.aws.amazon.com/grafana/home?#/workspaces) in the AWS Management Console.
+
+In the Authentication tab, configure authentication using AWS IAM Identity Center:
+
+>[!NOTE]
+>Configure your AWS IAM Identity Center user as User type: Admin.
+
+![grafana users admin](./assets/grafana_users_admin.png)
+
+Within the Data sources tab of your Grafana workspace, click the "Configure in Grafana" link to configure Prometheus as a data source.
+
+![grafana datasources](./assets/grafana-datasource.png)
+
+You will be prompted to authenticate to the Grafana workspace with the IAM Identity Center username and password you set up for the workspace.
+
+>[!NOTE]
+>If you have forgotten your username or password, you can find and reset them within [IAM Identity Center](https://us-east-1.console.aws.amazon.com/singlesignon/identity).
+
+Once you are on the Amazon Managed Grafana workspace "Data sources" page, select the AWS Region and Workspace ID of your Amazon Managed Prometheus workspace.
+
+![grafana datasource configure](./assets/grafana-datasource-configure.png)
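+
+Optionally, before building dashboards, you can confirm that the exporter metrics have actually reached the Amazon Managed Prometheus workspace. A minimal sketch using the open-source [awscurl](https://github.com/okigan/awscurl) tool (an assumption; any SigV4-signing HTTP client works), run with credentials that allow `aps:QueryMetrics`:
+
+```bash
+# Query the AMP workspace for the "up" series; a result listing the
+# slurm/dcgm/efa exporter targets means remote write is flowing.
+pip install awscurl
+awscurl --service aps --region <region> \
+  "https://aps-workspaces.<region>.amazonaws.com/workspaces/<workspace-id>/api/v1/query?query=up"
+```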
+
+### Build Grafana Dashboards
+
+Finally, with authentication and data sources set up, within your Grafana workspace select Dashboards > New > Import.
+
+To display metrics for the exporter services, you can start by configuring and customizing the following three open-source Grafana dashboards by copying and pasting the links below:
+
+#### Slurm Exporter Dashboard:
+
+https://grafana.com/grafana/dashboards/4323-slurm-dashboard/
+
+![slurm dashboard](./assets/slurm-dashboard.png)
+
+#### Node Exporter Dashboard:
+
+https://grafana.com/grafana/dashboards/1860-node-exporter-full/
+
+![EFA Node dashboard](./assets/efa-node-dashboard.png)
+
+#### DCGM Exporter Dashboard:
+
+https://grafana.com/grafana/dashboards/12239-nvidia-dcgm-exporter-dashboard/
+
+![DCGM Dashboard](./assets/dcgm-dashboard.png)
+
+Congratulations, you can now view real-time metrics about your SageMaker HyperPod cluster and compute nodes in Grafana!
\ No newline at end of file
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png b/4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png
new file mode 100644
index 00000000..7280f80b
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png b/4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png
new file mode 100644
index 00000000..2db264b1
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png b/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png
new file mode 100644
index 00000000..cd6fb6fe
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png b/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png
new file mode 100644
index 00000000..8969fb0b
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png b/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png
new file mode 100644
index 00000000..74bb2025
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png b/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png
new file mode 100644
index 00000000..9f5f1e7c
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png b/4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png
new file mode 100644
index 00000000..67d9fdd6
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png b/4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png
new file mode 100644
index 00000000..7b5cc536
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png b/4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png
new file mode 100644
index 00000000..42638b38
Binary files /dev/null and b/4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png differ
diff --git a/4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml b/4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml
new file mode 100644
index 00000000..6e07ea50
--- /dev/null
+++ b/4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml
@@ -0,0 +1,67 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: "Setup to monitor sagemaker hyperpod clusters on AWS. Amazon Managed Prometheus and Amazon Managed Grafana workspaces with associated IAM roles are deployed in the AWS Account. Prometheus and exporter services are set up on Cluster Nodes. Author: Matt Nightingale - nghtm@"
+
+
+Resources:
+  AmazonGrafanaWorkspaceIAMRole:
+    Type: 'AWS::IAM::Role'
+    Properties:
+      AssumeRolePolicyDocument:
+        Version: 2012-10-17
+        Statement:
+          - Effect: Allow
+            Principal:
+              Service:
+                - grafana.amazonaws.com
+            Action:
+              - 'sts:AssumeRole'
+      RoleName: !Sub ${AWS::StackName}-Grafana-Role
+
+  AmazonGrafanaPrometheusPolicy:
+    Type: AWS::IAM::Policy
+    Properties:
+      PolicyName: AmazonGrafana_Prometheus_policy
+      PolicyDocument:
+        Version: '2012-10-17'
+        Statement:
+          - Effect: Allow
+            Action:
+              - aps:ListWorkspaces
+              - aps:DescribeWorkspace
+              - aps:QueryMetrics
+              - aps:GetLabels
+              - aps:GetSeries
+              - aps:GetMetricMetadata
+            Resource: "*"
+      Roles: [!Ref AmazonGrafanaWorkspaceIAMRole]
+
+  AmazonGrafanaWorkspace:
+    Type: 'AWS::Grafana::Workspace'
+    Properties:
+      AccountAccessType: CURRENT_ACCOUNT
+      Name: !Sub ${AWS::StackName}-Dashboard
+      Description: Amazon Grafana Workspace to monitor SageMaker Cluster
+      AuthenticationProviders:
+        - AWS_SSO
+      PermissionType: SERVICE_MANAGED
+      RoleArn: !GetAtt
+        - AmazonGrafanaWorkspaceIAMRole
+        - Arn
+      DataSources: ["CLOUDWATCH","PROMETHEUS"]
+      OrganizationRoleName: "ADMIN"
+
+  APSWorkspace:
+    Type: AWS::APS::Workspace
+    Properties:
+      Alias: !Sub ${AWS::StackName}-Hyperpod-WorkSpace
+      Tags:
+        - Key: Name
+          Value: SageMaker Hyperpod PrometheusMetrics
+
+Outputs:
+  Region:
+    Value: !Ref "AWS::Region"
+  AMPRemoteWriteURL:
+    Value: !Join ["" , [ !GetAtt APSWorkspace.PrometheusEndpoint , "api/v1/remote_write" ]]
+  GrafanaWorkspaceURL:
+    Value: !Join ["" , [ "https://", !GetAtt AmazonGrafanaWorkspace.Endpoint ]]
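+
+# Note: install_prometheus.sh discovers AMPRemoteWriteURL automatically by scanning
+# for stacks whose Description contains 'monitor sagemaker hyperpod'. If your cluster
+# runs in a different account or region, retrieve the outputs manually instead:
+#   aws cloudformation describe-stacks --stack-name <stack-name> \
+#     --query "Stacks[0].Outputs" --output table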