Skip to content

Commit

Permalink
feat: multi headnode support update (#535)
Browse files Browse the repository at this point in the history
* feat: mutilheadnode support update

* feat: add license to bash

* revert the backup subnet change

* feat: add optional subnet output
  • Loading branch information
guanweim authored Jan 31, 2025
1 parent b0389a1 commit 3b6aab9
Show file tree
Hide file tree
Showing 6 changed files with 1,077 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class ProvisioningParameters:
WORKLOAD_MANAGER_KEY: str = "workload_manager"
FSX_DNS_NAME: str = "fsx_dns_name"
FSX_MOUNT_NAME: str = "fsx_mountname"
SLURM_CONFIGURATIONS: str = "slurm_configurations"

def __init__(self, path: str):
with open(path, "r") as f:
Expand All @@ -81,6 +82,14 @@ def controller_group(self) -> Optional[str]:
def login_group(self) -> Optional[str]:
return self._params.get("login_group")

@property
def slurm_configurations(self) -> Dict[str, Any]:
    """Return the ``slurm_configurations`` section of the provisioning parameters.

    A missing or empty (falsy) section yields an empty dict.
    """
    configured = self._params.get(ProvisioningParameters.SLURM_CONFIGURATIONS)
    return configured or {}

def get_ip_address():
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
Expand Down Expand Up @@ -176,7 +185,10 @@ def main(args):
node_type = SlurmNodeType.LOGIN_NODE

if node_type == SlurmNodeType.HEAD_NODE:
ExecuteBashScript("./setup_mariadb_accounting.sh").run()
if params.slurm_configurations:
ExecuteBashScript("./multi_headnode_setup/headnode_setup.sh").run()
else:
ExecuteBashScript("./setup_mariadb_accounting.sh").run()

ExecuteBashScript("./apply_hotfix.sh").run(node_type)
ExecuteBashScript("./utils/motd.sh").run(node_type, ",".join(head_node_ip), ",".join(login_node_ip))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#!/bin/bash

: <<'SUMMARY'
Script: headnode_notification.sh
Purpose:
This script sets up SNS (Simple Notification Service) notification scripts for Slurm
controller failover in a multi-head node cluster environment. It creates two scripts:
one for when the primary controller comes online, and another for when it goes offline.
Key Functions:
1. main: The entry point of the script. It processes arguments and calls create_script
for both "ON" and "OFF" scenarios.
2. create_script: Generates a notification script for a given scenario (ON/OFF).
It creates a bash script that logs controller status changes and sends SNS
notifications.
Arguments:
1. SNS Topic ARN
2. SNS Region
3. Slurm Cluster Path /<shared volume dir>/aws/hyperpod/<cluster-name>
4. File Ownership - owner spec applied to the generated scripts with chown
Output:
- Creates two scripts:
1. slurm_controller_on.sh: Notifies when the slurm controller comes online.
2. slurm_controller_off.sh: Notifies when the slurm controller goes offline.
- Returns the paths of the created scripts.
Usage:
To use this script, run the following command:
./headnode_notification.sh <sns_topic_arn> <sns_region> <slurm_cluster_path> <file_ownership>
Alternatively, run the main script headnode_setup.sh as a lifecycle script at cluster creation;
it also executes this headnode_notification.sh.
Notes:
- Requires AWS CLI to be installed and configured for sending SNS notifications.
- The created scripts log events to status and notification log files.
- Ensures proper permissions (executable) for the created scripts.
- The scripts are chown'ed to the File Ownership argument (e.g. the 'slurm' user and group).
Important:
This script is crucial for maintaining awareness of the Slurm controller's status
in a high-availability setup. Ensure that the SNS topic is properly configured and
that the necessary permissions are in place for sending notifications.
SUMMARY

#######################################
# Entry point: generates the ON and OFF controller notification scripts.
#
# Arguments:
#   sns_topic_arn: sns topic arn
#   sns_region: sns topic region
#   slurm_cluster_path: path to slurm cluster; the scripts are written under
#                       <slurm_cluster_path>/opt/slurm/etc/scripts
#   ownership: owner spec forwarded to create_script for chown
# Outputs:
#   Writes the two generated script paths to stdout, separated by a space
#######################################
main() {
  local sns_topic_arn="$1"
  local sns_region="$2"
  local slurm_cluster_path="$3"
  local ownership="$4"
  local scripts_dir="${slurm_cluster_path}/opt/slurm/etc/scripts"
  local on_script_path="${scripts_dir}/slurm_controller_on.sh"
  local off_script_path="${scripts_dir}/slurm_controller_off.sh"

  # Create the scripts. The status argument must be exactly ON or OFF: it is
  # embedded verbatim in the notification text, so no stray punctuation here.
  create_script "$on_script_path" "$sns_topic_arn" "$sns_region" "ON" "Primary" "$slurm_cluster_path" "$ownership"
  create_script "$off_script_path" "$sns_topic_arn" "$sns_region" "OFF" "Backup" "$slurm_cluster_path" "$ownership"

  # Report both paths on one line so the caller can capture them
  echo "$on_script_path" "$off_script_path"
}

#######################################
# Generates a notification script for a Slurm controller status change.
#
# The generated script appends the event to controller_status.log and, when
# the AWS CLI is available, publishes it to the SNS topic; the outcome of the
# publish attempt is appended to notification_attempts.log.
#
# Arguments:
#   script_path: where the generated script is written
#   sns_topic_arn: sns topic arn
#   sns_region: sns topic region
#   status: ON or OFF; embedded in the event text
#   is_primary: node label (e.g. Primary or Backup), embedded as "<label> Node"
#   slurm_cluster_path: path to slurm cluster; logs go under var/log/slurm
#   ownership: owner spec passed to chown for the generated script
# Outputs:
#   Writes the generated script to script_path and makes it executable (755)
# Returns:
#   Exit status of the final chown (earlier failures are not checked here)
#######################################
create_script() {
  local script_path="$1"
  local sns_topic_arn="$2"
  local sns_region="$3"
  local status="$4"
  local is_primary="$5"
  local slurm_cluster_path="$6"
  local ownership="$7"

  # Unquoted heredoc: unescaped variables are expanded now (generation time),
  # while backslash-escaped ones are left for the generated script to evaluate.
  cat << EOF > "$script_path"
#!/bin/bash
set -e
set -x
set -o pipefail # trace ERR through pipes
set -o errtrace # trace ERR through 'time command' and other functions
set -o nounset  ## set -u : exit the script if you try to use an uninitialised variable
# Log file paths
STATUS_LOG="${slurm_cluster_path}/var/log/slurm/controller_status.log"
NOTIFICATION_LOG="${slurm_cluster_path}/var/log/slurm/notification_attempts.log"
# Event details
TIMESTAMP=\$(date -u +'%Y-%m-%dT%H:%M:%SZ')
EVENT="Slurm Controller is $status"
NODE_TYPE="$is_primary Node"
hostname=\$(ip route get 1 | sed -n 's/.*src \([^ ]*\).*/\1/p')
# Log the event
echo "\$TIMESTAMP: \$EVENT (\$hostname, \$NODE_TYPE)" >> "\$STATUS_LOG"
# Attempt to send notification via SNS
if command -v aws &> /dev/null; then
    # Run the publish inside the if condition: under set -e a bare failing
    # assignment would abort the script before the failure could be logged.
    if RESPONSE=\$(aws sns publish \
        --topic-arn "$sns_topic_arn" \
        --message "\$EVENT (\$hostname, \$NODE_TYPE)" \
        --subject "Slurm Controller Status Change" \
        --region "$sns_region" 2>&1); then
        echo "\$TIMESTAMP: Notification sent successfully to SNS" >> "\$NOTIFICATION_LOG"
    else
        echo "\$TIMESTAMP: Failed to send notification to SNS. Error: \$RESPONSE" >> "\$NOTIFICATION_LOG"
    fi
else
    echo "\$TIMESTAMP: AWS CLI not found. Unable to send notification." >> "\$NOTIFICATION_LOG"
fi
EOF
  chmod 755 "$script_path"
  chown "$ownership" "$script_path"
}

main "$@"

Loading

0 comments on commit 3b6aab9

Please sign in to comment.