Skip to content

Commit

Permalink
Userdata send cloudwatch metrics to setup alerts on failures (#5399)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeanschmidt authored Jul 4, 2024
1 parent 6ad6597 commit e8625c3
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
set -euxo pipefail
set -exo pipefail

install_hooks() {
pushd /home/$USER_NAME
Expand Down Expand Up @@ -55,9 +55,15 @@ else
fi

# Wait for this instance's configuration to appear in SSM Parameter Store.
# The parameter name is the environment value joined to INSTANCE_ID with "-".
echo wait for configuration
# Polling budget: the loop below sleeps 1 second per attempt, so 600 attempts.
RETRY_LEFT=600
# Keep polling while jq prints "null" for the Value of the first returned parameter.
while [[ $(aws ssm get-parameters --names ${environment}-$INSTANCE_ID --with-decryption --region $REGION | jq -r ".Parameters | .[0] | .Value") == null ]]; do
echo Waiting for configuration ...
sleep 1
RETRY_LEFT=$((RETRY_LEFT-1))
if [[ $RETRY_LEFT -eq 0 ]]; then
echo "Timeout waiting for configuration"
# NOTE(review): this only stops the loop because errexit (set -e) is on; without it
# the counter would go negative and the loop would keep polling.
false # the script should fail when a command returns non-zero, and then send logs about it
fi
done
# Fetch the value once more, now that it exists, and keep it in CONFIG.
CONFIG=$(aws ssm get-parameters --names ${environment}-$INSTANCE_ID --with-decryption --region $REGION | jq -r ".Parameters | .[0] | .Value")
# Delete the parameter after reading it, through the retry helper defined elsewhere.
retry aws ssm delete-parameter --name ${environment}-$INSTANCE_ID --region $REGION
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,40 @@
#!/bin/bash

set -euxo pipefail
set -exo pipefail

# Publish one CloudWatch data point for a metric, best effort.
#
# The data point is always written to the "GHARunners/all" namespace. When the
# environment value is non-empty it is also written to
# "GHARunners/<environment>", and that namespace is then used for the
# per-dimension copies below (otherwise they go to "GHARunners/all").
# One extra data point is sent for each of INSTANCE_ID, REGION, OS_ID and
# GH_RUNNER_ID that is non-empty, carrying that value as a single dimension.
#
# Arguments:
#   $1 - metric name
#   $2 - metric value
# Globals (read; each may be unset or empty, in which case its data point is
#   skipped): environment, INSTANCE_ID, REGION, OS_ID, GH_RUNNER_ID
# Returns:
#   0 always. Every aws call is followed by "|| true" so that a failure to
#   publish a metric never aborts the calling script.
function metric_report () {
  local metric_name=$1
  local value=$2
  local namespace="GHARunners/all"

  # Fleet-wide data point, independent of environment.
  aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 || true

  if [[ -n "${environment}" ]]; then
    namespace="GHARunners/${environment}"
    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 || true
  fi

  if [[ -n "$INSTANCE_ID" ]]; then
    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "InstanceId=$INSTANCE_ID" || true
  fi
  if [[ -n "$REGION" ]]; then
    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "Region=$REGION" || true
  fi
  if [[ -n "$OS_ID" ]]; then
    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "os=$OS_ID" || true
  fi
  # Guard on the runner id itself: sending "GHRunnerId=" with an empty value is
  # useless, and the runner id may be known even when OS_ID is not.
  if [[ -n "$GH_RUNNER_ID" ]]; then
    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "GHRunnerId=$GH_RUNNER_ID" || true
  fi
}

# Failure handler: prints the line number it was given, publishes the
# "linux_userdata.error" metric with value 1, then ends the script with
# exit status 1.
#
# Arguments:
#   $1 - line number where the failure happened
function err_report () {
  local failed_line=$1

  printf 'Error on line %s\n' "$failed_line"
  metric_report "linux_userdata.error" 1
  exit 1
}

# Hand every failure that fires the ERR trap to err_report, passing the line
# number of the failing command.
# NOTE(review): errtrace (set -E) is not enabled in the visible part of this
# script, and bash only inherits ERR traps into functions, command
# substitutions and subshells when it is. Confirm whether failures inside
# functions are expected to be reported too.
trap 'err_report $LINENO' ERR

function retry {
local retries=7
Expand All @@ -22,6 +56,8 @@ function retry {

# From here on, stdout and stderr both go through tee, which writes
# /var/log/user-data.log and pipes on to logger (tag "user-data"); the copy
# logger prints on its own stderr (-s) is redirected to /dev/console.
exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1

# Record that the script started. OS_ID is only assigned on a later line, so
# this call runs before that value is set here.
metric_report "linux_userdata.execution" 1

# OS identifier: ID and VERSION_ID from /etc/os-release joined together.
# The file is sourced inside the command substitution, so its variables do not
# stay defined in this shell.
OS_ID=$(. /etc/os-release;echo $ID$VERSION_ID)
if [[ "$OS_ID" =~ ^amzn2023* ]]; then
PKG_MANAGER="dnf"
Expand Down Expand Up @@ -85,7 +121,6 @@ retry sudo $PKG_MANAGER install -y "kernel-devel-uname-r == $(uname -r)" || true
tar xzf 4.14.336-257.562.amzn2.x86_64.tar.gz
)


echo Checking if nvidia install required ${nvidia_driver_install}
%{ if nvidia_driver_install ~}
echo "NVIDIA driver install required"
Expand Down Expand Up @@ -117,3 +152,5 @@ fi
# Template placeholder; presumably replaced with caller-supplied post-install
# commands when this template is rendered (TODO confirm).
${post_install}

# svc.sh is not defined in this part of the file; presumably this starts the
# runner service (TODO confirm).
./svc.sh start

# Last step: record a successful run.
metric_report "linux_userdata.success" 1

0 comments on commit e8625c3

Please sign in to comment.