diff --git a/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh b/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh
index 3f18e22ef0..119f9f4ba3 100644
--- a/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh
+++ b/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh
@@ -1,4 +1,4 @@
-set -euxo pipefail
+set -exo pipefail
 
 install_hooks() {
   pushd /home/$USER_NAME
@@ -55,9 +55,16 @@ else
 fi
 
 echo wait for configuration
+# Stop polling after 600 one-second waits instead of looping forever.
+RETRY_LEFT=600
 while [[ $(aws ssm get-parameters --names ${environment}-$INSTANCE_ID --with-decryption --region $REGION | jq -r ".Parameters | .[0] | .Value") == null ]]; do
   echo Waiting for configuration ...
   sleep 1
+  RETRY_LEFT=$((RETRY_LEFT-1))
+  if [[ $RETRY_LEFT -eq 0 ]]; then
+    echo "Timeout waiting for configuration"
+    false # the script should fail when a command returns non-zero, and then send logs about it
+  fi
 done
 CONFIG=$(aws ssm get-parameters --names ${environment}-$INSTANCE_ID --with-decryption --region $REGION | jq -r ".Parameters | .[0] | .Value")
 retry aws ssm delete-parameter --name ${environment}-$INSTANCE_ID --region $REGION
diff --git a/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh b/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh
index d55a299203..edfe01b5d4 100644
--- a/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh
+++ b/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh
@@ -1,6 +1,50 @@
 #!/bin/bash
 
-set -euxo pipefail
+set -exo pipefail
+
+# Publish one CloudWatch data point for a metric, best effort.
+# The point is sent undimensioned to GHARunners/all, again to the
+# per-environment namespace when one is configured, and then once per
+# dimension whose variable is non-empty at call time.
+# Every put ends in "|| true" so a CloudWatch failure cannot abort the script.
+# Arguments: $1 - metric name, $2 - metric value
+# Globals read: INSTANCE_ID, REGION, OS_ID, GH_RUNNER_ID
+function metric_report () {
+  local metric_name=$1
+  local value=$2
+
+  aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "GHARunners/all" --value "$value" --region us-east-1 || true
+
+  local namespace="GHARunners/all"
+  if [ -n "${environment}" ]; then
+    namespace="GHARunners/${environment}"
+    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 || true
+  fi
+
+  if [ -n "$INSTANCE_ID" ]; then
+    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "InstanceId=$INSTANCE_ID" || true
+  fi
+  if [ -n "$REGION" ]; then
+    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "Region=$REGION" || true
+  fi
+  if [ -n "$OS_ID" ]; then
+    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "os=$OS_ID" || true
+  fi
+  # Guard on GH_RUNNER_ID itself so an empty GHRunnerId dimension is never sent.
+  if [ -n "$GH_RUNNER_ID" ]; then
+    aws cloudwatch put-metric-data --metric-name "$metric_name" --namespace "$namespace" --value "$value" --region us-east-1 --dimensions "GHRunnerId=$GH_RUNNER_ID" || true
+  fi
+}
+
+# ERR trap handler: logs the failing line, reports an error metric, exits 1.
+# Arguments: $1 - line number supplied by the trap
+function err_report () {
+  echo "Error on line $1"
+  metric_report "linux_userdata.error" 1
+  exit 1
+}
+
+trap 'err_report $LINENO' ERR
 
 function retry {
   local retries=7
@@ -22,6 +66,8 @@ function retry {
 
 exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1
 
+metric_report "linux_userdata.execution" 1
+
 OS_ID=$(. /etc/os-release;echo $ID$VERSION_ID)
 if [[ "$OS_ID" =~ ^amzn2023* ]]; then
   PKG_MANAGER="dnf"
@@ -85,7 +131,6 @@ retry sudo $PKG_MANAGER install -y "kernel-devel-uname-r == $(uname -r)" || true
   tar xzf 4.14.336-257.562.amzn2.x86_64.tar.gz
 )
 
-
 echo Checking if nvidia install required ${nvidia_driver_install}
 %{ if nvidia_driver_install ~}
 echo "NVIDIA driver install required"
@@ -117,3 +162,5 @@ fi
 ${post_install}
 
 ./svc.sh start
+
+metric_report "linux_userdata.success" 1