From c266a950b50d83e71a4b2947a6132674f5616a7f Mon Sep 17 00:00:00 2001 From: Jean Schmidt <4520845+jeanschmidt@users.noreply.github.com> Date: Wed, 19 Jun 2024 18:58:03 +0200 Subject: [PATCH] Retry dnf commands + always install dev tools (#5358) --- Review notes: selects dnf or yum from /etc/os-release, downloads the kadwanev/retry helper and wraps package installs and the NVIDIA driver download in it, and installs Development Tools and kernel-devel outside the NVIDIA-only section. Fixups in this copy: the retry helper download uses curl -fsL so an HTTP error fails the script instead of saving an error page as the helper; the pip step logs Installing pip; the nvidia-container-toolkit repo check uses the glob amzn2023* because == does pattern matching, not regex, so the earlier ^amzn2023* never matched. .../templates/install-config-runner.sh | 2 +- .../runners-instances/templates/user-data.sh | 93 +++++++++++-------- 2 files changed, 55 insertions(+), 40 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh b/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh index 4b462178d0..11428d51d3 100644 --- a/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh +++ b/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh @@ -1,4 +1,4 @@ -set -x +set -euxo pipefail install_hooks() { pushd /home/$USER_NAME diff --git a/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh b/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh index 2caa339879..5fe998612a 100644 --- a/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh +++ b/terraform-aws-github-runner/modules/runners-instances/templates/user-data.sh @@ -1,78 +1,93 @@ -#!/bin/bash -xe -set -x +#!/bin/bash + +set -euxo pipefail + exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1 +OS_ID=$(. /etc/os-release;echo $ID$VERSION_ID) +if [[ "$OS_ID" =~ ^amzn2023* ]]; then + PKG_MANAGER="dnf" +else + PKG_MANAGER="yum" +fi + ${pre_install} -sudo yum update -y +if ! command -v curl 2>/dev/null; then + echo "Installing curl" + sudo $PKG_MANAGER install -y curl +fi + +sudo sh -c "curl -fsL https://raw.githubusercontent.com/kadwanev/retry/master/retry -o /usr/local/bin/retry && chmod +x /usr/local/bin/retry" + +sudo retry "$PKG_MANAGER update -y" + +if !
command -v jq 2>/dev/null; then + echo "Installing jq" + sudo retry "$PKG_MANAGER install -y jq" +fi +if ! command -v git 2>/dev/null; then + echo "Installing git" + sudo retry "$PKG_MANAGER install -y git" +fi +if ! command -v pip3 2>/dev/null; then + echo "Installing pip" + sudo retry "$PKG_MANAGER install -y pip" +fi %{ if enable_cloudwatch_agent ~} -sudo yum install amazon-cloudwatch-agent -y +sudo retry "$PKG_MANAGER install amazon-cloudwatch-agent -y" amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c ssm:${ssm_key_cloudwatch_agent_config} %{ endif ~} # Install docker if [ "$(uname -m)" == "aarch64" ]; then - sudo yum install -y docker + sudo retry "$PKG_MANAGER install -y docker" else if command -v amazon-linux-extras 2>/dev/null; then echo "Installing docker using amazon-linux-extras" - sudo amazon-linux-extras install docker + sudo retry "amazon-linux-extras install docker" else echo "Installing docker using dnf" - sudo dnf install docker -y + sudo retry "dnf install docker -y" fi fi service docker start usermod -a -G docker ec2-user -if ! command -v curl 2>/dev/null; then - echo "Installing curl" - sudo yum install -y curl -fi -if ! command -v jq 2>/dev/null; then - echo "Installing jq" - sudo yum install -y jq -fi -if ! command -v git 2>/dev/null; then - echo "Installing git" - sudo yum install -y git -fi -if ! command -v pip3 2>/dev/null; then - echo "Installing git" - sudo yum install -y pip -fi - USER_NAME=ec2-user ${install_config_runner} +sudo retry "$PKG_MANAGER groupinstall -y 'Development Tools'" +sudo retry "$PKG_MANAGER install -y 'kernel-devel-uname-r == $(uname -r)'" + echo Checking if nvidia install required ${nvidia_driver_install} %{ if nvidia_driver_install ~} -set +e - -os_id=$(.
/etc/os-release;echo $ID$VERSION_ID) -if [[ "$os_id" =~ ^amzn.* ]]; then - if [[ "$os_id" =~ "amzn2023" ]] ; then +echo "NVIDIA driver install required" +if [[ "$OS_ID" =~ ^amzn.* ]]; then + if [[ "$OS_ID" =~ "amzn2023" ]] ; then echo "On Amazon Linux 2023, installing kernel-modules-extra" - sudo dnf install kernel-modules-extra -y + sudo retry "dnf install kernel-modules-extra -y" fi echo Installing Development Tools - sudo yum groupinstall -y "Development Tools" - sudo yum install -y "kernel-devel-uname-r == $(uname -r)" sudo modprobe backlight fi -sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-550.54.15.run" -sudo /bin/bash /tmp/nvidia_driver -s --no-drm +sudo retry "curl -fsL -o /tmp/nvidia_driver 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-550.54.15.run'" +sudo retry "/bin/bash /tmp/nvidia_driver -s --no-drm" sudo rm -fv /tmp/nvidia_driver -if [[ "$os_id" =~ ^amzn.* ]]; then +if [[ "$OS_ID" =~ ^amzn.* ]]; then + if [[ "$OS_ID" == amzn2023* ]]; then + sudo retry "dnf install -y dnf-plugins-core" + sudo retry "dnf config-manager --add-repo 'https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo'" + else + sudo retry "yum install -y yum-utils" + sudo retry "yum-config-manager --add-repo 'https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo'" + fi echo Installing nvidia-docker tools - sudo yum install -y yum-utils - sudo yum-config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo - sudo yum install -y nvidia-docker2 + sudo retry "$PKG_MANAGER install -y nvidia-docker2" sudo systemctl restart docker fi -set -e %{ endif ~} ${post_install}