From a0e1857c5698e42a36a4c8cf5b49860051210f76 Mon Sep 17 00:00:00 2001 From: Roman Melnikov <roman.melnikov@serokell.io> Date: Thu, 23 Mar 2023 14:38:02 +0800 Subject: [PATCH 1/2] Add vault login retries Problem: During massive updates, secrets services are restarted when 'After=network-online.target' is satisfied but there is still no internet connection. As a result, login to Vault fails and the deployment is considered failed due to the service failure. Solution: Add an option that configures the number of Vault login retries and actually perform the retries in the script when login fails. Sadly, it's not viable to do this retry via systemd, even though 'Restart=on-failure' is available for oneshot services, because the service will fail and cause the NixOS activation to stop and roll back despite the fact that this service will attempt to restart afterwards. --- modules/options.nix | 11 +++++++++++ modules/script.nix | 13 +++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/modules/options.nix b/modules/options.nix index 6728ce7..70906bf 100644 --- a/modules/options.nix +++ b/modules/options.nix @@ -134,6 +134,17 @@ let ''; }; + loginRetries = mkOption { + type = with types; int; + default = 5; + description = '' + Number of attempts script will try to login into Vault. + This may be useful in case secrets service is restarted when internet + connection is not yet available. Sadly After=network-online.target + doesn't always guarantee that. 
+ ''; + }; + __toString = mkOption { default = _: "${cfg.outPrefix}/${name}"; readOnly = true; diff --git a/modules/script.nix b/modules/script.nix index ddfc16f..3e22282 100644 --- a/modules/script.nix +++ b/modules/script.nix @@ -3,7 +3,7 @@ let inherit (scfg) environmentKey quoteEnvironmentValues environmentVariableNamePrefix extraScript - user group secretsKey secretsAreBase64; + user group secretsKey secretsAreBase64 loginRetries; inherit (lib) optionalString toUpper; secretsPath = "${cfg.outPrefix}/${name}"; @@ -18,9 +18,18 @@ in # Make sure we start from a clean slate rm -rf "${secretsPath}" mkdir -p "${secretsPath}" + max_retry="${toString loginRetries}" + counter="0" + set +e # Log into Vault using credentials from environmentFile - vaultOutput="$(vault write -format=json auth/approle/login role_id="$VAULT_ROLE_ID" secret_id=- <<< "$VAULT_SECRET_ID")" + until vaultOutput="$(vault write -format=json auth/approle/login role_id="$VAULT_ROLE_ID" secret_id=- <<< "$VAULT_SECRET_ID")"; do + [[ counter -ge $max_retry ]] && echo "Failed to login into Vault" && exit 1 # give up before sleeping once the retry budget is spent; -ge also stops on a negative value + echo "Failed to login into Vault, retrying" + sleep 5 + ((counter++)) + done + set -e jq '.auth.client_token = "redacted"' <<< "$vaultOutput" VAULT_TOKEN="$(jq -r '.auth.client_token' <<< "$vaultOutput")" export VAULT_TOKEN From 831590f2df8731c2715546d832f52a74d50aca38 Mon Sep 17 00:00:00 2001 From: Roman Melnikov <roman.melnikov@serokell.io> Date: Thu, 23 Mar 2023 14:46:44 +0800 Subject: [PATCH 2/2] Use GA runner with 'nix' label To avoid using less powerful runners that have access to docker. --- .github/workflows/check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 0a0642f..c89e812 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -3,7 +3,7 @@ on: push jobs: check: - runs-on: self-hosted + runs-on: [nix, self-hosted] steps: - uses: actions/checkout@v3