From a0e1857c5698e42a36a4c8cf5b49860051210f76 Mon Sep 17 00:00:00 2001
From: Roman Melnikov <roman.melnikov@serokell.io>
Date: Thu, 23 Mar 2023 14:38:02 +0800
Subject: [PATCH 1/2] Add vault login retries

Problem: During massive updates, secrets services are restarted once
'After=network-online.target' is satisfied, even though there may still
be no internet connection. As a result, login to Vault fails and the
deployment is considered failed because of the service failure.

Solution: Add an option that configures the number of Vault login
retries, and actually retry in the script when login fails.

Sadly, it's not viable to do this retry via systemd, even though
'Restart=on-failure' is available for oneshot services: the service
still fails first, which causes the NixOS activation to stop and roll
back, despite the fact that the service would attempt to restart
afterwards.
---
 modules/options.nix | 11 +++++++++++
 modules/script.nix  | 13 +++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/modules/options.nix b/modules/options.nix
index 6728ce7..70906bf 100644
--- a/modules/options.nix
+++ b/modules/options.nix
@@ -134,6 +134,17 @@ let
           '';
         };
 
+        loginRetries = mkOption {
+          type = with types; ints.unsigned; # a negative retry count is meaningless
+          default = 5;
+          description = ''
+            Number of times the script retries logging into Vault after a
+            failed attempt, waiting 5 seconds before each retry. Useful when
+            the secrets service is restarted before the network is reachable,
+            which After=network-online.target does not always guarantee.
+          '';
+        };
+
         __toString = mkOption {
           default = _: "${cfg.outPrefix}/${name}";
           readOnly = true;
diff --git a/modules/script.nix b/modules/script.nix
index ddfc16f..3e22282 100644
--- a/modules/script.nix
+++ b/modules/script.nix
@@ -3,7 +3,7 @@ let
   inherit (scfg)
     environmentKey quoteEnvironmentValues
     environmentVariableNamePrefix extraScript
-    user group secretsKey secretsAreBase64;
+    user group secretsKey secretsAreBase64 loginRetries;
   inherit (lib) optionalString toUpper;
 
   secretsPath = "${cfg.outPrefix}/${name}";
@@ -18,9 +18,18 @@ in
   # Make sure we start from a clean slate
   rm -rf "${secretsPath}"
   mkdir -p "${secretsPath}"
+  max_retry="${toString loginRetries}"  # retry limit, rendered from the loginRetries option
+  counter="0"  # compared against max_retry in the Vault login loop
 
+  set +e  # ((counter++)) returns status 1 while counter is 0; keep errexit off inside the loop
   # Log into Vault using credentials from environmentFile
-  vaultOutput="$(vault write -format=json auth/approle/login role_id="$VAULT_ROLE_ID" secret_id=- <<< "$VAULT_SECRET_ID")"
+  until vaultOutput="$(vault write -format=json auth/approle/login role_id="$VAULT_ROLE_ID" secret_id=- <<< "$VAULT_SECRET_ID")"; do
+    [[ "$counter" -ge "$max_retry" ]] && echo "Failed to login into Vault" && exit 1  # limit reached: fail now, without a useless sleep
+    echo "Failed to login into Vault, retrying"
+    sleep 5  # fixed pause before the next login attempt
+    ((counter++))
+  done
+  set -e  # re-enable errexit for the rest of the script
   jq '.auth.client_token = "redacted"' <<< "$vaultOutput"
   VAULT_TOKEN="$(jq -r '.auth.client_token' <<< "$vaultOutput")"
   export VAULT_TOKEN

From 831590f2df8731c2715546d832f52a74d50aca38 Mon Sep 17 00:00:00 2001
From: Roman Melnikov <roman.melnikov@serokell.io>
Date: Thu, 23 Mar 2023 14:46:44 +0800
Subject: [PATCH 2/2] Use GA runner with 'nix' label

To avoid using less powerful runners that have access to docker.
---
 .github/workflows/check.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
index 0a0642f..c89e812 100644
--- a/.github/workflows/check.yml
+++ b/.github/workflows/check.yml
@@ -3,7 +3,7 @@ on: push
 
 jobs:
   check:
-    runs-on: self-hosted
+    runs-on: [nix, self-hosted]
     steps:
       - uses: actions/checkout@v3