From f14d1e0b9ce104fc26fc25ace26f469777282a02 Mon Sep 17 00:00:00 2001 From: Russ Allbery Date: Fri, 27 Oct 2023 10:15:03 -0700 Subject: [PATCH] Improve Redis pool configuration The retry specification for the Redis connection pool was passed into the Redis constructor rather than the connection pool constructor, which appears to mean it was unused. Pass it into the connection pool constructor instead, add a socket timeout, and enable socket keepalive. This hopefully will fix a problem we saw where Gafaelfawr never recovered from the Redis server being restarted during Kubernetes maintenance. --- changelog.d/20231027_101349_rra_DM_41424.md | 3 +++ src/gafaelfawr/constants.py | 6 +++++- src/gafaelfawr/factory.py | 15 +++++++++------ 3 files changed, 17 insertions(+), 7 deletions(-) create mode 100644 changelog.d/20231027_101349_rra_DM_41424.md diff --git a/changelog.d/20231027_101349_rra_DM_41424.md b/changelog.d/20231027_101349_rra_DM_41424.md new file mode 100644 index 000000000..6f2279b0f --- /dev/null +++ b/changelog.d/20231027_101349_rra_DM_41424.md @@ -0,0 +1,3 @@ +### Bug fixes + +- Add a socket timeout, enable keepalive, and fix the retry specification for the Redis connection pool to help Gafaelfawr recover from Redis outages. diff --git a/src/gafaelfawr/constants.py b/src/gafaelfawr/constants.py index b6b096ffd..7c614ea02 100644 --- a/src/gafaelfawr/constants.py +++ b/src/gafaelfawr/constants.py @@ -29,6 +29,7 @@ "REDIS_POOL_SIZE", "REDIS_POOL_TIMEOUT", "REDIS_RETRIES", + "REDIS_TIMEOUT", "SCOPE_REGEX", "TOKEN_CACHE_SIZE", "UID_BOT_MIN", @@ -117,7 +118,10 @@ """Size of the Redis connection pool.""" REDIS_POOL_TIMEOUT = 10 -"""How long to wait for a connection from the pool before giving up.""" +"""Seconds to wait for a connection from the pool before giving up.""" + +REDIS_TIMEOUT = 5 +"""Timeout in seconds for a Redis network operation (including connecting).""" # The following constants define per-process cache sizes. diff --git a/src/gafaelfawr/factory.py b/src/gafaelfawr/factory.py index 416301852..2187df799 100644 --- a/src/gafaelfawr/factory.py +++ b/src/gafaelfawr/factory.py @@ -31,6 +31,7 @@ REDIS_POOL_SIZE, REDIS_POOL_TIMEOUT, REDIS_RETRIES, + REDIS_TIMEOUT, ) from .exceptions import NotConfiguredError from .models.ldap import LDAPUserData @@ -142,13 +143,15 @@ async def from_config(cls, config: Config) -> Self: config.redis_url, password=config.redis_password, max_connections=REDIS_POOL_SIZE, - timeout=REDIS_POOL_TIMEOUT, - ), - retry=Retry( - ExponentialBackoff( - base=REDIS_BACKOFF_START, cap=REDIS_BACKOFF_MAX + retry=Retry( + ExponentialBackoff( + base=REDIS_BACKOFF_START, cap=REDIS_BACKOFF_MAX + ), + REDIS_RETRIES, ), - REDIS_RETRIES, + socket_keepalive=True, + socket_timeout=REDIS_TIMEOUT, + timeout=REDIS_POOL_TIMEOUT, ), )