diff --git a/changelog.d/20231027_101349_rra_DM_41424.md b/changelog.d/20231027_101349_rra_DM_41424.md new file mode 100644 index 000000000..6f2279b0f --- /dev/null +++ b/changelog.d/20231027_101349_rra_DM_41424.md @@ -0,0 +1,3 @@ +### Bug fixes + +- Add a socket timeout, enable keepalive, and fix the retry specification for the Redis connection pool to help Gafaelfawr recover from Redis outages. diff --git a/src/gafaelfawr/constants.py b/src/gafaelfawr/constants.py index b6b096ffd..7c614ea02 100644 --- a/src/gafaelfawr/constants.py +++ b/src/gafaelfawr/constants.py @@ -29,6 +29,7 @@ "REDIS_POOL_SIZE", "REDIS_POOL_TIMEOUT", "REDIS_RETRIES", + "REDIS_TIMEOUT", "SCOPE_REGEX", "TOKEN_CACHE_SIZE", "UID_BOT_MIN", @@ -117,7 +118,10 @@ """Size of the Redis connection pool.""" REDIS_POOL_TIMEOUT = 10 -"""How long to wait for a connection from the pool before giving up.""" +"""Seconds to wait for a connection from the pool before giving up.""" + +REDIS_TIMEOUT = 5 +"""Timeout in seconds for a Redis network operation (including connecting).""" # The following constants define per-process cache sizes. diff --git a/src/gafaelfawr/factory.py b/src/gafaelfawr/factory.py index 416301852..2187df799 100644 --- a/src/gafaelfawr/factory.py +++ b/src/gafaelfawr/factory.py @@ -31,6 +31,7 @@ REDIS_POOL_SIZE, REDIS_POOL_TIMEOUT, REDIS_RETRIES, + REDIS_TIMEOUT, ) from .exceptions import NotConfiguredError from .models.ldap import LDAPUserData @@ -142,13 +143,15 @@ async def from_config(cls, config: Config) -> Self: config.redis_url, password=config.redis_password, max_connections=REDIS_POOL_SIZE, - timeout=REDIS_POOL_TIMEOUT, - ), - retry=Retry( - ExponentialBackoff( - base=REDIS_BACKOFF_START, cap=REDIS_BACKOFF_MAX + retry=Retry( + ExponentialBackoff( + base=REDIS_BACKOFF_START, cap=REDIS_BACKOFF_MAX + ), + REDIS_RETRIES, ), - REDIS_RETRIES, + socket_keepalive=True, + socket_timeout=REDIS_TIMEOUT, + timeout=REDIS_POOL_TIMEOUT, ), )