From 247ba66a94e64caf2a9a05ff74a2c58676ac938b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gouteroux?= Date: Sat, 20 Jul 2024 16:00:02 +0200 Subject: [PATCH] feat: add parameter to allow return expired cache in case of errors (#146) --- README.md | 5 ++++- pkg/config/config.go | 31 +++++++++++++++++++++---------- pkg/exporter/cache.go | 4 ++-- pkg/exporter/exporter.go | 4 ++++ pkg/exporter/metrics.go | 17 +++++++++++++---- 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 2779b42..9104e46 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ scripts: max_timeout: enforced: cacheDuration: + useExpiredCacheOnError: discovery: params: : @@ -160,7 +161,9 @@ Prometheus will normally provide an indication of its scrape timeout to the scri For testing purposes, the timeout can be specified directly as a URL parameter (`timeout`). If present, the URL parameter takes priority over the Prometheus HTTP header. -The `cacheDuration` config can be used to cache the results from an execution of the script for the provided time. The provided duration must be parsable by the [`time.ParseDuration`](https://pkg.go.dev/time#ParseDuration) function. If no cache duration is provided or the provided cache duration can not be parsed, the output of an script will not be cached. +The `cacheDuration` config can be used to cache the results from an execution of the script for the provided time. The provided duration must be parsable by the [`time.ParseDuration`](https://pkg.go.dev/time#ParseDuration) function. If no cache duration is provided or the provided cache duration can not be parsed, the output of a script will not be cached. It produces the metric `script_use_cache` to track over time when the returned results come from the cache. + +The `useExpiredCacheOnError` config allows returning the expired cache in case of errors. 
It produces the metric `script_use_expired_cache` to track over time whether an expired cache entry is being returned, which means there is something wrong with the script execution. You can fine tune the script discovery options via optional script `discovery`. All these options will go through prometheus configuration where you can change them via relabel mechanism. There are `params` to define dynamic script parameters (with reserved keys: `params`, `prefix`, `script` and `timeout`) where only value will be used during script invoking (similar to `args`), `prefix` to define prefix for all script metrics, `scrape_interval` to define how often the script scrape should run and `scrape_timeout` to define the scrape timeout for prometheus (similar to `timeout`). diff --git a/pkg/config/config.go b/pkg/config/config.go index 5a6d292..09ba388 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -60,16 +60,17 @@ type Config struct { // ScriptConfig is the configuration for a single script. type ScriptConfig struct { - Name string `yaml:"name"` - Script string `yaml:"script"` - Command string `yaml:"command"` - Args []string `yaml:"args"` - Env map[string]string `yaml:"env"` - AllowEnvOverwrite bool `yaml:"allowEnvOverwrite"` - IgnoreOutputOnFail bool `yaml:"ignoreOutputOnFail"` - Timeout timeout `yaml:"timeout"` - CacheDuration string `yaml:"cacheDuration"` - Discovery scriptDiscovery `yaml:"discovery"` + Name string `yaml:"name"` + Script string `yaml:"script"` + Command string `yaml:"command"` + Args []string `yaml:"args"` + Env map[string]string `yaml:"env"` + AllowEnvOverwrite bool `yaml:"allowEnvOverwrite"` + IgnoreOutputOnFail bool `yaml:"ignoreOutputOnFail"` + Timeout timeout `yaml:"timeout"` + UseExpiredCacheOnError bool `yaml:"useExpiredCacheOnError"` + CacheDuration string `yaml:"cacheDuration"` + Discovery scriptDiscovery `yaml:"discovery"` } // LoadConfig reads the configuration file and umarshal the data into the config struct @@ -251,6 +252,16 @@ func (c *Config) 
GetCacheDuration(scriptName string) *time.Duration { return nil } +// GetUseExpiredCacheOnError returns the UseExpiredCacheOnError parameter for the provided script. +func (c *Config) GetUseExpiredCacheOnError(scriptName string) bool { + for _, script := range c.Scripts { + if script.Name == scriptName { + return script.UseExpiredCacheOnError + } + } + return false +} + // GetDiscoveryScrapeInterval returns the scrape_interval if it is valid duration, otherwise empty string. func (sc *ScriptConfig) GetDiscoveryScrapeInterval() string { _, err := time.ParseDuration(sc.Discovery.ScrapeInterval) diff --git a/pkg/exporter/cache.go b/pkg/exporter/cache.go index d3d87db..42436ca 100644 --- a/pkg/exporter/cache.go +++ b/pkg/exporter/cache.go @@ -15,9 +15,9 @@ type cacheEntry struct { successStatus int } -func getCacheResult(scriptName string, paramValues []string, cacheDuration time.Duration) (*string, *int, *int) { +func getCacheResult(scriptName string, paramValues []string, cacheDuration time.Duration, expCacheOnTimeout bool) (*string, *int, *int) { if entry, ok := cache[fmt.Sprintf("%s--%s", scriptName, strings.Join(paramValues, "-"))]; ok { - if entry.cacheTime.Add(cacheDuration).After(time.Now()) { + if entry.cacheTime.Add(cacheDuration).After(time.Now()) || expCacheOnTimeout { return &entry.formattedOutput, &entry.successStatus, &entry.exitCode } } diff --git a/pkg/exporter/exporter.go b/pkg/exporter/exporter.go index 53a3d37..09904bb 100644 --- a/pkg/exporter/exporter.go +++ b/pkg/exporter/exporter.go @@ -32,6 +32,10 @@ const ( scriptDurationSecondsType = "# TYPE script_duration_seconds gauge" scriptExitCodeHelp = "# HELP script_exit_code The exit code of the script." scriptExitCodeType = "# TYPE script_exit_code gauge" + scriptCacheHelp = "# HELP script_use_cache Script use cache (0 = no, 1 = yes)." + scriptCacheType = "# TYPE script_use_cache gauge" + scriptExpCacheHelp = "# HELP script_use_expired_cache Script re-use expired cache (0 = no, 1 = yes)." 
+ scriptExpCacheType = "# TYPE script_use_expired_cache gauge" ) type Exporter struct { diff --git a/pkg/exporter/metrics.go b/pkg/exporter/metrics.go index 7cd5035..06aba76 100644 --- a/pkg/exporter/metrics.go +++ b/pkg/exporter/metrics.go @@ -42,10 +42,10 @@ func (e *Exporter) metricsHandler(scriptName string, params url.Values, promethe // stale. cacheDuration := e.Config.GetCacheDuration(scriptName) if cacheDuration != nil { - formattedOutput, successStatus, exitCode := getCacheResult(scriptName, paramValues, *cacheDuration) + formattedOutput, successStatus, exitCode := getCacheResult(scriptName, paramValues, *cacheDuration, false) if formattedOutput != nil && successStatus != nil && exitCode != nil { level.Debug(e.Logger).Log("msg", "Returning cached result", "script", scriptName) - return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} %d\n%s\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, *successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, *exitCode, *formattedOutput), nil + return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} %d\n%s\n%s\n%s_use_cache{script=\"%s\"} %d\n%s\n%s\n%s_use_expired_cache{script=\"%s\"} %d\n%s\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, *successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, *exitCode, scriptCacheHelp, scriptCacheType, namespace, scriptName, 1, scriptExpCacheHelp, scriptExpCacheType, namespace, scriptName, 0, *formattedOutput), nil } } @@ -81,6 +81,15 @@ func (e *Exporter) metricsHandler(scriptName string, params url.Values, 
promethe output, exitCode, err := runScript(scriptName, e.Logger, e.logEnv, timeout, e.Config.GetTimeoutEnforced(scriptName), runArgs, runEnv) if err != nil { successStatus = 0 + + useExpiredCacheOnError := e.Config.GetUseExpiredCacheOnError(scriptName) + if cacheDuration != nil && useExpiredCacheOnError { + formattedOutput, successStatus, exitCode := getCacheResult(scriptName, paramValues, *cacheDuration, useExpiredCacheOnError) + if formattedOutput != nil && successStatus != nil && exitCode != nil { + level.Debug(e.Logger).Log("msg", "Returning expired cache result", "script", scriptName) + return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} %d\n%s\n%s\n%s_use_cache{script=\"%s\"} %d\n%s\n%s\n%s_use_expired_cache{script=\"%s\"} %d\n%s\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, *successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, *exitCode, scriptCacheHelp, scriptCacheType, namespace, scriptName, 1, scriptExpCacheHelp, scriptExpCacheType, namespace, scriptName, 1, *formattedOutput), nil + } + } } // Get ignore output parameter and only return success and duration seconds if 'output=ignore'. If the script failed @@ -88,7 +97,7 @@ func (e *Exporter) metricsHandler(scriptName string, params url.Values, promethe // true. 
outputParam := params.Get("output") if outputParam == "ignore" || (successStatus == 0 && e.Config.GetIgnoreOutputOnFail(scriptName)) { - return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} %d\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, exitCode), nil + return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} %d\n%s\n%s\n%s_use_cache{script=\"%s\"} %d\n%s\n%s\n%s_use_expired_cache{script=\"%s\"} %d\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, exitCode, scriptCacheHelp, scriptCacheType, namespace, scriptName, 0, scriptExpCacheHelp, scriptExpCacheType, namespace, scriptName, 0), nil } // Format output @@ -128,7 +137,7 @@ func (e *Exporter) metricsHandler(scriptName string, params url.Values, promethe setCacheResult(scriptName, paramValues, formattedOutput, successStatus, exitCode) } - return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} %d\n%s\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, exitCode, formattedOutput), nil + return fmt.Sprintf("%s\n%s\n%s_success{script=\"%s\"} %d\n%s\n%s\n%s_duration_seconds{script=\"%s\"} %f\n%s\n%s\n%s_exit_code{script=\"%s\"} 
%d\n%s\n%s\n%s_use_cache{script=\"%s\"} %d\n%s\n%s\n%s_use_expired_cache{script=\"%s\"} %d\n%s\n", scriptSuccessHelp, scriptSuccessType, namespace, scriptName, successStatus, scriptDurationSecondsHelp, scriptDurationSecondsType, namespace, scriptName, time.Since(scriptStartTime).Seconds(), scriptExitCodeHelp, scriptExitCodeType, namespace, scriptName, exitCode, scriptCacheHelp, scriptCacheType, namespace, scriptName, 0, scriptExpCacheHelp, scriptExpCacheType, namespace, scriptName, 0, formattedOutput), nil } func (e *Exporter) MetricsHandler(w http.ResponseWriter, r *http.Request) {