Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: remote graphite retries #1085

Merged
merged 37 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
19f04a9
feat: Added retry logic to remote api request
Jun 17, 2022
3122c96
refactor: update with master
AleksandrMatsko Sep 18, 2024
cb08142
refactor: local checker config
AleksandrMatsko Sep 18, 2024
c0890f7
refactor: config structs, remove unused functions
AleksandrMatsko Sep 18, 2024
8c52507
refactor: remove comments, add new lines
AleksandrMatsko Sep 18, 2024
722a211
refactor: remote graphite tests
AleksandrMatsko Sep 18, 2024
38cca14
refactor: remove comment
AleksandrMatsko Sep 18, 2024
85bc18c
refactor: api config
AleksandrMatsko Sep 19, 2024
48dc77a
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 1, 2024
f5c96c1
refactor: recognising isUnavailable code, refactor tests
AleksandrMatsko Oct 1, 2024
6842542
refactor: notifier config
AleksandrMatsko Oct 2, 2024
497eb29
refactor: remove empty line
AleksandrMatsko Oct 2, 2024
8c00a84
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 2, 2024
6db0ad2
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 4, 2024
f922144
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 8, 2024
2e54649
feat: retries config in cmd
AleksandrMatsko Oct 9, 2024
409b734
feat: retries logic
AleksandrMatsko Oct 9, 2024
a98dd11
test: exponential backoff factory and add godocs
AleksandrMatsko Oct 9, 2024
4e03075
test: finish fot exponential backoff factory
AleksandrMatsko Oct 9, 2024
cdace7d
test: for retrier
AleksandrMatsko Oct 9, 2024
f83cace
refactor: (work in progress) remote metric source to use retrier
AleksandrMatsko Oct 9, 2024
7826e24
refactor: configs add tests for configs
AleksandrMatsko Oct 11, 2024
0edd1c5
tests: fix for requests
AleksandrMatsko Oct 11, 2024
ab98963
refactor: add testcases to config_test for remote source, use linter
AleksandrMatsko Oct 15, 2024
acc1c67
refactor: fix remote tests, clock interface
AleksandrMatsko Oct 15, 2024
5e95fdf
merge origin/master into feat/remote-graphite-retries
AleksandrMatsko Oct 15, 2024
139d701
refactor: tests
AleksandrMatsko Oct 15, 2024
8be8caa
refactor: error initialization
AleksandrMatsko Oct 15, 2024
2c87158
style: use linter
AleksandrMatsko Oct 15, 2024
76477a6
style: rename var
AleksandrMatsko Oct 23, 2024
92e0c06
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 25, 2024
1bde467
refactor: deferring closing response body
AleksandrMatsko Oct 25, 2024
eabeaed
refactor: use moira.ValidateStruct for retries and graphite remote co…
AleksandrMatsko Oct 25, 2024
459c9e7
style: use linter
AleksandrMatsko Oct 25, 2024
d325bdc
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 30, 2024
61f61fa
style: remove comments
AleksandrMatsko Oct 31, 2024
fe51434
merge master into feat/remote-graphite-retries
AleksandrMatsko Nov 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions clock/clock.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,17 @@ func NewSystemClock() *SystemClock {
return &SystemClock{}
}

// Now returns now time.Time with UTC location.
// NowUTC reports the current time with its location set to UTC.
func (t *SystemClock) NowUTC() time.Time {
	current := time.Now()

	return current.UTC()
}

// Now returns now time.Time as a Unix time.
// Sleep pauses the current goroutine for at least the passed duration.
// It is a thin wrapper that forwards the duration unchanged to time.Sleep.
func (t *SystemClock) Sleep(duration time.Duration) {
time.Sleep(duration)
}

// NowUnix returns now time.Time as a Unix time.
Tetrergeru marked this conversation as resolved.
Show resolved Hide resolved
// NowUnix reports the current time as a Unix timestamp.
func (t *SystemClock) NowUnix() int64 {
	current := time.Now()

	return current.Unix()
}
38 changes: 32 additions & 6 deletions cmd/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import (
"errors"
"fmt"
"os"
"strconv"
"strings"
"time"

"github.com/moira-alert/moira"
"github.com/moira-alert/moira/metrics"
Expand Down Expand Up @@ -239,21 +241,45 @@ type GraphiteRemoteConfig struct {
User string `yaml:"user"`
// Password for basic auth
Password string `yaml:"password"`
// RetrySeconds is a space-separated list of whole seconds used when retrying remote requests, e.g. "1 1 1"
RetrySeconds string `yaml:"retry_seconds"`
Tetrergeru marked this conversation as resolved.
Show resolved Hide resolved
// HealthCheckTimeout is timeout for remote api health check requests
HealthCheckTimeout string `yaml:"health_check_timeout"`
// HealthCheckRetrySeconds is a space-separated list of whole seconds used when retrying remote api health check requests, e.g. "1 1 1"
HealthCheckRetrySeconds string `yaml:"health_check_retry_seconds"`
}

// getRemoteCommon returns a pointer to the RemoteCommonConfig field of config.
// The receiver is passed by value, so the pointer refers to the field of that
// copy rather than to the caller's original struct.
func (config GraphiteRemoteConfig) getRemoteCommon() *RemoteCommonConfig {
	common := &config.RemoteCommonConfig

	return common
}

// ParseRetrySeconds converts a whitespace-separated list of whole seconds
// (for example "1 1 1") into a slice of time.Duration values, keeping the
// original order.
//
// An empty or blank string yields an empty, non-nil slice.
//
// The function panics if any field is not a valid integer. The panic value is
// an error that wraps the underlying strconv error and names the whole config
// string and the index of the offending field.
func ParseRetrySeconds(retrySecondsString string) []time.Duration {
	fields := strings.Fields(retrySecondsString)
	delays := make([]time.Duration, len(fields))

	for i, field := range fields {
		seconds, err := strconv.Atoi(field)
		if err != nil {
			// The signature has no error result, so a malformed config is still
			// reported via panic, but with enough context to locate the bad value.
			panic(fmt.Errorf("parsing retry seconds %q (field %d): %w", retrySecondsString, i, err))
		}

		delays[i] = time.Duration(seconds) * time.Second
	}

	return delays
}

// GetRemoteSourceSettings returns remote config parsed from moira config files.
// String settings are copied as-is, duration settings are converted with to.Duration,
// and the retry lists are converted with ParseRetrySeconds, which panics when a field
// of the list is not an integer.
func (config *GraphiteRemoteConfig) GetRemoteSourceSettings() *graphiteRemoteSource.Config {
return &graphiteRemoteSource.Config{
URL: config.URL,
CheckInterval: to.Duration(config.CheckInterval),
MetricsTTL: to.Duration(config.MetricsTTL),
Timeout: to.Duration(config.Timeout),
User: config.User,
Password: config.Password,
URL: config.URL,
CheckInterval: to.Duration(config.CheckInterval),
MetricsTTL: to.Duration(config.MetricsTTL),
Timeout: to.Duration(config.Timeout),
User: config.User,
Password: config.Password,
RetrySeconds: ParseRetrySeconds(config.RetrySeconds),
HealthCheckTimeout: to.Duration(config.HealthCheckTimeout),
HealthCheckRetrySeconds: ParseRetrySeconds(config.HealthCheckRetrySeconds),
}
}

Expand Down
1 change: 1 addition & 0 deletions interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,5 +229,6 @@ type PlotTheme interface {
// Clock is an interface to work with Time.
type Clock interface {
NowUTC() time.Time
Sleep(duration time.Duration)
Tetrergeru marked this conversation as resolved.
Show resolved Hide resolved
NowUnix() int64
}
5 changes: 4 additions & 1 deletion local/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ graphite_remote:
cluster_name: Graphite 1
url: "http://graphite:80/render"
check_interval: 60s
timeout: 60s
metrics_ttl: 168h
timeout: 60s
retry_seconds: 1 1 1
health_check_timeout: 6s
health_check_retry_seconds: 1 1 1
prometheus_remote:
- cluster_id: default
cluster_name: Prometheus 1
Expand Down
5 changes: 4 additions & 1 deletion local/checker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ graphite_remote:
cluster_name: Graphite 1
url: "http://graphite:80/render"
check_interval: 60s
timeout: 60s
metrics_ttl: 168h
timeout: 60s
retry_seconds: 1 1 1
health_check_timeout: 6s
health_check_retry_seconds: 1 1 1
prometheus_remote:
- cluster_id: default
cluster_name: Prometheus 1
Expand Down
5 changes: 4 additions & 1 deletion local/notifier.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ graphite_remote:
cluster_name: Graphite 1
url: "http://graphite:80/render"
check_interval: 60s
timeout: 60s
metrics_ttl: 168h
timeout: 60s
retry_seconds: 1 1 1
health_check_timeout: 6s
health_check_retry_seconds: 1 1 1
prometheus_remote:
- cluster_id: default
cluster_name: Prometheus 1
Expand Down
15 changes: 9 additions & 6 deletions metric_source/remote/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ import "time"

// Config represents config from remote storage.
type Config struct {
URL string
CheckInterval time.Duration
MetricsTTL time.Duration
Timeout time.Duration
User string
Password string
URL string
CheckInterval time.Duration
MetricsTTL time.Duration
Timeout time.Duration
User string
Password string
// RetrySeconds is passed by Fetch to makeRequestWithRetries together with Timeout.
// NOTE(review): presumably the delays between retry attempts — confirm against the retry implementation.
RetrySeconds []time.Duration
// HealthCheckTimeout is the timeout IsAvailable passes to makeRequestWithRetries.
HealthCheckTimeout time.Duration
// HealthCheckRetrySeconds is the retry list IsAvailable passes to makeRequestWithRetries.
HealthCheckRetrySeconds []time.Duration
}
51 changes: 35 additions & 16 deletions metric_source/remote/remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"net/http"
"time"

"github.com/moira-alert/moira/clock"

"github.com/moira-alert/moira"
metricSource "github.com/moira-alert/moira/metric_source"
)
Expand All @@ -23,20 +25,34 @@ func (err ErrRemoteTriggerResponse) Error() string {
return err.InternalError.Error()
}

// ErrRemoteUnavailable is a custom error returned by Fetch when a request to the
// remote fails and the remote is reported as unavailable.
type ErrRemoteUnavailable struct {
	// InternalError is the underlying request error; it must be non-nil,
	// because Error delegates to it.
	InternalError error
	// Target is the target that was being fetched when the error occurred.
	Target string
}

// Error is a representation of Error interface method.
// It returns the message of the wrapped InternalError.
func (err ErrRemoteUnavailable) Error() string {
	return err.InternalError.Error()
}

// Unwrap returns the wrapped InternalError so that errors.Is and errors.As
// can inspect the underlying cause.
func (err ErrRemoteUnavailable) Unwrap() error {
	return err.InternalError
}

// Remote is implementation of MetricSource interface, which implements fetch metrics method from remote graphite installation.
type Remote struct {
// config holds the settings passed to Create.
config *Config
// client is the HTTP client; Create builds it with config.Timeout as its timeout.
client *http.Client
// clock is the time source; Create sets it to the system clock.
clock moira.Clock
}

// Create builds a remote metric source from the given config.
// It returns an error when config.URL is empty; otherwise the source gets an
// HTTP client with config.Timeout and the system clock.
func Create(config *Config) (metricSource.MetricSource, error) {
	if len(config.URL) == 0 {
		return nil, fmt.Errorf("remote graphite URL should not be empty")
	}

	httpClient := &http.Client{Timeout: config.Timeout}
	source := &Remote{
		config: config,
		client: httpClient,
		clock:  clock.NewSystemClock(),
	}

	return source, nil
}

Expand All @@ -53,20 +69,30 @@ func (remote *Remote) Fetch(target string, from, until int64, allowRealTimeAlert
Target: target,
}
}
body, err := remote.makeRequest(req)

body, isRemoteAvailable, err := remote.makeRequestWithRetries(req, remote.config.Timeout, remote.config.RetrySeconds)
if err != nil {
return nil, ErrRemoteTriggerResponse{
if isRemoteAvailable {
Tetrergeru marked this conversation as resolved.
Show resolved Hide resolved
return nil, ErrRemoteTriggerResponse{
InternalError: err,
Target: target,
}
}

return nil, ErrRemoteUnavailable{
InternalError: err,
Target: target,
}
}

resp, err := decodeBody(body)
if err != nil {
return nil, ErrRemoteTriggerResponse{
InternalError: err,
Target: target,
}
}

fetchResult := convertResponse(resp, allowRealTimeAlerting)
return &fetchResult, nil
}
Expand All @@ -76,25 +102,18 @@ func (remote *Remote) GetMetricsTTLSeconds() int64 {
return int64(remote.config.MetricsTTL.Seconds())
}

// IsConfigured always reports true with a nil error.
// It performs no validation itself: an empty graphite URL is already rejected by Create.
func (remote *Remote) IsConfigured() (bool, error) {
return true, nil
}

// IsRemoteAvailable checks if graphite API is available and returns 200 response.
// IsAvailable checks if graphite API is available and returns 200 response.
// It prepares a request for the literal target "NonExistingTarget" over the
// 600 seconds ending now and reports availability based on the outcome of sending it.
// If the request cannot be prepared, it returns false together with that error.
func (remote *Remote) IsAvailable() (bool, error) {
maxRetries := 3
// Probe window: the 600 seconds ending at the current Unix time.
until := time.Now().Unix()
from := until - 600 //nolint

req, err := remote.prepareRequest(from, until, "NonExistingTarget")
if err != nil {
return false, err
}
for attempt := 0; attempt < maxRetries; attempt++ {
_, err = remote.makeRequest(req)
if err == nil {
return true, nil
}
}
return false, err

// Health checks use their own timeout and retry list from the config.
_, isRemoteAvailable, err := remote.makeRequestWithRetries(
req, remote.config.HealthCheckTimeout, remote.config.HealthCheckRetrySeconds)

return isRemoteAvailable, err
}
Loading
Loading