diff --git a/CHANGELOG.md b/CHANGELOG.md index 971c69322f..9ba268ee7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ such and the node will continue to scan new ATXs for their validity. ### Features ### Improvements +* [#6035](https://github.com/spacemeshos/go-spacemesh/issues/6035) Fixed an issue where the node retried registering for the PoET round + only for 15-20 minutes instead of continuing until the start of the round ## Release v1.6.6-hotfix1 diff --git a/activation/activation.go b/activation/activation.go index 567332e36e..cea8b0b778 100644 --- a/activation/activation.go +++ b/activation/activation.go @@ -39,11 +39,18 @@ var ( // PoetConfig is the configuration to interact with the poet server. type PoetConfig struct { - PhaseShift time.Duration `mapstructure:"phase-shift"` - CycleGap time.Duration `mapstructure:"cycle-gap"` - GracePeriod time.Duration `mapstructure:"grace-period"` - RequestTimeout time.Duration `mapstructure:"poet-request-timeout"` - RequestRetryDelay time.Duration `mapstructure:"retry-delay"` + // Offset from the epoch start when the poet round starts + PhaseShift time.Duration `mapstructure:"phase-shift"` + // CycleGap gives the duration between the end of a PoET round and the start of the next + CycleGap time.Duration `mapstructure:"cycle-gap"` + // GracePeriod defines the time before the start of the next PoET round until the node + // waits before building its NiPoST challenge. Shorter durations allow the node to + // possibly pick a better positioning ATX, but come with the risk that the node might + // not be able to validate that ATX and has to fall back to using its own previous ATX. + GracePeriod time.Duration `mapstructure:"grace-period"` + RequestTimeout time.Duration `mapstructure:"poet-request-timeout"` + RequestRetryDelay time.Duration `mapstructure:"retry-delay"` + // Period to find positioning ATX. Must be less, than GracePeriod PositioningATXSelectionTimeout time.Duration `mapstructure:"positioning-atx-selection-timeout"` CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"` PowParamsCacheTTL time.Duration `mapstructure:"pow-params-cache-ttl"` @@ -559,7 +566,6 @@ func (b *Builder) BuildNIPostChallenge(ctx context.Context, nodeID types.NodeID) case <-time.After(time.Until(wait)): } } - if b.poetCfg.PositioningATXSelectionTimeout > 0 { var cancel context.CancelFunc diff --git a/activation/nipost.go b/activation/nipost.go index ec9f8ac9c5..7dfd486159 100644 --- a/activation/nipost.go +++ b/activation/nipost.go @@ -366,10 +366,7 @@ func (nb *NIPostBuilder) submitPoetChallenge( logger.Debug("submitting challenge to poet proving service") - submitCtx, cancel := withConditionalTimeout(ctx, nb.poetCfg.RequestTimeout) - defer cancel() - - round, err := client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID) + round, err := client.Submit(ctx, deadline, prefix, challenge, signature, nodeID) if err != nil { return nipost.PoETRegistration{}, &PoetSvcUnstableError{msg: "failed to submit challenge to poet service", source: err} diff --git a/activation/poet.go b/activation/poet.go index af9f1f09a1..844cd0038b 100644 --- a/activation/poet.go +++ b/activation/poet.go @@ -5,6 +5,8 @@ import ( "errors" "fmt" "io" + "math" + "math/rand/v2" "net/http" "net/url" "sync" @@ -69,10 +71,11 @@ type PoetClient interface { // HTTPPoetClient implements PoetProvingServiceClient interface. type HTTPPoetClient struct { - id []byte - baseURL *url.URL - client *retryablehttp.Client - logger *zap.Logger + id []byte + baseURL *url.URL + client *retryablehttp.Client + submitChallengeClient *retryablehttp.Client + logger *zap.Logger } func checkRetry(ctx context.Context, resp *http.Response, err error) (bool, error) { @@ -126,6 +129,13 @@ func WithLogger(logger *zap.Logger) PoetClientOpts { } } +func customLinearJitterBackoff(min, max time.Duration, _ int, _ *http.Response) time.Duration { + if max <= min { + return min + } + return min + rand.N(max-min) +} + // NewHTTPPoetClient returns new instance of HTTPPoetClient connecting to the specified url. func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClientOpts) (*HTTPPoetClient, error) { client := &retryablehttp.Client{ @@ -136,6 +146,14 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie CheckRetry: checkRetry, } + submitChallengeClient := &retryablehttp.Client{ + RetryMax: math.MaxInt, + RetryWaitMin: cfg.RequestRetryDelay, + RetryWaitMax: 2 * cfg.RequestRetryDelay, + Backoff: customLinearJitterBackoff, + CheckRetry: retryablehttp.DefaultRetryPolicy, + } + baseURL, err := url.Parse(server.Address) if err != nil { return nil, fmt.Errorf("parsing address: %w", err) @@ -145,11 +163,13 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie } poetClient := &HTTPPoetClient{ - id: server.Pubkey.Bytes(), - baseURL: baseURL, - client: client, - logger: zap.NewNop(), + id: server.Pubkey.Bytes(), + baseURL: baseURL, + client: client, + submitChallengeClient: submitChallengeClient, + logger: zap.NewNop(), } + for _, opt := range opts { opt(poetClient) } @@ -158,11 +178,11 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie "created poet client", zap.Stringer("url", baseURL), zap.Binary("pubkey", server.Pubkey.Bytes()), - zap.Int("max retries", client.RetryMax), + zap.Int("default max retries", client.RetryMax), + zap.Int("submit challenge max retries", submitChallengeClient.RetryMax), zap.Duration("min retry wait", client.RetryWaitMin), zap.Duration("max retry wait", client.RetryWaitMax), ) - return poetClient, nil } @@ -176,7 +196,7 @@ func (c *HTTPPoetClient) Address() string { func (c *HTTPPoetClient) PowParams(ctx context.Context) (*PoetPowParams, error) { resBody := rpcapi.PowParamsResponse{} - if err := c.req(ctx, http.MethodGet, "/v1/pow_params", nil, &resBody); err != nil { + if err := c.req(ctx, http.MethodGet, "/v1/pow_params", nil, &resBody, c.client); err != nil { return nil, fmt.Errorf("querying PoW params: %w", err) } @@ -228,7 +248,7 @@ func (c *HTTPPoetClient) Submit( } resBody := rpcapi.SubmitResponse{} - if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody); err != nil { + if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody, c.submitChallengeClient); err != nil { return nil, fmt.Errorf("submitting challenge: %w", err) } roundEnd := time.Time{} @@ -241,7 +261,7 @@ func (c *HTTPPoetClient) Submit( func (c *HTTPPoetClient) Info(ctx context.Context) (*types.PoetInfo, error) { resBody := rpcapi.InfoResponse{} - if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody); err != nil { + if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody, c.client); err != nil { return nil, fmt.Errorf("getting poet info: %w", err) } @@ -268,7 +288,7 @@ func (c *HTTPPoetClient) Info(ctx context.Context) (*types.PoetInfo, error) { // Proof implements PoetProvingServiceClient. func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.PoetProofMessage, []types.Hash32, error) { resBody := rpcapi.ProofResponse{} - if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody); err != nil { + if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody, c.client); err != nil { return nil, nil, fmt.Errorf("getting proof: %w", err) } @@ -300,7 +320,12 @@ func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.Poet return &proof, members, nil } -func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody, resBody proto.Message) error { +func (c *HTTPPoetClient) req( + ctx context.Context, + method, path string, + reqBody, resBody proto.Message, + client *retryablehttp.Client, +) error { jsonReqBody, err := protojson.Marshal(reqBody) if err != nil { return fmt.Errorf("marshaling request body: %w", err) @@ -312,7 +337,7 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody, } req.Header.Set("Content-Type", "application/json") - res, err := c.client.Do(req) + res, err := client.Do(req) if err != nil { return fmt.Errorf("doing request: %w", err) } @@ -343,7 +368,6 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody, return fmt.Errorf("decoding response body to proto: %w", err) } } - return nil } @@ -371,9 +395,10 @@ func (c *cachedData[T]) get(init func() (T, error)) (T, error) { // poetService is a higher-level interface to communicate with a PoET service. // It wraps the HTTP client, adding additional functionality. type poetService struct { - db poetDbAPI - logger *zap.Logger - client PoetClient + db poetDbAPI + logger *zap.Logger + client PoetClient + requestTimeout time.Duration // Used to avoid concurrent requests for proof. @@ -567,9 +592,7 @@ func (c *poetService) Submit( logger.Debug("submitting challenge to poet proving service") - submitCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout) - defer cancel() - round, err := c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth) + round, err := c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth) switch { case err == nil: return round, nil @@ -579,7 +602,7 @@ func (c *poetService) Submit( if err != nil { return nil, fmt.Errorf("authorizing: %w", err) } - return c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth) + return c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth) } return nil, fmt.Errorf("submitting challenge: %w", err) } diff --git a/activation/poet_client_test.go b/activation/poet_client_test.go index 397fbc06b0..f613eb239c 100644 --- a/activation/poet_client_test.go +++ b/activation/poet_client_test.go @@ -83,6 +83,41 @@ func Test_HTTPPoetClient_Submit(t *testing.T) { require.NoError(t, err) } +func Test_HTTPPoetClient_SubmitTillCtxCanceled(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + tries := 0 + mux := http.NewServeMux() + mux.HandleFunc("POST /v1/submit", func(w http.ResponseWriter, r *http.Request) { + tries += 1 + if tries == 3 { + cancel() + } + http.Error(w, "some_error", http.StatusInternalServerError) + }) + ts := httptest.NewServer(mux) + defer ts.Close() + cfg := server.DefaultRoundConfig() + client, err := NewHTTPPoetClient(types.PoetServer{Address: ts.URL}, PoetConfig{ + PhaseShift: cfg.PhaseShift, + CycleGap: cfg.CycleGap, + MaxRequestRetries: 1, + }, withCustomHttpClient(ts.Client())) + require.NoError(t, err) + _, err = client.Submit( + ctx, + time.Time{}, + nil, + nil, + types.EmptyEdSignature, + types.NodeID{}, + PoetAuth{}, + ) + require.ErrorIs(t, err, context.Canceled) + require.Equal(t, 3, tries) +} + func Test_HTTPPoetClient_Address(t *testing.T) { t.Run("with scheme", func(t *testing.T) { t.Parallel() diff --git a/cmd/root.go b/cmd/root.go index 1f64748904..0b8b20b50b 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -317,13 +317,13 @@ func AddFlags(flagSet *pflag.FlagSet, cfg *config.Config) (configPath *string) { /**======================== PoET Flags ========================== **/ flagSet.DurationVar(&cfg.POET.PhaseShift, "phase-shift", - cfg.POET.PhaseShift, "phase shift of poet server") + cfg.POET.PhaseShift, "phase shift of poet server: duration after epoch start, at which poet round starts") flagSet.DurationVar(&cfg.POET.CycleGap, "cycle-gap", - cfg.POET.CycleGap, "cycle gap of poet server") + cfg.POET.CycleGap, "cycle gap of poet server: gap between poet rounds") flagSet.DurationVar(&cfg.POET.GracePeriod, "grace-period", - cfg.POET.GracePeriod, "time before PoET round starts when the node builds and submits a challenge") + cfg.POET.GracePeriod, "time before poet round starts, when the node builds and submits a challenge") flagSet.DurationVar(&cfg.POET.RequestTimeout, "poet-request-timeout", - cfg.POET.RequestTimeout, "timeout for poet requests") + cfg.POET.RequestTimeout, "default timeout for poet requests") /**======================== bootstrap data updater Flags ========================== **/ diff --git a/config/presets/fastnet.go b/config/presets/fastnet.go index 225a2482d5..2b9fe78417 100644 --- a/config/presets/fastnet.go +++ b/config/presets/fastnet.go @@ -96,7 +96,9 @@ func fastnet() config.Config { conf.POET.GracePeriod = 10 * time.Second conf.POET.CycleGap = 30 * time.Second conf.POET.PhaseShift = 30 * time.Second - conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2 + conf.POET.PositioningATXSelectionTimeout = 8 * time.Second + // RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2 + conf.POET.RequestTimeout = 12 * time.Second conf.POET.RequestRetryDelay = 1 * time.Second conf.POET.MaxRequestRetries = 3 conf.POET.CertifierInfoCacheTTL = time.Minute diff --git a/config/presets/standalone.go b/config/presets/standalone.go index 899b1d48ba..937f3c8f4c 100644 --- a/config/presets/standalone.go +++ b/config/presets/standalone.go @@ -80,7 +80,9 @@ func standalone() config.Config { conf.POET.GracePeriod = 12 * time.Second conf.POET.CycleGap = 30 * time.Second conf.POET.PhaseShift = 30 * time.Second - conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2 + conf.POET.PositioningATXSelectionTimeout = 8 * time.Second + // RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2 + conf.POET.RequestTimeout = 12 * time.Second conf.POET.RequestRetryDelay = 1 * time.Second conf.POET.MaxRequestRetries = 3 conf.POET.CertifierInfoCacheTTL = time.Minute