Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Retry registration timeout fix #6136

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
377a8c1
fixed timeouts
ConvallariaMaj Jul 15, 2024
d122094
fix tests + make linter happy
ConvallariaMaj Jul 15, 2024
fd3414e
revert config structure
ConvallariaMaj Jul 15, 2024
e3d6d8a
revert config structure2
ConvallariaMaj Jul 15, 2024
94383d7
fix tests
ConvallariaMaj Jul 15, 2024
4fad1a0
make linter happy
ConvallariaMaj Jul 15, 2024
9ff5958
fix review issue
ConvallariaMaj Jul 15, 2024
4046cc5
fix review issue + revert naming
ConvallariaMaj Jul 16, 2024
ab0b71e
removed overengineering
ConvallariaMaj Jul 17, 2024
c969205
removed overengineering2
ConvallariaMaj Jul 17, 2024
9435aee
removed overengineering2
ConvallariaMaj Jul 17, 2024
d515791
reverted timeout to Proof method, fixed some tests
ConvallariaMaj Jul 17, 2024
50a3e0e
added PositioningATXSelectionTimeout to all envs
ConvallariaMaj Jul 17, 2024
4d067c6
fix test and timeout
ConvallariaMaj Jul 17, 2024
fcd2ab1
revert unnecessary changes
ConvallariaMaj Jul 17, 2024
eb4e623
added PositioningATXSelectionTimeout to test env
ConvallariaMaj Jul 17, 2024
bfe0c34
fix flag
ConvallariaMaj Jul 17, 2024
bc743d2
added changelog
ConvallariaMaj Jul 18, 2024
0d30ee1
unlimited retry
ConvallariaMaj Jul 27, 2024
d9b0513
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Jul 28, 2024
c5be14d
fix issues
ConvallariaMaj Aug 1, 2024
2c53d3e
review fixes
ConvallariaMaj Aug 7, 2024
50e3129
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 7, 2024
8e729ea
minor fixes
ConvallariaMaj Aug 12, 2024
cd14fc0
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 12, 2024
1e79763
make linter happy
ConvallariaMaj Aug 12, 2024
1d0dfb0
fix comment
ConvallariaMaj Aug 12, 2024
de5cd55
custom linear jitter
ConvallariaMaj Aug 14, 2024
b7fcf55
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 14, 2024
3ac16db
fix review issues
ConvallariaMaj Aug 14, 2024
24a4056
Merge branch 'develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ See [RELEASE](./RELEASE.md) for workflow instructions.
* [#6142](https://github.com/spacemeshos/go-spacemesh/pull/6142) Fix node not dropping peers that are broadcasting
invalid ATXs.

fasmat marked this conversation as resolved.
Show resolved Hide resolved
* [#6035](https://github.com/spacemeshos/go-spacemesh/issues/6035) Fixed an issue where the node retried registering for the PoET round
only for 15-20 minutes instead of continuing until the start of the round
fasmat marked this conversation as resolved.
Show resolved Hide resolved
## Release v1.6.3

### Improvements
Expand Down
19 changes: 11 additions & 8 deletions activation/activation.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,18 @@ var (

// PoetConfig is the configuration to interact with the poet server.
type PoetConfig struct {
PhaseShift time.Duration `mapstructure:"phase-shift"`
CycleGap time.Duration `mapstructure:"cycle-gap"`
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
MaxRequestRetries int `mapstructure:"retry-max"`
// Start of new PoET round
poszu marked this conversation as resolved.
Show resolved Hide resolved
PhaseShift time.Duration `mapstructure:"phase-shift"`
// A gap between end of old PoET round and start of new one
fasmat marked this conversation as resolved.
Show resolved Hide resolved
CycleGap time.Duration `mapstructure:"cycle-gap"`
// Time in the end of cycle gap, when PoST challenge must be build and send to PoET server
poszu marked this conversation as resolved.
Show resolved Hide resolved
GracePeriod time.Duration `mapstructure:"grace-period"`
// Period to find positioning ATX. Must be less, than GracePeriod
PositioningATXSelectionTimeout time.Duration `mapstructure:"positioning-atx-selection-timeout"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
MaxRequestRetries int `mapstructure:"retry-max"`
}

func DefaultPoetConfig() PoetConfig {
Expand Down Expand Up @@ -547,7 +551,6 @@ func (b *Builder) BuildNIPostChallenge(ctx context.Context, nodeID types.NodeID)
case <-time.After(time.Until(wait)):
}
}

if b.poetCfg.PositioningATXSelectionTimeout > 0 {
var cancel context.CancelFunc

Expand Down
8 changes: 2 additions & 6 deletions activation/nipost.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ func (nb *NIPostBuilder) BuildNIPost(
}

events.EmitPoetWaitProof(signer.NodeID(), postChallenge.PublishEpoch, poetRoundEnd)
poetProofRef, membership, err = nb.getBestProof(ctx, signer.NodeID(), challenge, postChallenge.PublishEpoch)
poetProofRef, membership, err = nb.getBestProof(ctx, signer.NodeID(), challenge)
if err != nil {
return nil, &PoetSvcUnstableError{msg: "getBestProof failed", source: err}
}
Expand Down Expand Up @@ -372,10 +372,7 @@ func (nb *NIPostBuilder) submitPoetChallenge(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, nb.poetCfg.RequestTimeout)
defer cancel()

round, err := client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID)
round, err := client.Submit(ctx, deadline, prefix, challenge, signature, nodeID)
if err != nil {
return &PoetSvcUnstableError{msg: "failed to submit challenge to poet service", source: err}
}
Expand Down Expand Up @@ -451,7 +448,6 @@ func (nb *NIPostBuilder) getBestProof(
ctx context.Context,
nodeID types.NodeID,
challenge types.Hash32,
publishEpoch types.EpochID,
) (types.PoetProofRef, *types.MerkleProof, error) {
type poetProof struct {
poet *types.PoetProof
Expand Down
64 changes: 43 additions & 21 deletions activation/poet.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"io"
"math"
"net/http"
"net/url"
"sync"
Expand Down Expand Up @@ -33,6 +34,12 @@ var (
errCertificatesNotSupported = errors.New("poet doesn't support certificates")
)

const (
submitPath = "/v1/submit"
infoPath = "/v1/info"
proofPath = "/v1/proofs"
)
poszu marked this conversation as resolved.
Show resolved Hide resolved

type PoetPowParams struct {
Challenge []byte
Difficulty uint
Expand Down Expand Up @@ -67,10 +74,11 @@ type PoetClient interface {

// HTTPPoetClient implements PoetProvingServiceClient interface.
type HTTPPoetClient struct {
id []byte
baseURL *url.URL
client *retryablehttp.Client
logger *zap.Logger
id []byte
baseURL *url.URL
client *retryablehttp.Client
submitChallengeClient *retryablehttp.Client
logger *zap.Logger
}

func checkRetry(ctx context.Context, resp *http.Response, err error) (bool, error) {
Expand Down Expand Up @@ -134,6 +142,14 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
CheckRetry: checkRetry,
}

submitChallengeClient := &retryablehttp.Client{
RetryMax: math.MaxInt,
RetryWaitMin: cfg.RequestRetryDelay,
RetryWaitMax: 2 * cfg.RequestRetryDelay,
Backoff: retryablehttp.LinearJitterBackoff,
fasmat marked this conversation as resolved.
Show resolved Hide resolved
CheckRetry: checkRetry,
poszu marked this conversation as resolved.
Show resolved Hide resolved
}

baseURL, err := url.Parse(server.Address)
if err != nil {
return nil, fmt.Errorf("parsing address: %w", err)
Expand All @@ -143,10 +159,11 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
}

poetClient := &HTTPPoetClient{
id: server.Pubkey.Bytes(),
baseURL: baseURL,
client: client,
logger: zap.NewNop(),
id: server.Pubkey.Bytes(),
baseURL: baseURL,
client: client,
submitChallengeClient: submitChallengeClient,
logger: zap.NewNop(),
}
for _, opt := range opts {
opt(poetClient)
Expand All @@ -156,11 +173,11 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
"created poet client",
zap.Stringer("url", baseURL),
zap.Binary("pubkey", server.Pubkey.Bytes()),
zap.Int("max retries", client.RetryMax),
zap.Int("default max retries", client.RetryMax),
zap.Int("submit challenge max retries", submitChallengeClient.RetryMax),
zap.Duration("min retry wait", client.RetryWaitMin),
zap.Duration("max retry wait", client.RetryWaitMax),
)

return poetClient, nil
}

Expand Down Expand Up @@ -231,7 +248,7 @@ func (c *HTTPPoetClient) Submit(
}

resBody := rpcapi.SubmitResponse{}
if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody); err != nil {
if err := c.req(ctx, http.MethodPost, submitPath, &request, &resBody); err != nil {
return nil, fmt.Errorf("submitting challenge: %w", err)
}
roundEnd := time.Time{}
Expand All @@ -244,7 +261,7 @@ func (c *HTTPPoetClient) Submit(

func (c *HTTPPoetClient) info(ctx context.Context) (*rpcapi.InfoResponse, error) {
resBody := rpcapi.InfoResponse{}
if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, infoPath, nil, &resBody); err != nil {
return nil, fmt.Errorf("getting poet info: %w", err)
}
return &resBody, nil
Expand All @@ -253,7 +270,7 @@ func (c *HTTPPoetClient) info(ctx context.Context) (*rpcapi.InfoResponse, error)
// Proof implements PoetProvingServiceClient.
func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.PoetProofMessage, []types.Hash32, error) {
resBody := rpcapi.ProofResponse{}
if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, fmt.Sprintf(proofPath+"/%s", roundID), nil, &resBody); err != nil {
poszu marked this conversation as resolved.
Show resolved Hide resolved
return nil, nil, fmt.Errorf("getting proof: %w", err)
}

Expand Down Expand Up @@ -297,7 +314,13 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody,
}
req.Header.Set("Content-Type", "application/json")

res, err := c.client.Do(req)
var res *http.Response

if path == submitPath {
res, err = c.submitChallengeClient.Do(req)
} else {
res, err = c.client.Do(req)
}
fasmat marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return fmt.Errorf("doing request: %w", err)
}
Expand Down Expand Up @@ -341,9 +364,10 @@ type certifierInfo struct {
// poetService is a higher-level interface to communicate with a PoET service.
// It wraps the HTTP client, adding additional functionality.
type poetService struct {
db poetDbAPI
logger *zap.Logger
client PoetClient
db poetDbAPI
logger *zap.Logger
client PoetClient

requestTimeout time.Duration

// Used to avoid concurrent requests for proof.
Expand Down Expand Up @@ -481,9 +505,7 @@ func (c *poetService) Submit(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
defer cancel()
round, err := c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
round, err := c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
switch {
case err == nil:
return round, nil
Expand All @@ -493,7 +515,7 @@ func (c *poetService) Submit(
if err != nil {
return nil, fmt.Errorf("recertifying: %w", err)
}
return c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
return c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
}
return nil, fmt.Errorf("submitting challenge: %w", err)
}
Expand Down
51 changes: 51 additions & 0 deletions activation/poet_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,57 @@ func Test_HTTPPoetClient_Submit(t *testing.T) {
require.NoError(t, err)
}

func Test_HTTPPoetClient_SubmitTillCtxDeadlineReached(t *testing.T) {
t.Parallel()

ctxWithDeadline, cancel := context.WithDeadline(context.Background(), time.Now().Add(5*time.Second))
defer cancel()

mux := http.NewServeMux()
mux.HandleFunc("POST /v1/submit", func(w http.ResponseWriter, r *http.Request) {
now := time.Now()
deadline, _ := ctxWithDeadline.Deadline()
remain := deadline.Sub(now)

if remain.Seconds() <= time.Second.Seconds() {
resp, err := protojson.Marshal(&rpcapi.SubmitResponse{})
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
} else {
w.Write(resp)
}
return
}
http.Error(w, "some_error", http.StatusInternalServerError)
})
fasmat marked this conversation as resolved.
Show resolved Hide resolved

ts := httptest.NewServer(mux)
defer ts.Close()

cfg := server.DefaultRoundConfig()
client, err := NewHTTPPoetClient(types.PoetServer{Address: ts.URL}, PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
MaxRequestRetries: 1,
}, withCustomHttpClient(ts.Client()))
require.NoError(t, err)

startTime := time.Now()
_, err = client.Submit(
ctxWithDeadline,
time.Time{},
nil,
nil,
types.EmptyEdSignature,
types.NodeID{},
PoetAuth{},
)
duration := time.Since(startTime)

require.NoError(t, err)
require.GreaterOrEqual(t, duration.Seconds(), float64(4))
}
poszu marked this conversation as resolved.
Show resolved Hide resolved

func Test_HTTPPoetClient_Address(t *testing.T) {
t.Parallel()
t.Run("with scheme", func(t *testing.T) {
Expand Down
8 changes: 4 additions & 4 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,13 @@ func AddFlags(flagSet *pflag.FlagSet, cfg *config.Config) (configPath *string) {
/**======================== PoET Flags ========================== **/

flagSet.DurationVar(&cfg.POET.PhaseShift, "phase-shift",
cfg.POET.PhaseShift, "phase shift of poet server")
cfg.POET.PhaseShift, "phase shift of poet server: duration after epoch start, at which poet round starts")
flagSet.DurationVar(&cfg.POET.CycleGap, "cycle-gap",
cfg.POET.CycleGap, "cycle gap of poet server")
cfg.POET.CycleGap, "cycle gap of poet server: gap between poet rounds")
flagSet.DurationVar(&cfg.POET.GracePeriod, "grace-period",
cfg.POET.GracePeriod, "time before PoET round starts when the node builds and submits a challenge")
cfg.POET.GracePeriod, "time before poet round starts, when the node builds and submits a challenge")
flagSet.DurationVar(&cfg.POET.RequestTimeout, "poet-request-timeout",
cfg.POET.RequestTimeout, "timeout for poet requests")
cfg.POET.RequestTimeout, "default timeout for poet requests")

/**======================== bootstrap data updater Flags ========================== **/

Expand Down
8 changes: 4 additions & 4 deletions config/mainnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,14 +160,14 @@ func MainnetConfig() Config {
BeaconSyncWeightUnits: 800,
},
POET: activation.PoetConfig{
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestTimeout: 1100 * time.Second,
RequestRetryDelay: 10 * time.Second,
MaxRequestRetries: 10,
PhaseShift: 240 * time.Hour,
CycleGap: 12 * time.Hour,
GracePeriod: 1 * time.Hour,
PositioningATXSelectionTimeout: 50 * time.Minute,
fasmat marked this conversation as resolved.
Show resolved Hide resolved
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestTimeout: 1100 * time.Second,
RequestRetryDelay: 10 * time.Second,
MaxRequestRetries: 10,
},
POST: activation.PostConfig{
MinNumUnits: 4,
Expand Down
4 changes: 3 additions & 1 deletion config/presets/fastnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ func fastnet() config.Config {
conf.POET.GracePeriod = 10 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.PositioningATXSelectionTimeout = 8 * time.Second
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3
return conf
Expand Down
4 changes: 3 additions & 1 deletion config/presets/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ func standalone() config.Config {
conf.POET.GracePeriod = 12 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.PositioningATXSelectionTimeout = 8 * time.Second
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3

Expand Down
14 changes: 8 additions & 6 deletions config/presets/testnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,14 @@ func testnet() config.Config {
BeaconSyncWeightUnits: 800,
},
POET: activation.PoetConfig{
PhaseShift: 12 * time.Hour,
CycleGap: 2 * time.Hour,
GracePeriod: 10 * time.Minute,
RequestTimeout: 550 * time.Second, // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestRetryDelay: 5 * time.Second,
MaxRequestRetries: 10,
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestTimeout: 550 * time.Second,
RequestRetryDelay: 5 * time.Second,
MaxRequestRetries: 10,
PositioningATXSelectionTimeout: 8 * time.Minute,
PhaseShift: 12 * time.Hour,
CycleGap: 2 * time.Hour,
GracePeriod: 10 * time.Minute,
},
POST: activation.PostConfig{
MinNumUnits: 2,
Expand Down
Loading