Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Retry registration timeout fix #6136

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
377a8c1
fixed timeouts
ConvallariaMaj Jul 15, 2024
d122094
fix tests + make linter happy
ConvallariaMaj Jul 15, 2024
fd3414e
revert config structure
ConvallariaMaj Jul 15, 2024
e3d6d8a
revert config structure2
ConvallariaMaj Jul 15, 2024
94383d7
fix tests
ConvallariaMaj Jul 15, 2024
4fad1a0
make linter happy
ConvallariaMaj Jul 15, 2024
9ff5958
fix review issue
ConvallariaMaj Jul 15, 2024
4046cc5
fix review issue + revert naming
ConvallariaMaj Jul 16, 2024
ab0b71e
removed overengineering
ConvallariaMaj Jul 17, 2024
c969205
removed overengineering2
ConvallariaMaj Jul 17, 2024
9435aee
removed overengineering2
ConvallariaMaj Jul 17, 2024
d515791
reverted timeout to Proof method, fixed some tests
ConvallariaMaj Jul 17, 2024
50a3e0e
added PositioningATXSelectionTimeout to all envs
ConvallariaMaj Jul 17, 2024
4d067c6
fix test and timeout
ConvallariaMaj Jul 17, 2024
fcd2ab1
revert unnecessary changes
ConvallariaMaj Jul 17, 2024
eb4e623
added PositioningATXSelectionTimeout to test env
ConvallariaMaj Jul 17, 2024
bfe0c34
fix flag
ConvallariaMaj Jul 17, 2024
bc743d2
added changelog
ConvallariaMaj Jul 18, 2024
0d30ee1
unlimited retry
ConvallariaMaj Jul 27, 2024
d9b0513
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Jul 28, 2024
c5be14d
fix issues
ConvallariaMaj Aug 1, 2024
2c53d3e
review fixes
ConvallariaMaj Aug 7, 2024
50e3129
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 7, 2024
8e729ea
minor fixes
ConvallariaMaj Aug 12, 2024
cd14fc0
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 12, 2024
1e79763
make linter happy
ConvallariaMaj Aug 12, 2024
1d0dfb0
fix comment
ConvallariaMaj Aug 12, 2024
de5cd55
custom linear jitter
ConvallariaMaj Aug 14, 2024
b7fcf55
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 14, 2024
3ac16db
fix review issues
ConvallariaMaj Aug 14, 2024
24a4056
Merge branch 'develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

See [RELEASE](./RELEASE.md) for workflow instructions.

## UNRELEASED
### Improvements
* [#6035](https://github.com/spacemeshos/go-spacemesh/issues/6035)
Added poet config parameters to establish submit registration challenge and fetch poet proof timeouts.
- `submit-challenge-timeout`: Timeout duration for submitting registration challenges to the PoET service.
Optional parameter, if not set, submitting will be continued till end of cyclegap
- `fetch-proof-timeout`: Timeout duration for fetching proofs from PoET service.
Optional parameter, if not set, fetching proof will be continued till beginning of cyclegap in publish epoch

## Release v1.6.1

### Improvements
Expand Down
23 changes: 15 additions & 8 deletions activation/activation.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,22 @@ var (

// PoetConfig is the configuration to interact with the poet server.
type PoetConfig struct {
PhaseShift time.Duration `mapstructure:"phase-shift"`
CycleGap time.Duration `mapstructure:"cycle-gap"`
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
MaxRequestRetries int `mapstructure:"retry-max"`
// Start of new PoET round
poszu marked this conversation as resolved.
Show resolved Hide resolved
PhaseShift time.Duration `mapstructure:"phase-shift"`
// A gap between end of old PoET round and start of new one
fasmat marked this conversation as resolved.
Show resolved Hide resolved
CycleGap time.Duration `mapstructure:"cycle-gap"`
// Time in the end of cycle gap, when PoST challenge must be build and send to PoET server
poszu marked this conversation as resolved.
Show resolved Hide resolved
GracePeriod time.Duration `mapstructure:"grace-period"`
// Period to find positioning ATX. Must be less, than GracePeriod
PositioningATXSelectionTimeout time.Duration `mapstructure:"positioning-atx-selection-timeout"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
MaxRequestRetries int `mapstructure:"retry-max"`
// Period to submit PoST challenge to PoET server. Must be not greater than GracePeriod
SubmitChallengeTimeout time.Duration `mapstructure:"submit-challenge-timeout"`
poszu marked this conversation as resolved.
Show resolved Hide resolved
// Period to fetch PoET proof
FetchProofTimeout time.Duration `mapstructure:"fetch-proof-timeout"`
}

func DefaultPoetConfig() PoetConfig {
Expand Down Expand Up @@ -547,7 +555,6 @@ func (b *Builder) BuildNIPostChallenge(ctx context.Context, nodeID types.NodeID)
case <-time.After(time.Until(wait)):
}
}

if b.poetCfg.PositioningATXSelectionTimeout > 0 {
var cancel context.CancelFunc

Expand Down
11 changes: 9 additions & 2 deletions activation/activation_multi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,10 @@ func TestRegossip(t *testing.T) {
}

func Test_Builder_Multi_InitialPost(t *testing.T) {
tab := newTestBuilder(t, 5, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 5, WithPoetConfig(
PoetConfig{
PhaseShift: layerDuration * 4,
}))

var eg errgroup.Group
for _, sig := range tab.signers {
Expand Down Expand Up @@ -271,7 +274,11 @@ func Test_Builder_Multi_InitialPost(t *testing.T) {

func Test_Builder_Multi_HappyPath(t *testing.T) {
layerDuration := 2 * time.Second
tab := newTestBuilder(t, 3, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4, CycleGap: layerDuration}))
tab := newTestBuilder(t, 3, WithPoetConfig(
PoetConfig{
PhaseShift: layerDuration * 4,
CycleGap: layerDuration,
}))

// step 1: build initial posts
initialPostChan := make(chan struct{})
Expand Down
21 changes: 13 additions & 8 deletions activation/activation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,10 @@ func TestBuilder_StopSmeshing_failsWhenNotStarted(t *testing.T) {
}

func TestBuilder_PublishActivationTx_HappyFlow(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration}))
tab := newTestBuilder(t, 1, WithPoetConfig(
PoetConfig{
PhaseShift: layerDuration,
}))
sig := maps.Values(tab.signers)[0]

posEpoch := postGenesisEpoch
Expand Down Expand Up @@ -380,7 +383,9 @@ func TestBuilder_PublishActivationTx_HappyFlow(t *testing.T) {
// failing with ErrATXChallengeExpired.
func TestBuilder_Loop_WaitsOnStaleChallenge(t *testing.T) {
// Arrange
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{
PhaseShift: layerDuration * 4,
}))
sig := maps.Values(tab.signers)[0]

// current layer is too late to be able to build a nipost on time
Expand Down Expand Up @@ -429,7 +434,9 @@ func TestBuilder_Loop_WaitsOnStaleChallenge(t *testing.T) {
}

func TestBuilder_PublishActivationTx_FaultyNet(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{
PhaseShift: layerDuration * 4,
}))
sig := maps.Values(tab.signers)[0]

posEpoch := postGenesisEpoch
Expand Down Expand Up @@ -501,9 +508,7 @@ func TestBuilder_PublishActivationTx_FaultyNet(t *testing.T) {
}

func TestBuilder_PublishActivationTx_UsesExistingChallengeOnLatePublish(t *testing.T) {
poetCfg := PoetConfig{
PhaseShift: layerDuration * 4,
}
poetCfg := PoetConfig{PhaseShift: layerDuration * 4}
tab := newTestBuilder(t, 1, WithPoetConfig(poetCfg))
sig := maps.Values(tab.signers)[0]

Expand Down Expand Up @@ -1140,7 +1145,7 @@ func TestBuilder_RetryPublishActivationTx(t *testing.T) {
}

func TestBuilder_InitialProofGeneratedOnce(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration}))
fasmat marked this conversation as resolved.
Show resolved Hide resolved
sig := maps.Values(tab.signers)[0]

post := nipost.Post{
Expand Down Expand Up @@ -1179,7 +1184,7 @@ func TestBuilder_InitialProofGeneratedOnce(t *testing.T) {
}

func TestBuilder_InitialPostIsPersisted(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration}))
fasmat marked this conversation as resolved.
Show resolved Hide resolved
sig := maps.Values(tab.signers)[0]

commitmentATX := types.RandomATXID()
Expand Down
14 changes: 8 additions & 6 deletions activation/e2e/activation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,14 @@ func Test_BuilderWithMultipleClients(t *testing.T) {
epoch := layerDuration * layersPerEpoch
genesis := time.Now().Add(layerDuration).Round(layerDuration)
poetCfg := activation.PoetConfig{
PhaseShift: epoch,
CycleGap: 3 * epoch / 4,
GracePeriod: epoch / 4,
RequestTimeout: epoch / 5,
RequestRetryDelay: epoch / 50,
MaxRequestRetries: 10,
RequestTimeout: epoch / 5,
RequestRetryDelay: epoch / 50,
MaxRequestRetries: 10,
PhaseShift: epoch,
CycleGap: 3 * epoch / 4,
GracePeriod: epoch / 4,
SubmitChallengeTimeout: epoch / 4,
FetchProofTimeout: epoch / 5,
}

scrypt := testPostSetupOpts(t).Scrypt
Expand Down
8 changes: 2 additions & 6 deletions activation/nipost.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ func (nb *NIPostBuilder) BuildNIPost(
}

events.EmitPoetWaitProof(signer.NodeID(), postChallenge.PublishEpoch, poetRoundEnd)
poetProofRef, membership, err = nb.getBestProof(ctx, signer.NodeID(), challenge, postChallenge.PublishEpoch)
poetProofRef, membership, err = nb.getBestProof(ctx, signer.NodeID(), challenge)
if err != nil {
return nil, &PoetSvcUnstableError{msg: "getBestProof failed", source: err}
}
Expand Down Expand Up @@ -372,10 +372,7 @@ func (nb *NIPostBuilder) submitPoetChallenge(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, nb.poetCfg.RequestTimeout)
defer cancel()

round, err := client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID)
round, err := client.Submit(ctx, deadline, prefix, challenge, signature, nodeID)
if err != nil {
return &PoetSvcUnstableError{msg: "failed to submit challenge to poet service", source: err}
}
Expand Down Expand Up @@ -451,7 +448,6 @@ func (nb *NIPostBuilder) getBestProof(
ctx context.Context,
nodeID types.NodeID,
challenge types.Hash32,
publishEpoch types.EpochID,
) (types.PoetProofRef, *types.MerkleProof, error) {
type poetProof struct {
poet *types.PoetProof
Expand Down
84 changes: 84 additions & 0 deletions activation/nipost_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -807,6 +807,90 @@ func TestNIPSTBuilder_PoetUnstable(t *testing.T) {
})
}

func TestPoetTimeoutsParameters(t *testing.T) {
t.Parallel()

sig, err := signing.NewEdSigner()
require.NoError(t, err)

challenge := &types.NIPostChallenge{
PublishEpoch: postGenesisEpoch + 2,
}

t.Run("submit challenge timeout is too short -> fail", func(t *testing.T) {
db := localsql.InMemory()

require.NoError(t, nipost.AddChallenge(db, sig.NodeID(), challenge))
challengeHash := wire.NIPostChallengeToWireV1(challenge).Hash()

ctrl := gomock.NewController(t)
mclock := defaultLayerClockMock(ctrl)

poetProver := NewMockPoetService(ctrl)
poetProver.EXPECT().
Submit(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), sig.NodeID()).
Return(nil, context.DeadlineExceeded)
poetProver.EXPECT().Address().AnyTimes().Return("http://localhost:9999")

postService := NewMockpostService(ctrl)

nb, err := NewNIPostBuilder(
db,
postService,
zaptest.NewLogger(t),
PoetConfig{SubmitChallengeTimeout: 1 * time.Microsecond},
mclock,
nil,
WithPoetServices(poetProver),
)
require.NoError(t, err)

nipst, err := nb.BuildNIPost(context.Background(), sig, challengeHash, challenge)
require.ErrorIs(t, err, ErrPoetServiceUnstable)
require.Nil(t, nipst)
})
fasmat marked this conversation as resolved.
Show resolved Hide resolved

t.Run("get proof timeout is too short -> fail", func(t *testing.T) {
db := localsql.InMemory()

require.NoError(t, nipost.AddChallenge(db, sig.NodeID(), challenge))
challengeHash := wire.NIPostChallengeToWireV1(challenge).Hash()

ctrl := gomock.NewController(t)
mclock := defaultLayerClockMock(ctrl)

poetProver := NewMockPoetService(ctrl)
poetProver.EXPECT().Address().AnyTimes().Return("http://localhost:9999")
poetProver.EXPECT().Proof(gomock.Any(), "1").Return(nil, nil, context.DeadlineExceeded)
postService := NewMockpostService(ctrl)

err = nipost.AddPoetRegistration(db, sig.NodeID(), nipost.PoETRegistration{
ChallengeHash: challengeHash,
Address: "http://localhost:9999",
RoundID: "1",
RoundEnd: time.Now().Add(1 * time.Second),
})
require.NoError(t, err)

nb, err := NewNIPostBuilder(
db,
postService,
zaptest.NewLogger(t),
PoetConfig{
FetchProofTimeout: 1 * time.Microsecond,
},
mclock,
nil,
WithPoetServices(poetProver),
)
require.NoError(t, err)

nipst, err := nb.BuildNIPost(context.Background(), sig, challengeHash, challenge)
require.ErrorIs(t, err, ErrPoetServiceUnstable)
require.Nil(t, nipst)
})
fasmat marked this conversation as resolved.
Show resolved Hide resolved
}

// TestNIPoSTBuilder_StaleChallenge checks if
// it properly detects that the challenge is stale and the poet round has already started.
func TestNIPoSTBuilder_StaleChallenge(t *testing.T) {
Expand Down
33 changes: 20 additions & 13 deletions activation/poet.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,10 +340,13 @@ type certifierInfo struct {
// poetService is a higher-level interface to communicate with a PoET service.
// It wraps the HTTP client, adding additional functionality.
type poetService struct {
db poetDbAPI
logger *zap.Logger
client PoetClient
requestTimeout time.Duration
db poetDbAPI
logger *zap.Logger
client PoetClient

defaultRequestTimeout time.Duration
submitChallengeRequestTimeout time.Duration
fetchProofRequestTimeout time.Duration

// Used to avoid concurrent requests for proof.
gettingProof sync.Mutex
Expand Down Expand Up @@ -393,12 +396,14 @@ func NewPoetServiceWithClient(
opts ...PoetServiceOpt,
) *poetService {
poetClient := &poetService{
db: db,
logger: logger,
client: client,
requestTimeout: cfg.RequestTimeout,
certifierInfoTTL: cfg.CertifierInfoCacheTTL,
proofMembers: make(map[string][]types.Hash32, 1),
db: db,
logger: logger,
client: client,
defaultRequestTimeout: cfg.RequestTimeout,
submitChallengeRequestTimeout: cfg.SubmitChallengeTimeout,
fetchProofRequestTimeout: cfg.FetchProofTimeout,
certifierInfoTTL: cfg.CertifierInfoCacheTTL,
proofMembers: make(map[string][]types.Hash32, 1),
}

for _, opt := range opts {
Expand Down Expand Up @@ -432,7 +437,7 @@ func (c *poetService) authorize(
// TODO: remove this fallback once we migrate to certificates fully.

logger.Debug("querying for poet pow parameters")
powCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
powCtx, cancel := withConditionalTimeout(ctx, c.defaultRequestTimeout)
defer cancel()
powParams, err := c.client.PowParams(powCtx)
if err != nil {
Expand Down Expand Up @@ -480,7 +485,7 @@ func (c *poetService) Submit(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
submitCtx, cancel := withConditionalTimeout(ctx, c.submitChallengeRequestTimeout)
fasmat marked this conversation as resolved.
Show resolved Hide resolved
defer cancel()
round, err := c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
switch {
Expand All @@ -493,12 +498,14 @@ func (c *poetService) Submit(
return nil, fmt.Errorf("recertifying: %w", err)
}
return c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
case errors.Is(err, context.DeadlineExceeded):
logger.Warn("failed to submit challenge: probably submit challenge timeout is too short", zap.Error(err))
fasmat marked this conversation as resolved.
Show resolved Hide resolved
}
return nil, fmt.Errorf("submitting challenge: %w", err)
}

func (c *poetService) Proof(ctx context.Context, roundID string) (*types.PoetProof, []types.Hash32, error) {
getProofsCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
getProofsCtx, cancel := withConditionalTimeout(ctx, c.fetchProofRequestTimeout)
defer cancel()

c.gettingProof.Lock()
Expand Down
Loading
Loading