From 56221634241287b9c7206e1b21e853bcd9d1a5d5 Mon Sep 17 00:00:00 2001
From: Dmitry Shulyak
Date: Mon, 5 Feb 2024 09:54:44 +0000
Subject: [PATCH] tortoise: stay in verifying mode while recovering from disk
 (#5514)

closes https://github.com/spacemeshos/go-spacemesh/issues/5246

verifying mode is a faster way to count ballots, and it also has fewer known
security issues.

while debugging I observed the following problems, which are mostly fixed here:
- certificates are not kept forever, and without them tortoise cannot form an
  opinion on a layer. to handle that correctly we update the last layer, and
  the opinion is loaded from the database.
- when data is recovered from disk we need to start counting from the first
  layer in the epoch in order to compute the reference epoch height. this value
  is used later in the verifyLayers call.
- after layers are recovered we compute matching opinions and evict unnecessary
  events. this eviction was too aggressive and should be limited to layers that
  were verified by tortoise, to avoid the same race as in
  https://github.com/spacemeshos/go-spacemesh/pull/5523

additionally I switched tortoise to use IterateAtxs directly on the database
without the LRU cache, as it is faster, and I intend to work on dropping that
cache in other components.
---
 node/node.go                   |  2 +-
 sql/blocks/blocks.go           | 20 ++++++++
 sql/blocks/blocks_test.go      | 53 +++++++++++++++++++++
 tortoise/algorithm.go          | 71 +++++++++++++++++++++++----
 tortoise/model/core.go         |  2 +-
 tortoise/recover.go            | 87 +++++++++++++++++++++------------
 tortoise/recover_test.go       | 64 ++++++++++++++++++++-----
 tortoise/replay/replay_test.go | 74 +++++++++++++++++++++++++++++
 tortoise/tortoise.go           | 52 +++++++++++++++-----
 tortoise/tortoise_test.go      |  6 +--
 tortoise/tracer.go             | 49 ++++++++++++++++---
 tortoise/tracer_test.go        |  2 +-
 tortoise/util.go               |  1 +
 13 files changed, 406 insertions(+), 77 deletions(-)
 create mode 100644 tortoise/replay/replay_test.go

diff --git a/node/node.go b/node/node.go
index 2067715745..5db55e095e 100644
--- a/node/node.go
+++ b/node/node.go
@@ -677,7 +677,7 @@ func (app *App) initServices(ctx context.Context) error {
 		start := time.Now()
 		trtl, err := tortoise.Recover(
 			ctx,
-			app.cachedDB,
+			app.db,
 			app.clock.CurrentLayer(), trtlopts...,
 		)
 		if err != nil {
diff --git a/sql/blocks/blocks.go b/sql/blocks/blocks.go
index f48ad65c91..3c04aee2e0 100644
--- a/sql/blocks/blocks.go
+++ b/sql/blocks/blocks.go
@@ -71,6 +71,26 @@ func Get(db sql.Executor, id types.BlockID) (rst *types.Block, err error) {
 	return rst, err
 }
 
+func LastValid(db sql.Executor) (types.LayerID, error) {
+	var lid types.LayerID
+	// this doesn't use max(layer) because we need rows == 0 when there are no valid layers;
+	// an aggregate query always returns rows == 1, so the rows == 0 check below wouldn't work.
+ rows, err := db.Exec( + "select layer from blocks where validity = 1 order by layer desc limit 1;", + nil, + func(stmt *sql.Statement) bool { + lid = types.LayerID(uint32(stmt.ColumnInt64(0))) + return false + }, + ) + if err != nil { + return lid, fmt.Errorf("get last valid layer: %w", err) + } else if rows == 0 { + return lid, fmt.Errorf("%w: no valid layers", sql.ErrNotFound) + } + return lid, nil +} + func UpdateValid(db sql.Executor, id types.BlockID, valid bool) error { if valid { return SetValid(db, id) diff --git a/sql/blocks/blocks_test.go b/sql/blocks/blocks_test.go index e1d6589ec4..096dc62f2b 100644 --- a/sql/blocks/blocks_test.go +++ b/sql/blocks/blocks_test.go @@ -217,3 +217,56 @@ func TestGetLayer(t *testing.T) { require.Equal(t, b.LayerIndex, lid) } } + +func TestLastValid(t *testing.T) { + t.Run("empty", func(t *testing.T) { + db := sql.InMemory() + _, err := LastValid(db) + require.ErrorIs(t, err, sql.ErrNotFound) + }) + t.Run("all valid", func(t *testing.T) { + db := sql.InMemory() + blocks := map[types.BlockID]struct { + lid types.LayerID + }{ + {1}: {lid: 11}, + {2}: {lid: 22}, + {3}: {lid: 33}, + } + for bid, layer := range blocks { + block := types.NewExistingBlock( + bid, + types.InnerBlock{LayerIndex: layer.lid}, + ) + require.NoError(t, Add(db, block)) + require.NoError(t, SetValid(db, bid)) + } + last, err := LastValid(db) + require.NoError(t, err) + require.Equal(t, 33, int(last)) + }) + t.Run("last is invalid", func(t *testing.T) { + db := sql.InMemory() + blocks := map[types.BlockID]struct { + invalid bool + lid types.LayerID + }{ + {1}: {lid: 11}, + {2}: {lid: 22}, + {3}: {invalid: true, lid: 33}, + } + for bid, layer := range blocks { + block := types.NewExistingBlock( + bid, + types.InnerBlock{LayerIndex: layer.lid}, + ) + require.NoError(t, Add(db, block)) + if !layer.invalid { + require.NoError(t, SetValid(db, bid)) + } + } + last, err := LastValid(db) + require.NoError(t, err) + require.Equal(t, 22, int(last)) + }) +} diff --git a/tortoise/algorithm.go b/tortoise/algorithm.go index 181a133edd..8655dbdfd3 100644 --- a/tortoise/algorithm.go +++ b/tortoise/algorithm.go @@ -130,6 +130,12 @@ func New(opts ...Opt) (*Tortoise, error) { } func (t *Tortoise) RecoverFrom(lid types.LayerID, opinion, prev types.Hash32) { + if lid <= types.GetEffectiveGenesis() { + t.logger.Panic("recover should be after effective genesis", + zap.Uint32("lid", lid.Uint32()), + zap.Uint32("effective genesis", types.GetEffectiveGenesis().Uint32()), + ) + } t.mu.Lock() defer t.mu.Unlock() t.logger.Debug("recover from", @@ -140,7 +146,7 @@ func (t *Tortoise) RecoverFrom(lid types.LayerID, opinion, prev types.Hash32) { t.trtl.evicted = lid - 1 t.trtl.pending = lid t.trtl.verified = lid - t.trtl.processed = lid + t.trtl.processed = lid - 1 // -1 so that iteration in tallyVotes starts from the target layer t.trtl.last = lid layer := t.trtl.layer(lid) layer.opinion = opinion @@ -276,7 +282,7 @@ func (t *Tortoise) TallyVotes(ctx context.Context, lid types.LayerID) { defer t.mu.Unlock() waitTallyVotes.Observe(float64(time.Since(start).Nanoseconds())) start = time.Now() - t.trtl.onLayer(ctx, lid) + t.trtl.tallyVotes(ctx, lid) executeTallyVotes.Observe(float64(time.Since(start).Nanoseconds())) if t.tracer != nil { t.tracer.On(&TallyTrace{Layer: lid}) @@ -307,16 +313,43 @@ func (t *Tortoise) OnBlock(header types.BlockHeader) { } } -// OnValidBlock inserts block, updates that data is stored locally -// and that block was previously considered valid by tortoise. 
-func (t *Tortoise) OnValidBlock(header types.BlockHeader) {
-	start := time.Now()
+// OnRecoveredBlocks loads the given blocks into the state with all metadata.
+//
+// Implementation assumes that layers are loaded in order.
+func (t *Tortoise) OnRecoveredBlocks(lid types.LayerID, validity map[types.BlockHeader]bool, hare *types.BlockID) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	waitBlockDuration.Observe(float64(time.Since(start).Nanoseconds()))
-	t.trtl.onBlock(header, true, true)
+
+	for block, valid := range validity {
+		if exists := t.trtl.getBlock(block); exists != nil {
+			continue
+		}
+		info := newBlockInfo(block)
+		info.data = true
+		if valid {
+			info.validity = support
+		}
+		t.trtl.addBlock(info)
+	}
+	if hare != nil {
+		t.trtl.onHareOutput(lid, *hare)
+	} else if !withinDistance(t.cfg.Hdist, lid, t.trtl.last) {
+		layer := t.trtl.state.layer(lid)
+		layer.hareTerminated = true
+		for _, info := range layer.blocks {
+			if info.validity == abstain {
+				info.validity = against
+			}
+		}
+	}
+
+	t.logger.Debug("loaded recovered blocks",
+		zap.Uint32("lid", lid.Uint32()),
+		zap.Uint32("last", t.trtl.last.Uint32()),
+		zapBlocks(t.trtl.state.layer(lid).blocks),
+	)
 	if t.tracer != nil {
-		t.tracer.On(&BlockTrace{Header: header, Valid: true})
+		t.tracer.On(newRecoveredBlocksTrace(lid, validity, hare))
 	}
 }
 
@@ -607,3 +640,23 @@ func (t *Tortoise) Mode() Mode {
 	}
 	return Verifying
 }
+
+// UpdateLastLayer updates the last layer, which is used for determining weight thresholds.
+func (t *Tortoise) UpdateLastLayer(last types.LayerID) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.trtl.updateLast(last)
+}
+
+// UpdateVerified sets the verified layer based on the previously known verified layer.
+func (t *Tortoise) UpdateVerified(verified types.LayerID) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.trtl.verified = verified
+}
+
+func (t *Tortoise) WithinHdist(lid types.LayerID) bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return withinDistance(t.cfg.Hdist, lid, t.trtl.last)
+}
diff --git a/tortoise/model/core.go b/tortoise/model/core.go
index bea577cf1d..f249c61385 100644
--- a/tortoise/model/core.go
+++ b/tortoise/model/core.go
@@ -128,7 +128,7 @@ func (c *core) OnMessage(m Messenger, event Message) {
 			m.Send(MessageBallot{Ballot: ballot})
 	case MessageLayerEnd:
 		if ev.LayerID.After(types.GetEffectiveGenesis()) {
-			tortoise.RecoverLayer(context.Background(), c.tortoise, c.cdb, ev.LayerID, c.tortoise.OnBallot)
+			tortoise.RecoverLayer(context.Background(), c.tortoise, c.cdb.Executor, ev.LayerID, c.tortoise.OnBallot)
 			c.tortoise.TallyVotes(context.Background(), ev.LayerID)
 			m.Notify(EventVerified{ID: c.id, Verified: c.tortoise.LatestComplete(), Layer: ev.LayerID})
 		}
diff --git a/tortoise/recover.go b/tortoise/recover.go
index 3e7f8b6777..5e7b541358 100644
--- a/tortoise/recover.go
+++ b/tortoise/recover.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 
 	"github.com/spacemeshos/go-spacemesh/common/types"
-	"github.com/spacemeshos/go-spacemesh/datastore"
 	"github.com/spacemeshos/go-spacemesh/sql"
 	"github.com/spacemeshos/go-spacemesh/sql/atxs"
 	"github.com/spacemeshos/go-spacemesh/sql/ballots"
@@ -20,7 +19,7 @@ import (
 // Recover tortoise state from database.
 func Recover(
 	ctx context.Context,
-	db *datastore.CachedDB,
+	db sql.Executor,
 	current types.LayerID,
 	opts ...Opt,
 ) (*Tortoise, error) {
@@ -33,17 +32,21 @@ func Recover(
 	if err != nil {
 		return nil, fmt.Errorf("failed to load latest known layer: %w", err)
 	}
-
 	applied, err := layers.GetLastApplied(db)
 	if err != nil {
 		return nil, fmt.Errorf("get last applied: %w", err)
 	}
+
 	start := types.GetEffectiveGenesis() + 1
 	if applied > types.LayerID(trtl.cfg.WindowSize) {
+		// we want to emulate the same condition as during genesis, with one difference:
+		// genesis starts with a zero opinion (aggregated hash) - see computeOpinion method.
+		// but in this case the first processed layer should use the non-zero opinion of the previous layer.
+
 		window := applied - types.LayerID(trtl.cfg.WindowSize)
-		window = window.GetEpoch().
-			FirstLayer()
-		// windback to the start of the epoch to load ref ballots
+		// we start tallying votes from the first layer of the epoch to guarantee that we load reference ballots.
+		// reference ballots track the beacon and eligibilities.
+		window = window.GetEpoch().FirstLayer()
 		if window > start {
 			prev, err1 := layers.GetAggregatedHash(db, window-1)
 			opinion, err2 := layers.GetAggregatedHash(db, window)
@@ -70,18 +73,14 @@ func Recover(
 		}
 	}
 
-	epoch, err := atxs.LatestEpoch(db)
-	if err != nil {
-		return nil, fmt.Errorf("failed to load latest epoch: %w", err)
+	valid, err := blocks.LastValid(db)
+	if err != nil && !errors.Is(err, sql.ErrNotFound) {
+		return nil, fmt.Errorf("get last valid: %w", err)
 	}
-	epoch++ // recoverEpoch expects target epoch, rather than publish
-	if last.GetEpoch() != epoch {
-		for eid := last.GetEpoch(); eid <= epoch; eid++ {
-			if err := recoverEpoch(eid, trtl, db); err != nil {
-				return nil, err
-			}
-		}
+	if err == nil {
+		trtl.UpdateVerified(valid)
 	}
+	trtl.UpdateLastLayer(last)
 	for lid := start; !lid.After(last); lid = lid.Add(1) {
 		select {
 		case <-ctx.Done():
@@ -101,9 +100,9 @@ func Recover(
 		return trtl, nil
 	}
 	trtl.TallyVotes(ctx, last)
-	// find topmost layer that was already applied and reset pending
-	// so that result for that layer is not returned
-	for prev := last - 1; prev >= start; prev-- {
+	// find the topmost layer that was already applied with the same result
+	// and reset pending so that the result for that layer is not returned
+	for prev := valid; prev >= start; prev-- {
 		opinion, err := layers.GetAggregatedHash(db, prev)
 		if err == nil && opinion != types.EmptyLayerHash {
 			if trtl.OnApplied(prev, opinion) {
@@ -114,19 +113,33 @@ func Recover(
 			return nil, fmt.Errorf("check opinion %w", err)
 		}
 	}
+	// load activations from future epochs that are not yet referenced by the ballots
+	epoch, err := atxs.LatestEpoch(db)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load latest epoch: %w", err)
+	}
+	epoch++ // recoverEpoch expects target epoch, rather than publish
+	if last.GetEpoch() != epoch {
+		for eid := last.GetEpoch() + 1; eid <= epoch; eid++ {
+			if err := recoverEpoch(eid, trtl, db); err != nil {
+				return nil, err
+			}
+		}
+	}
 	return trtl, nil
 }
 
-func recoverEpoch(epoch types.EpochID, trtl *Tortoise, db *datastore.CachedDB) error {
-	if err := db.IterateEpochATXHeaders(epoch, func(header *types.ActivationTxHeader) error {
-		trtl.OnAtx(header.ToData())
-		return nil
+func recoverEpoch(target types.EpochID, trtl *Tortoise, db sql.Executor) error {
+	publish := target - 1
+	if err := atxs.IterateAtxs(db, publish, publish, func(atx *types.VerifiedActivationTx) bool {
+		trtl.OnAtx(atx.ToHeader().ToData())
+		return true
 	}); err != nil {
-		return err
+
return fmt.Errorf("iterate atxs: %w", err) } - beacon, err := beacons.Get(db, epoch) + beacon, err := beacons.Get(db, target) if err == nil && beacon != types.EmptyBeacon { - trtl.OnBeacon(epoch, beacon) + trtl.OnBeacon(target, beacon) } return nil } @@ -136,7 +149,7 @@ type ballotFunc func(*types.BallotTortoiseData) func RecoverLayer( ctx context.Context, trtl *Tortoise, - db *datastore.CachedDB, + db sql.Executor, lid types.LayerID, onBallot ballotFunc, ) error { @@ -149,29 +162,37 @@ func RecoverLayer( if err != nil { return err } + + results := map[types.BlockHeader]bool{} for _, block := range blocksrst { valid, err := blocks.IsValid(db, block.ID()) if err != nil && errors.Is(err, sql.ErrNotFound) { return err } - if valid { - trtl.OnValidBlock(block.ToVote()) - } else { - trtl.OnBlock(block.ToVote()) - } + results[block.ToVote()] = valid + } + var hareResult *types.BlockID + if trtl.WithinHdist(lid) { hare, err := certificates.GetHareOutput(db, lid) if err != nil && !errors.Is(err, sql.ErrNotFound) { return err } if err == nil { - trtl.OnHareOutput(lid, hare) + hareResult = &hare } } + trtl.OnRecoveredBlocks(lid, results, hareResult) + // tortoise votes according to the hare only within hdist (protocol parameter). + // also node is free to prune certificates outside hdist to minimize space usage. + // NOTE(dshulyak) we loaded information about malicious identities earlier. ballotsrst, err := ballots.LayerNoMalicious(db, lid) if err != nil { return err } + // NOTE(dshulyak) it is done in two steps so that if ballot from the same layer was used + // as reference or base ballot we will be able to decode it. + // it might be possible to invalidate such ballots, but until then this is required for _, ballot := range ballotsrst { if ballot.EpochData != nil { onBallot(ballot.ToTortoiseData()) diff --git a/tortoise/recover_test.go b/tortoise/recover_test.go index 307ecaab68..5d07289af7 100644 --- a/tortoise/recover_test.go +++ b/tortoise/recover_test.go @@ -8,8 +8,9 @@ import ( "github.com/spacemeshos/go-spacemesh/common/types" "github.com/spacemeshos/go-spacemesh/common/types/result" - "github.com/spacemeshos/go-spacemesh/datastore" "github.com/spacemeshos/go-spacemesh/log/logtest" + "github.com/spacemeshos/go-spacemesh/sql" + "github.com/spacemeshos/go-spacemesh/sql/blocks" "github.com/spacemeshos/go-spacemesh/sql/layers" "github.com/spacemeshos/go-spacemesh/tortoise/sim" ) @@ -17,7 +18,7 @@ import ( type recoveryAdapter struct { testing.TB *Tortoise - db *datastore.CachedDB + db sql.Executor prev types.LayerID } @@ -51,7 +52,13 @@ func TestRecoverState(t *testing.T) { } require.Equal(t, last.Sub(1), verified) - tortoise2, err := Recover(context.Background(), s.GetState(0).DB, last, WithLogger(logtest.New(t)), WithConfig(cfg)) + tortoise2, err := Recover( + context.Background(), + s.GetState(0).DB.Executor, + last, + WithLogger(logtest.New(t)), + WithConfig(cfg), + ) require.NoError(t, err) verified = tortoise2.LatestComplete() require.Equal(t, last.Sub(1), verified) @@ -68,7 +75,13 @@ func TestRecoverEmpty(t *testing.T) { cfg := defaultTestConfig() cfg.LayerSize = size - tortoise, err := Recover(context.Background(), s.GetState(0).DB, 100, WithLogger(logtest.New(t)), WithConfig(cfg)) + tortoise, err := Recover( + context.Background(), + s.GetState(0).DB.Executor, + 100, + WithLogger(logtest.New(t)), + WithConfig(cfg), + ) require.NoError(t, err) require.NotNil(t, tortoise) } @@ -88,13 +101,18 @@ func TestRecoverWithOpinion(t *testing.T) { var last result.Layer for _, rst := range 
trt.Updates() { if rst.Verified { - require.NoError(t, layers.SetMeshHash(s.GetState(0).DB, rst.Layer, rst.Opinion)) + require.NoError(t, layers.SetMeshHash(s.GetState(0).DB.Executor, rst.Layer, rst.Opinion)) + } + for _, block := range rst.Blocks { + if block.Valid { + require.NoError(t, blocks.SetValid(s.GetState(0).DB.Executor, block.Header.ID)) + } } last = rst } tortoise, err := Recover( context.Background(), - s.GetState(0).DB, + s.GetState(0).DB.Executor, last.Layer, WithLogger(logtest.New(t)), WithConfig(cfg), @@ -125,11 +143,22 @@ func TestResetPending(t *testing.T) { require.Len(t, updates1, n+1) require.Equal(t, types.GetEffectiveGenesis(), updates1[0].Layer) require.Equal(t, last, updates1[n].Layer) - for _, item := range updates1[:n/2] { - require.NoError(t, layers.SetMeshHash(s.GetState(0).DB, item.Layer, item.Opinion)) + for _, rst := range updates1[:n/2] { + require.NoError(t, layers.SetMeshHash(s.GetState(0).DB, rst.Layer, rst.Opinion)) + for _, block := range rst.Blocks { + if block.Valid { + require.NoError(t, blocks.SetValid(s.GetState(0).DB.Executor, block.Header.ID)) + } + } } - recovered, err := Recover(context.Background(), s.GetState(0).DB, last, WithLogger(logtest.New(t)), WithConfig(cfg)) + recovered, err := Recover( + context.Background(), + s.GetState(0).DB.Executor, + last, + WithLogger(logtest.New(t)), + WithConfig(cfg), + ) require.NoError(t, err) updates2 := recovered.Updates() require.Len(t, updates2, n/2+1) @@ -159,11 +188,22 @@ func TestWindowRecovery(t *testing.T) { require.Len(t, updates1, n+1) require.Equal(t, types.GetEffectiveGenesis(), updates1[0].Layer) require.Equal(t, last, updates1[n].Layer) - for _, item := range updates1[:epochSize*4] { - require.NoError(t, layers.SetMeshHash(s.GetState(0).DB, item.Layer, item.Opinion)) + for _, rst := range updates1[:epochSize*4] { + require.NoError(t, layers.SetMeshHash(s.GetState(0).DB, rst.Layer, rst.Opinion)) + for _, block := range rst.Blocks { + if block.Valid { + require.NoError(t, blocks.SetValid(s.GetState(0).DB.Executor, block.Header.ID)) + } + } } - recovered, err := Recover(context.Background(), s.GetState(0).DB, last, WithLogger(logtest.New(t)), WithConfig(cfg)) + recovered, err := Recover( + context.Background(), + s.GetState(0).DB.Executor, + last, + WithLogger(logtest.New(t)), + WithConfig(cfg), + ) require.NoError(t, err) updates2 := recovered.Updates() require.Len(t, updates2, epochSize+1) diff --git a/tortoise/replay/replay_test.go b/tortoise/replay/replay_test.go new file mode 100644 index 0000000000..942c9defeb --- /dev/null +++ b/tortoise/replay/replay_test.go @@ -0,0 +1,74 @@ +package replay + +import ( + "context" + "flag" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/spacemeshos/go-spacemesh/common/types" + "github.com/spacemeshos/go-spacemesh/config" + "github.com/spacemeshos/go-spacemesh/log" + "github.com/spacemeshos/go-spacemesh/sql" + "github.com/spacemeshos/go-spacemesh/timesync" + "github.com/spacemeshos/go-spacemesh/tortoise" +) + +var ( + dbpath = flag.String("dbpath", "", "path to database") + level = zap.LevelFlag("level", zapcore.InfoLevel, "set log level") +) + +func TestReplayMainnet(t *testing.T) { + if len(*dbpath) == 0 { + t.Skip("set -dbpath= to run this test") + } + + cfg := config.MainnetConfig() + types.SetLayersPerEpoch(cfg.LayersPerEpoch) + + log.JSONLog(true) + logger := log.NewWithLevel("replay", zap.NewAtomicLevelAt(*level)) + zlog := logger.Zap() + opts := 
[]tortoise.Opt{ + tortoise.WithLogger(logger), + tortoise.WithConfig(cfg.Tortoise), + } + + genesis, err := time.Parse(time.RFC3339, cfg.Genesis.GenesisTime) + require.NoError(t, err) + clock, err := timesync.NewClock( + timesync.WithLayerDuration(cfg.LayerDuration), + timesync.WithTickInterval(1*time.Second), + timesync.WithGenesisTime(genesis), + timesync.WithLogger(log.NewNop().Zap()), + ) + require.NoError(t, err) + + db, err := sql.Open(fmt.Sprintf("file:%s?mode=ro", *dbpath)) + require.NoError(t, err) + + start := time.Now() + trtl, err := tortoise.Recover( + context.Background(), + db, + clock.CurrentLayer(), opts..., + ) + require.NoError(t, err) + updates := trtl.Updates() + zlog.Info( + "initialized", + zap.Duration("duration", time.Since(start)), + zap.Array("updates", log.ArrayMarshalerFunc(func(encoder log.ArrayEncoder) error { + for _, rst := range updates { + encoder.AppendObject(&rst) + } + return nil + })), + ) +} diff --git a/tortoise/tortoise.go b/tortoise/tortoise.go index 064f264715..c9ff9a1e47 100644 --- a/tortoise/tortoise.go +++ b/tortoise/tortoise.go @@ -307,9 +307,7 @@ func (t *turtle) getFullVote(verified, current types.LayerID, block *blockInfo) return layer.coinflip, reasonCoinflip, nil } -func (t *turtle) onLayer(ctx context.Context, last types.LayerID) { - t.logger.Debug("on layer", zap.Uint32("last", last.Uint32())) - defer t.evict(ctx) +func (t *turtle) updateLast(last types.LayerID) { if last.After(t.last) { update := t.last.GetEpoch() != last.GetEpoch() t.last = last @@ -321,19 +319,32 @@ func (t *turtle) onLayer(ctx context.Context, last types.LayerID) { Div(fixed.New64(int64(types.GetLayersPerEpoch()))) } } +} + +func (t *turtle) tallyVotes(ctx context.Context, last types.LayerID) { + defer t.evict(ctx) + + t.logger.Debug("on layer", zap.Uint32("last", last.Uint32())) + t.updateLast(last) if err := t.drainRetriable(); err != nil { return } for process := t.processed.Add(1); !process.After(t.last); process = process.Add(1) { if process.FirstInEpoch() { - t.computeEpochHeight(process.GetEpoch()) + t.computeEpochHeight(process) } layer := t.layer(process) for _, block := range layer.blocks { t.updateRefHeight(layer, block) } - prev := t.layer(process.Sub(1)) - layer.verifying.goodUncounted = layer.verifying.goodUncounted.Add(prev.verifying.goodUncounted) + + // NOTE(dshulyak) i need this when running verifying tortoise not from the genesis. 
+		if previous := process - 1; previous > t.evicted {
+			prev := t.layer(previous)
+			layer.verifying.goodUncounted = layer.verifying.goodUncounted.Add(prev.verifying.goodUncounted)
+			layer.prevOpinion = &prev.opinion
+		}
+
 		t.processed = process
 		processedLayer.Set(float64(t.processed))
 
@@ -351,7 +362,6 @@ func (t *turtle) onLayer(ctx context.Context, last types.LayerID) {
 			}
 		}
 
-		layer.prevOpinion = &prev.opinion
 		opinion := layer.opinion
 		layer.computeOpinion(t.Hdist, t.last)
 		if opinion != layer.opinion {
@@ -362,9 +372,11 @@ func (t *turtle) onLayer(ctx context.Context, last types.LayerID) {
 		}
 
 		t.logger.Debug("initial local opinion",
+			zap.Bool("hare terminated", layer.hareTerminated),
 			zap.Uint32("lid", layer.lid.Uint32()),
 			log.ZShortStringer("previous", opinion),
 			log.ZShortStringer("opinion", layer.opinion),
+			zapBlocks(layer.blocks),
 		)
 		// terminate layer that falls out of the zdist window and wasn't terminated
 		// by any other component
@@ -506,8 +518,8 @@ func (t *turtle) runFull() (verified, changed types.LayerID) {
 	return verified, changed
 }
 
-func (t *turtle) computeEpochHeight(epoch types.EpochID) {
-	einfo := t.epoch(epoch)
+func (t *turtle) computeEpochHeight(lid types.LayerID) {
+	einfo := t.epoch(lid.GetEpoch())
 	heights := make([]uint64, 0, len(einfo.atxs))
 	for _, info := range einfo.atxs {
 		if !info.malfeasant {
@@ -515,12 +527,22 @@ func (t *turtle) computeEpochHeight(epoch types.EpochID) {
 		}
 	}
 	einfo.height = getMedian(heights)
+	t.logger.Debug(
+		"computed epoch height",
+		zap.Uint32("in layer", lid.Uint32()),
+		zap.Uint32("for epoch", lid.GetEpoch().Uint32()),
+		zap.Uint64("height", einfo.height),
+	)
 }
 
 func (t *turtle) onBlock(header types.BlockHeader, data, valid bool) {
 	if header.LayerID <= t.evicted {
 		return
 	}
+
+	t.logger.Debug("on block", zap.Inline(&header), zap.Bool("data", data), zap.Bool("valid", valid))
+
+	// update existing state without calling t.addBlock
 	if binfo := t.state.getBlock(header); binfo != nil {
 		binfo.data = data
 		if valid {
@@ -528,8 +550,6 @@ func (t *turtle) onBlock(header types.BlockHeader, data, valid bool) {
 		}
 		return
 	}
-	t.logger.Debug("on data block", zap.Inline(&header))
-
 	binfo := newBlockInfo(header)
 	binfo.data = data
 	if valid {
@@ -579,6 +599,9 @@ func (t *turtle) onHareOutput(lid types.LayerID, bid types.BlockID) {
 	if exists && previous == bid {
 		return
 	}
+	// we do not compute the opinion here because opinion hashing is recursive,
+ // so if we didn't receive layer in order this opinion will be wrong + // and we also need to copy previous layer opinion into layer.prevOpinion if !lid.After(t.processed) && withinDistance(t.Config.Hdist, lid, t.last) { t.logger.Debug("local opinion changed within hdist", zap.Uint32("lid", lid.Uint32()), @@ -600,6 +623,7 @@ func (t *turtle) onOpinionChange(lid types.LayerID, early bool) { t.logger.Debug("computed local opinion", zap.Uint32("last", t.last.Uint32()), zap.Uint32("lid", layer.lid.Uint32()), + zap.Bool("changed from previous", opinion != layer.opinion), log.ZShortStringer("previous", opinion), log.ZShortStringer("new", layer.opinion), log.ZShortStringer("prev layer", layer.prevOpinion), @@ -800,6 +824,10 @@ func (t *turtle) storeBallot(ballot *ballotInfo, offset types.LayerID) error { if existing != nil { current.supported[i] = existing } else { + if !withinDistance(t.Hdist, block.layer, t.last) { + block.validity = against + block.hare = against + } t.addBlock(block) } } @@ -889,6 +917,8 @@ func getLocalVote(config Config, verified, last types.LayerID, block *blockInfo) if withinDistance(config.Hdist, block.layer, last) { return block.hare, reasonHareOutput } + // if layer was verified, but then global threshold became unreachable we will not + // update validity, but verified variable will be lowered if block.layer.After(verified) { return abstain, reasonValidity } diff --git a/tortoise/tortoise_test.go b/tortoise/tortoise_test.go index 8a4d4810a7..0438438f80 100644 --- a/tortoise/tortoise_test.go +++ b/tortoise/tortoise_test.go @@ -335,7 +335,7 @@ func tortoiseFromSimState(tb testing.TB, state sim.State, opts ...Opt) *recovery return &recoveryAdapter{ TB: tb, Tortoise: trtl, - db: state.DB, + db: state.DB.Executor, } } @@ -2112,13 +2112,13 @@ func TestFutureHeight(t *testing.T) { s.Next(sim.WithNumBlocks(1), sim.WithBlockTickHeights(slow+1)), ) tortoise.TallyVotes(context.Background(), - s.Next(sim.WithEmptyHareOutput(), sim.WithNumBlocks(0))) + s.Next(sim.WithoutHareOutput(), sim.WithNumBlocks(0))) // 3 is handpicked so that threshold will be crossed if bug wasn't fixed for i := 0; i < 3; i++ { tortoise.TallyVotes(context.Background(), s.Next(sim.WithNumBlocks(1))) } - require.Equal(t, types.GetEffectiveGenesis(), tortoise.LatestComplete()) + require.Equal(t, types.GetEffectiveGenesis().String(), tortoise.LatestComplete().String()) }) t.Run("median above slow smeshers", func(t *testing.T) { s := sim.New( diff --git a/tortoise/tracer.go b/tortoise/tracer.go index 361ab13111..06fa2bbae6 100644 --- a/tortoise/tracer.go +++ b/tortoise/tracer.go @@ -122,6 +122,7 @@ const ( traceUpdates traceApplied traceMalfeasance + traceRecoveredBlocks ) type traceEvent interface { @@ -408,7 +409,6 @@ func (a *AppliedTrace) Run(r *traceRunner) error { type BlockTrace struct { Header types.BlockHeader `json:",inline"` - Valid bool `json:"v"` } func (b *BlockTrace) Type() eventType { @@ -420,11 +420,7 @@ func (b *BlockTrace) New() traceEvent { } func (b *BlockTrace) Run(r *traceRunner) error { - if b.Valid { - r.trt.OnValidBlock(b.Header) - } else { - r.trt.OnBlock(b.Header) - } + r.trt.OnBlock(b.Header) return nil } @@ -445,6 +441,46 @@ func (m *MalfeasanceTrace) Run(r *traceRunner) error { return nil } +type headerWithValidity struct { + Header types.BlockHeader `json:"header"` + Valid bool `json:"valid"` +} + +func newRecoveredBlocksTrace( + layer types.LayerID, + blocks map[types.BlockHeader]bool, + hare *types.BlockID, +) *RecoveredBlocksTrace { + rst := make([]headerWithValidity, 0, 
len(blocks)) + for header, validity := range blocks { + rst = append(rst, headerWithValidity{header, validity}) + } + return &RecoveredBlocksTrace{Layer: layer, Blocks: rst, Hare: hare} +} + +type RecoveredBlocksTrace struct { + Layer types.LayerID `json:"layer"` + Blocks []headerWithValidity `json:"blocks"` + Hare *types.BlockID `json:"hare"` +} + +func (r *RecoveredBlocksTrace) Type() eventType { + return traceRecoveredBlocks +} + +func (r *RecoveredBlocksTrace) New() traceEvent { + return &RecoveredBlocksTrace{} +} + +func (r *RecoveredBlocksTrace) Run(tr *traceRunner) error { + validity := map[types.BlockHeader]bool{} + for _, block := range r.Blocks { + validity[block.Header] = block.Valid + } + tr.trt.OnRecoveredBlocks(r.Layer, validity, r.Hare) + return nil +} + func assertErrors(err error, expect string) error { msg := "" if err != nil { @@ -472,6 +508,7 @@ func newEventEnum() eventEnum { enum.Register(&UpdatesTrace{}) enum.Register(&AppliedTrace{}) enum.Register(&MalfeasanceTrace{}) + enum.Register(&RecoveredBlocksTrace{}) return enum } diff --git a/tortoise/tracer_test.go b/tortoise/tracer_test.go index a6803f6a59..627a06c284 100644 --- a/tortoise/tracer_test.go +++ b/tortoise/tracer_test.go @@ -45,7 +45,7 @@ func TestTracer(t *testing.T) { path := filepath.Join(t.TempDir(), "tortoise.trace") trt, err := Recover( context.Background(), - s.GetState(0).DB, + s.GetState(0).DB.Executor, last, WithTracer(WithOutput(path)), ) diff --git a/tortoise/util.go b/tortoise/util.go index 0e8487e5f6..c04631455c 100644 --- a/tortoise/util.go +++ b/tortoise/util.go @@ -96,6 +96,7 @@ func zapBlocks(blocks []*blockInfo) zap.Field { for i := range blocks { encoder.AppendObject(zapcore.ObjectMarshalerFunc(func(encoder zapcore.ObjectEncoder) error { encoder.AddString("decision", blocks[i].validity.String()) + encoder.AddString("hare", blocks[i].hare.String()) encoder.AddString("id", blocks[i].id.String()) encoder.AddString("weight", blocks[i].margin.String()) encoder.AddUint64("height", blocks[i].height)