fix(resharding): fix Chain::get_shards_to_state_sync() #12617
base: master
Changes from 1 commit
chain/chain/src/chain.rs

```diff
@@ -793,6 +793,7 @@ impl Chain {
     fn get_state_sync_info(
         &self,
         me: &Option<AccountId>,
+        prev_prev_hash: &CryptoHash,
         epoch_first_block: &Block,
     ) -> Result<Option<StateSyncInfo>, Error> {
         let prev_hash = *epoch_first_block.header().prev_hash();
@@ -801,6 +802,7 @@ impl Chain {
             &self.shard_tracker,
             me,
             &prev_hash,
+            prev_prev_hash,
         )?;
         if shards_to_state_sync.is_empty() {
             Ok(None)
@@ -2392,7 +2394,7 @@ impl Chain {
             // For the first block of the epoch we check if we need to start download states for
             // shards that we will care about in the next epoch. If there is no state to be downloaded,
             // we consider that we are caught up, otherwise not
-            let state_sync_info = self.get_state_sync_info(me, block)?;
+            let state_sync_info = self.get_state_sync_info(me, &prev_prev_hash, block)?;
             Ok((state_sync_info.is_none(), state_sync_info))
         } else {
             Ok((self.prev_block_is_caught_up(&prev_prev_hash, &prev_hash)?, None))
```
```diff
@@ -2424,56 +2426,69 @@ impl Chain {
         shard_tracker: &ShardTracker,
         me: &Option<AccountId>,
         parent_hash: &CryptoHash,
+        prev_prev_hash: &CryptoHash,
     ) -> Result<Vec<ShardId>, Error> {
         let epoch_id = epoch_manager.get_epoch_id_from_prev_block(parent_hash)?;
-        Ok((epoch_manager.shard_ids(&epoch_id)?)
-            .into_iter()
-            .filter(|shard_id| {
-                Self::should_catch_up_shard(
-                    epoch_manager,
-                    shard_tracker,
-                    me,
-                    parent_hash,
-                    *shard_id,
-                )
-            })
-            .collect())
+        let mut shards_to_sync = Vec::new();
+        for shard_id in epoch_manager.shard_ids(&epoch_id)? {
+            if Self::should_catch_up_shard(
+                epoch_manager,
+                shard_tracker,
+                me,
+                &epoch_id,
+                parent_hash,
+                prev_prev_hash,
+                shard_id,
+            )? {
+                shards_to_sync.push(shard_id)
+            }
+        }
+        Ok(shards_to_sync)
     }
 
     fn should_catch_up_shard(
         epoch_manager: &dyn EpochManagerAdapter,
         shard_tracker: &ShardTracker,
         me: &Option<AccountId>,
+        epoch_id: &EpochId,
         parent_hash: &CryptoHash,
+        prev_prev_hash: &CryptoHash,
         shard_id: ShardId,
-    ) -> bool {
-        let result = epoch_manager.will_shard_layout_change(parent_hash);
-        let will_shard_layout_change = match result {
-            Ok(_will_shard_layout_change) => {
-                // TODO(#11881): before state sync is fixed, we don't catch up
-                // split shards. Assume that all needed shards are tracked
-                // already.
-                // will_shard_layout_change,
-                false
-            }
-            Err(err) => {
-                // TODO(resharding) This is a problem, if this happens the node
-                // will not perform resharding and fall behind the network.
-                tracing::error!(target: "chain", ?err, "failed to check if shard layout will change");
-                false
-            }
-        };
-        // if shard layout will change the next epoch, we should catch up the shard regardless
-        // whether we already have the shard's state this epoch, because we need to generate
-        // new states for shards split from the current shard for the next epoch
-        let will_care_about_shard =
-            shard_tracker.will_care_about_shard(me.as_ref(), parent_hash, shard_id, true);
-        let does_care_about_shard =
-            shard_tracker.care_about_shard(me.as_ref(), parent_hash, shard_id, true);
+    ) -> Result<bool, Error> {
```
I agree that the old condition is no longer relevant, but the logic here seems too complicated. In V2 we needed to get a second copy of the parent, even if the node tracks it, to perform the preprocessing. In V3 this should not be needed, so it feels like the following should be sufficient:

Please keep in mind that …
So, relative to the current PR, something like:

```diff
diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs
index 931f711f8..ac6b3d650 100644
--- a/chain/chain/src/chain.rs
+++ b/chain/chain/src/chain.rs
@@ -2465,32 +2465,7 @@ impl Chain {
         if shard_tracker.care_about_shard(me.as_ref(), parent_hash, shard_id, true) {
             return Ok(false);
         }
-        // Now we need to state sync it unless this is a post-resharding child shard whose parent we were tracking in the
-        // previous epoch, in which case we don't need to state sync because we'll generate the child when we do the resharding
-
-        let shard_layout = epoch_manager.get_shard_layout(epoch_id)?;
-        let prev_epoch_id = epoch_manager.get_prev_epoch_id_from_prev_block(parent_hash)?;
-        let prev_shard_layout = epoch_manager.get_shard_layout(&prev_epoch_id)?;
-
-        let resharded = shard_layout != prev_shard_layout;
-        if !resharded {
-            return Ok(true);
-        }
-        let Some(parent_shard_id) = shard_layout.try_get_parent_shard_id(shard_id)? else {
-            return Ok(true);
-        };
-        let was_split = parent_shard_id != shard_id;
-        if !was_split {
-            return Ok(true);
-        }
-
-        // Note that here passing `prev_prev_hash` to care_about_shard() will have us check whether we were tracking it in
-        // the previous epoch, because the current block is the first block of an epoch, so prev_prev_hash is the "parent_hash"
-        // of the last block of the previous epoch. TODO: consider refactoring these ShardTracker functions to accept an epoch_id
-        // to make this less tricky.
-        let splitting_child =
-            shard_tracker.care_about_shard(me.as_ref(), prev_prev_hash, parent_shard_id, true);
-        Ok(!splitting_child)
+        Ok(true)
     }
 
     /// Check if any block with missing chunk is ready to be processed and start processing these blocks
```
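For readability, applying that suggestion on top of this PR would leave `should_catch_up_shard()` looking roughly like the sketch below (the `epoch_manager`, `epoch_id`, and `prev_prev_hash` parameters become unused and would presumably be dropped in a follow-up):

```rust
fn should_catch_up_shard(
    epoch_manager: &dyn EpochManagerAdapter,
    shard_tracker: &ShardTracker,
    me: &Option<AccountId>,
    epoch_id: &EpochId,
    parent_hash: &CryptoHash,
    prev_prev_hash: &CryptoHash,
    shard_id: ShardId,
) -> Result<bool, Error> {
    // Won't care about it next epoch, no need to state sync it.
    if !shard_tracker.will_care_about_shard(me.as_ref(), parent_hash, shard_id, true) {
        return Ok(false);
    }
    // Currently tracking the shard, so no need to state sync it.
    if shard_tracker.care_about_shard(me.as_ref(), parent_hash, shard_id, true) {
        return Ok(false);
    }
    Ok(true)
}
```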
Now that the other PR is merged, we can try the modification to `test_resharding_v3_shard_shuffling_slower_post_processing_tasks`:

```diff
diff --git a/integration-tests/src/test_loop/tests/resharding_v3.rs b/integration-tests/src/test_loop/tests/resharding_v3.rs
index c04d89ba6..457db7723 100644
--- a/integration-tests/src/test_loop/tests/resharding_v3.rs
+++ b/integration-tests/src/test_loop/tests/resharding_v3.rs
@@ -985,6 +985,7 @@ fn test_resharding_v3_shard_shuffling_slower_post_processing_tasks() {
         .track_all_shards(false)
         .all_chunks_expected(false)
         .delay_flat_state_resharding(2)
+        .epoch_length(10)
         .build();
     test_resharding_v3_base(params);
 }
```
Now unfortunately this passes because something about the order of the test loop task processing for state sync and resharding catchup has changed between when I wrote that and the current head of master. Without trying to figure it out, we can just hack it to run in the same order as before:

```diff
diff --git a/chain/chain/src/flat_storage_resharder.rs b/chain/chain/src/flat_storage_resharder.rs
index 83bd481f3..a401c51ce 100644
--- a/chain/chain/src/flat_storage_resharder.rs
+++ b/chain/chain/src/flat_storage_resharder.rs
@@ -609,6 +609,16 @@ impl FlatStorageResharder {
         if self.controller.is_cancelled() {
             return FlatStorageReshardingTaskResult::Cancelled;
         }
+        let head = chain_store.head().unwrap();
+        let Some(sync_hash) = chain_store.get_current_epoch_sync_hash(&head.epoch_id).unwrap() else {
+            return FlatStorageReshardingTaskResult::Postponed;
+        };
+        let key = borsh::to_vec(&near_primitives::state_sync::StatePartKey(sync_hash, shard_uid.shard_id(), 0)).unwrap();
+        let part = chain_store.store().get(near_store::DBCol::StateParts, &key).unwrap();
+        if !part.is_some() {
+            return FlatStorageReshardingTaskResult::Postponed;
+        }
+
         info!(target: "resharding", ?shard_uid, "flat storage shard catchup task started");
         let metrics = FlatStorageReshardingShardCatchUpMetrics::new(&shard_uid);
         // Apply deltas and then create the flat storage.
```
Then (with the above epoch length change), run it and you get:

with this error message at the end of the logs:

What's happening here is the problem with

Yeah it's a bit messy... maybe we should split something out of this?
```diff
+        // Won't care about it next epoch, no need to state sync it.
+        if !shard_tracker.will_care_about_shard(me.as_ref(), parent_hash, shard_id, true) {
+            return Ok(false);
+        }
+        // Currently tracking the shard, so no need to state sync it.
+        if shard_tracker.care_about_shard(me.as_ref(), parent_hash, shard_id, true) {
+            return Ok(false);
+        }
+        // Now we need to state sync it unless this is a post-resharding child shard whose parent we were tracking in the
+        // previous epoch, in which case we don't need to state sync because we'll generate the child when we do the resharding
+
-        tracing::debug!(target: "chain", does_care_about_shard, will_care_about_shard, will_shard_layout_change, "should catch up shard");
+        let shard_layout = epoch_manager.get_shard_layout(epoch_id)?;
+        let prev_epoch_id = epoch_manager.get_prev_epoch_id_from_prev_block(parent_hash)?;
+        let prev_shard_layout = epoch_manager.get_shard_layout(&prev_epoch_id)?;
+
+        let resharded = shard_layout != prev_shard_layout;
+        if !resharded {
+            return Ok(true);
+        }
+        let Some(parent_shard_id) = shard_layout.try_get_parent_shard_id(shard_id)? else {
+            return Ok(true);
+        };
+        let was_split = parent_shard_id != shard_id;
+        if !was_split {
+            return Ok(true);
+        }
+
-        will_care_about_shard && (will_shard_layout_change || !does_care_about_shard)
+        // Note that here passing `prev_prev_hash` to care_about_shard() will have us check whether we were tracking it in
+        // the previous epoch, because the current block is the first block of an epoch, so prev_prev_hash is the "parent_hash"
+        // of the last block of the previous epoch. TODO: consider refactoring these ShardTracker functions to accept an epoch_id
+        // to make this less tricky.
+        let splitting_child =
+            shard_tracker.care_about_shard(me.as_ref(), prev_prev_hash, parent_shard_id, true);
+        Ok(!splitting_child)
     }
 
     /// Check if any block with missing chunk is ready to be processed and start processing these blocks
```
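To make the branching above easier to scan, here is a small self-contained model of the same decision with the chain and epoch lookups replaced by plain booleans; `tracked_parent_last_epoch` stands in for the `care_about_shard(me, prev_prev_hash, parent_shard_id, true)` check at the end:

```rust
// Toy model of should_catch_up_shard(), not nearcore code: same branch
// structure, but every lookup is reduced to a boolean input.
fn needs_state_sync(
    will_care_next_epoch: bool,      // will_care_about_shard(..., parent_hash, ...)
    cares_this_epoch: bool,          // care_about_shard(..., parent_hash, ...)
    shard_was_split: bool,           // resharded && shard has a parent && parent != shard
    tracked_parent_last_epoch: bool, // care_about_shard(..., prev_prev_hash, parent, ...)
) -> bool {
    if !will_care_next_epoch {
        return false; // we won't need this shard next epoch at all
    }
    if cares_this_epoch {
        return false; // we already have its state
    }
    if shard_was_split && tracked_parent_last_epoch {
        return false; // resharding will generate the child from the parent we track
    }
    true // otherwise the state has to come from state sync
}

fn main() {
    // A freshly split child whose parent we tracked last epoch: no state sync needed.
    assert!(!needs_state_sync(true, false, true, true));
    // Same child, but we never tracked the parent: it must be state synced.
    assert!(needs_state_sync(true, false, true, false));
}
```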
mini nit: I slightly dislike mixing parameter types (hash & block); would it make any sense to pass hash, prev_hash, and prev_prev_hash as separate arguments?
Looking below, the other signature is:

And maybe specify in the comment that the header is the epoch first block's header.
ya agreed, just changed them all to hashes and epoch IDs
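For illustration only, that cleanup would give `get_state_sync_info()` a signature along these lines; the exact parameter set in the final revision of the PR may differ:

```rust
// Illustrative sketch of the "hashes and epoch IDs only" signature discussed
// above; parameter names here are assumptions, not the final code.
fn get_state_sync_info(
    &self,
    me: &Option<AccountId>,
    epoch_id: &EpochId,
    epoch_first_block: &CryptoHash,
    prev_hash: &CryptoHash,
    prev_prev_hash: &CryptoHash,
) -> Result<Option<StateSyncInfo>, Error>
```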