Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 2404: mv WrongMigratedDeviceOwnership critical event to AddMigration Function #2955

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 42 additions & 13 deletions cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,8 +609,19 @@ void TDiskRegistryState::ProcessCheckpoints()
void TDiskRegistryState::AddMigration(
const TDiskState& disk,
const TString& diskId,
const TString& sourceDeviceId)
const TString& sourceDeviceId,
bool needToReportInvalidMigration)
{
if (needToReportInvalidMigration && !FindPtr(disk.Devices, sourceDeviceId))
{
ReportDiskRegistryWrongMigratedDeviceOwnership(Sprintf(
"%s: device[DeviceUUID = %s] not found in disk[DiskId = %s]",
"AddMigration",
sourceDeviceId.c_str(),
diskId.c_str()));
return;
}

if (disk.CheckpointReplica.GetCheckpointId()) {
// Don't start migrations for shadow disks.
return;
Expand Down Expand Up @@ -667,7 +678,12 @@ void TDiskRegistryState::FillMigrations()
}

if (device->GetState() == NProto::DEVICE_STATE_WARNING) {
AddMigration(disk, diskId, uuid);
AddMigration(
disk,
diskId,
uuid,
false // needToReportInvalidMigration
);

continue;
}
Expand All @@ -677,7 +693,12 @@ void TDiskRegistryState::FillMigrations()
}

if (agent->GetState() == NProto::AGENT_STATE_WARNING) {
AddMigration(disk, diskId, uuid);
AddMigration(
disk,
diskId,
uuid,
false // needToReportInvalidMigration
);
}
}
}
Expand Down Expand Up @@ -5023,15 +5044,13 @@ void TDiskRegistryState::ApplyAgentStateChange(
}

if (agent.GetState() == NProto::AGENT_STATE_WARNING) {
if (Find(disk.Devices, deviceId) == disk.Devices.end()) {
ReportDiskRegistryWrongMigratedDeviceOwnership(
TStringBuilder() << "ApplyAgentStateChange: device "
<< deviceId << " not found");
continue;
}

if (MigrationCanBeStarted(disk, deviceId)) {
AddMigration(disk, diskId, deviceId);
Comment on lines 5046 to -5034
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Не надо AddMigration менять. Достаточно перенести проверку MigrationCanBeStarted перед Find. И написать тест, который: создает диск, вызывает REMOVE_HOST (запускает миграцию), завершает миграцию, снова вызывает REMOVE_HOST - тут проверяем, что после фикса крит не тригерится

Copy link
Contributor Author

@vladstepanyuk vladstepanyuk Jan 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я перенес Крит ивент в AddMigration, чтобы написать тест, который проверяет, что, если вдруг мы попытаемся запустить миграцию девайса не принадлежащую этому диску, мы зарепортим это событие. Что при попытке повторной миграции не будет Крит ивента, проверяется в тесте ShouldNotDuplicateMigrations.

NMonitoring::TDynamicCountersPtr counters =
new NMonitoring::TDynamicCounters();
InitCriticalEventsCounter(counters);
auto configCounter = counters->GetCounter(
"AppCriticalEvents/DiskRegistryWrongMigratedDeviceOwnership",
true);
UNIT_ASSERT_VALUES_EQUAL(0, configCounter->Val());
executor.WriteTx([&] (TDiskRegistryDatabase db) mutable {
auto affectedDisks = ChangeAgentState(
state,
db,
agents[0],
NProto::AGENT_STATE_WARNING);
UNIT_ASSERT_VALUES_EQUAL(0, affectedDisks.size());
});
{
for (const auto& m: state.BuildMigrationList()) {
Cerr << "migration: " << m.DiskId << " " << m.SourceDeviceId << Endl;
}
}
UNIT_ASSERT(state.IsMigrationListEmpty());
UNIT_ASSERT_VALUES_EQUAL(0, configCounter->Val());

AddMigration(
disk,
diskId,
deviceId,
true // needToReportInvalidMigration
);
}
} else {
if (agent.GetState() == NProto::AGENT_STATE_UNAVAILABLE
Expand Down Expand Up @@ -6017,7 +6036,12 @@ void TDiskRegistryState::ApplyDeviceStateChange(
}

if (MigrationCanBeStarted(*disk, uuid)) {
AddMigration(*disk, diskId, uuid);
AddMigration(
*disk,
diskId,
uuid,
true // needToReportInvalidMigration
);
}
}

Expand All @@ -6038,7 +6062,12 @@ bool TDiskRegistryState::RestartDeviceMigration(

CancelDeviceMigration(now, db, diskId, disk, sourceId);

AddMigration(disk, diskId, sourceId);
AddMigration(
disk,
diskId,
sourceId,
false // needToReportInvalidMigration
);

return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,19 @@ struct TPlacementGroupInfo
}
};

////////////////////////////////////////////////////////////////////////////////
namespace NTestSuiteTDiskRegistryStateMigrationTest {

struct TTestCaseShouldReportMigrationsWithWrongDeviceOwnership;

} // namespace NTestSuiteTDiskRegistryStateMigrationTest

////////////////////////////////////////////////////////////////////////////////

class TDiskRegistryState
{
friend struct NTestSuiteTDiskRegistryStateMigrationTest::
TTestCaseShouldReportMigrationsWithWrongDeviceOwnership;
using TAgentId = TString;
using TDeviceId = TString;
using TDiskId = TString;
Expand Down Expand Up @@ -882,7 +891,8 @@ class TDiskRegistryState
void AddMigration(
const TDiskState& disk,
const TString& diskId,
const TString& sourceDeviceId);
const TString& sourceDeviceId,
bool needToReportInvalidMigration);
void FillMigrations();

const TDiskState* FindDiskState(const TDiskId& diskId) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -388,9 +388,59 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateMigrationTest)
}

UNIT_ASSERT(state.IsMigrationListEmpty());
// Now bug is fixed, but, if it reproduce in future, we must report
// event.
UNIT_ASSERT_VALUES_EQUAL(0, configCounter->Val());
}

Y_UNIT_TEST(ShouldReportMigrationsWithWrongDeviceOwnership)
{
TTestExecutor executor;
executor.WriteTx([&](TDiskRegistryDatabase db) { db.InitSchema(); });

const TVector agents{
AgentConfig(1, {Device("dev-1", "uuid-1.1", "rack-1")}),
AgentConfig(2, {Device("dev-1", "uuid-2.1", "rack-1")}),
AgentConfig(3, {Device("dev-1", "uuid-3.1", "rack-1")}),
};

TDiskRegistryState state = TDiskRegistryStateBuilder()
.WithKnownAgents(agents)
.WithDisks({Disk("foo", {"uuid-1.1"})})
.Build();

NMonitoring::TDynamicCountersPtr counters =
new NMonitoring::TDynamicCounters();
InitCriticalEventsCounter(counters);
auto configCounter = counters->GetCounter(
"AppCriticalEvents/DiskRegistryWrongMigratedDeviceOwnership",
true);
UNIT_ASSERT_VALUES_EQUAL(0, configCounter->Val());

state.AddMigration(
state.Disks["foo"],
"foo",
"wrong-uuid",
true // needToReportInvalidMigration
);
UNIT_ASSERT(state.IsMigrationListEmpty());

UNIT_ASSERT_VALUES_EQUAL(1, configCounter->Val());

executor.WriteTx(
[&](TDiskRegistryDatabase db) mutable
{
bool stateUpdated = false;
auto err = state.FinishDeviceMigration(
db,
"foo",
"wrong-uuid-0",
"wrong-uuid-1",
Now(),
&stateUpdated);

UNIT_ASSERT(!stateUpdated);
UNIT_ASSERT(HasError(err));
});
UNIT_ASSERT_VALUES_EQUAL(2, configCounter->Val());
}

Y_UNIT_TEST(ShouldEraseMigrationsForDeletedDisk)
Expand Down
Loading