Skip to content

Commit

Permalink
Allow restarting broken disk in broken group (#13185)
Browse files: browse the repository at this point in the history
(cherry picked from commit 6b6fafa)
  • Loading branch information
SammyVimes authored Jan 7, 2025
1 parent 56e8b11 commit 0927051
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 1 deletion.
8 changes: 7 additions & 1 deletion ydb/core/blobstorage/ut_blobstorage/lib/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -824,8 +824,14 @@ struct TEnvironmentSetup {
}

void UpdateDriveStatus(ui32 nodeId, ui32 pdiskId, NKikimrBlobStorage::EDriveStatus status,
NKikimrBlobStorage::EDecommitStatus decommitStatus) {
NKikimrBlobStorage::EDecommitStatus decommitStatus, bool force = false) {
NKikimrBlobStorage::TConfigRequest request;
if (force) {
request.SetIgnoreGroupFailModelChecks(true);
request.SetIgnoreDegradedGroupsChecks(true);
request.SetIgnoreDisintegratedGroupsChecks(true);
request.SetIgnoreGroupSanityChecks(true);
}
auto *cmd = request.AddCommand();
auto *ds = cmd->MutableUpdateDriveStatus();
ds->MutableHostKey()->SetNodeId(nodeId);
Expand Down
107 changes: 107 additions & 0 deletions ydb/core/blobstorage/ut_blobstorage/restart_pdisk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,113 @@ Y_UNIT_TEST_SUITE(BSCRestartPDisk) {
}
}

// Returns one entry per VSlot of the base config that belongs to the first
// group listed in that config. Every entry holds the node id, PDisk id and
// VSlot id read from the slot's VSlotId, together with a TVDiskID assembled
// from the group's id and generation and the slot's fail realm, fail domain
// and VDisk indices.
auto GetGroupVDisks(TEnvironmentSetup& env) {
    struct TVDisk {
        ui32 NodeId;
        ui32 PDiskId;
        ui32 VSlotId;
        TVDiskID VDiskId;
    };

    auto baseConfig = env.FetchBaseConfig();
    auto& firstGroup = baseConfig.get_idx_group(0);

    std::vector<TVDisk> result;

    for (auto& slot : baseConfig.GetVSlot()) {
        if (slot.GetGroupId() != firstGroup.GetGroupId()) {
            continue; // this slot is not part of the inspected group
        }

        auto location = slot.GetVSlotId();
        result.push_back({
            location.GetNodeId(),
            location.GetPDiskId(),
            location.GetVSlotId(),
            TVDiskID(firstGroup.GetGroupId(), firstGroup.GetGroupGeneration(),
                slot.GetFailRealmIdx(), slot.GetFailDomainIdx(), slot.GetVDiskIdx())
        });
    }

    return result;
}

// Scenario: an 8-node, 4+2 block-erasure environment with one box and pool.
// EvBecomeError is sent to every PDisk actor, and then a RestartPDisk command
// is issued for the PDisk that hosts the first VDisk of the group. The test
// expects the controller to accept the request.
Y_UNIT_TEST(RestartBrokenDiskInBrokenGroup) {
    TEnvironmentSetup env({
        .NodeCount = 8,
        .Erasure = TBlobStorageGroupType::Erasure4Plus2Block
    });

    env.UpdateSettings(false, false);
    env.CreateBoxAndPool(1, 1);
    env.Sim(TDuration::Seconds(30));

    const auto vdisks = GetGroupVDisks(env);
    // Fail with a readable message rather than reading past an empty vector below.
    UNIT_ASSERT_C(!vdisks.empty(), "Expected the group to contain at least one VDisk");

    // Making all vdisks bad, group is disintegrated
    const TActorId sender = env.Runtime->AllocateEdgeActor(env.Settings.ControllerNodeId, __FILE__, __LINE__);
    for (auto& pdisk : env.PDiskActors) {
        env.Runtime->WrapInActorContext(sender, [&] () {
            env.Runtime->Send(new IEventHandle(EvBecomeError, 0, pdisk, sender, nullptr, 0));
        });
    }

    env.Sim(TDuration::Minutes(1));

    // Restarting the owner of an already broken disk in a broken group must be allowed
    const auto& target = vdisks.front();

    NKikimrBlobStorage::TConfigRequest request;

    NKikimrBlobStorage::TRestartPDisk* cmd = request.AddCommand()->MutableRestartPDisk();
    auto pdiskId = cmd->MutableTargetPDiskId();
    pdiskId->SetNodeId(target.NodeId);
    pdiskId->SetPDiskId(target.PDiskId);

    auto response = env.Invoke(request);
    UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
}

// Scenario: an 8-node, 4+2 block-erasure environment with one box and pool.
// EvBecomeError is sent to every PDisk actor except the last one in
// env.PDiskActors. A RestartPDisk command is then issued for a PDisk whose
// mock state reports no error reason. The test expects the request to be
// rejected with an error description mentioning "Disintegrated".
Y_UNIT_TEST(RestartGoodDiskInBrokenGroupNotAllowed) {
    TEnvironmentSetup env({
        .NodeCount = 8,
        .Erasure = TBlobStorageGroupType::Erasure4Plus2Block
    });

    env.UpdateSettings(false, false);
    env.CreateBoxAndPool(1, 1);
    env.Sim(TDuration::Seconds(30));

    // `size() - 1` below is unsigned arithmetic; an empty container would wrap around.
    UNIT_ASSERT_C(env.PDiskActors.size() > 0, "Expected at least one PDisk actor");

    // Making all but one vdisks bad, group is disintegrated
    const TActorId sender = env.Runtime->AllocateEdgeActor(env.Settings.ControllerNodeId, __FILE__, __LINE__);
    for (size_t i = 0; i < env.PDiskActors.size() - 1; i++) {
        env.Runtime->WrapInActorContext(sender, [&] () {
            env.Runtime->Send(new IEventHandle(EvBecomeError, 0, env.PDiskActors[i], sender, nullptr, 0));
        });
    }

    env.Sim(TDuration::Minutes(1));

    ui32 targetNodeId = 0;
    ui32 targetPDiskId = 0;
    bool healthyDiskFound = false;

    // Pick a PDisk whose mock state carries no error reason (the last match wins).
    for (auto& [k, v] : env.PDiskMockStates) {
        if (v.Get()->GetStateErrorReason().Empty()) {
            targetNodeId = k.first;
            targetPDiskId = k.second;
            healthyDiskFound = true;
        }
    }

    // Without a healthy target the request below would address PDisk (0, 0)
    // and the test would fail with a misleading message.
    UNIT_ASSERT_C(healthyDiskFound, "Expected one PDisk to remain healthy");

    // However restarting the owner of a single good disk must be prohibited
    NKikimrBlobStorage::TConfigRequest request;

    NKikimrBlobStorage::TRestartPDisk* cmd = request.AddCommand()->MutableRestartPDisk();
    auto pdiskId = cmd->MutableTargetPDiskId();
    pdiskId->SetNodeId(targetNodeId);
    pdiskId->SetPDiskId(targetPDiskId);

    auto response = env.Invoke(request);

    UNIT_ASSERT_C(!response.GetSuccess(), "Restart should've been prohibited");
    UNIT_ASSERT_STRING_CONTAINS(response.GetErrorDescription(), "Disintegrated");
}

Y_UNIT_TEST(RestartOneByOne) {
TEnvironmentSetup env({
.NodeCount = 10,
Expand Down
20 changes: 20 additions & 0 deletions ydb/core/mind/bscontroller/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,27 @@ namespace NKikimr::NBsController {

// check that group modification would not degrade failure model
if (!suppressFailModelChecking) {
THashSet<TGroupId> groupsToCheck;
for (auto&& [base, overlay] : state.VSlots.Diff()) {
if (base && base->second->Group) {
if (!overlay->second || !overlay->second->Group) {
// Disk moved or became inactive
groupsToCheck.emplace(base->second->GroupId);
} else {
const NKikimrBlobStorage::EVDiskStatus prevStatus = base->second->GetStatus();
const NKikimrBlobStorage::EVDiskStatus curStatus = overlay->second->GetStatus();

if (prevStatus != NKikimrBlobStorage::EVDiskStatus::ERROR && curStatus == NKikimrBlobStorage::EVDiskStatus::ERROR) {
// VDisk's status has changed to ERROR
groupsToCheck.emplace(overlay->second->GroupId);
}
}
}
}
for (TGroupId groupId : state.GroupFailureModelChanged) {
if (!groupsToCheck.contains(groupId)) {
continue;
}
if (const TGroupInfo *group = state.Groups.Find(groupId); group && group->VDisksInGroup) {
// process only groups with changed content; create topology for group
auto& topology = *group->Topology;
Expand Down

0 comments on commit 0927051

Please sign in to comment.