Skip to content

Commit

Permalink
issue-2917: don't discard Disk Agent stats with unknown devices (#2919)
Browse files Browse the repository at this point in the history
  • Loading branch information
sharpeye authored Jan 27, 2025
1 parent 64358ed commit 77951e7
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3960,26 +3960,18 @@ NProto::TError TDiskRegistryState::UpdateAgentCounters(
for (const auto& device: stats.GetDeviceStats()) {
const auto& uuid = device.GetDeviceUUID();

const bool deviceIsUnknown = std::ranges::any_of(
agent->GetUnknownDevices(),
[&uuid](const NProto::TDeviceConfig& device)
{ return device.GetDeviceUUID() == uuid; });
if (deviceIsUnknown) {
return MakeError(
S_FALSE,
TStringBuilder() << "Device: \"" << uuid << "\" is unknown");
const NProto::TDeviceConfig* knownDevice = DeviceList.FindDevice(uuid);
if (!knownDevice) {
continue;
}

const NProto::TDeviceConfig* knownDevice = DeviceList.FindDevice(uuid);
if (!knownDevice || stats.GetNodeId() != knownDevice->GetNodeId()) {
if (stats.GetNodeId() != knownDevice->GetNodeId()) {
return MakeError(
E_ARGUMENT,
TStringBuilder()
<< "Unexpected device. DeviceId: \"" << uuid
<< "\" Sender node id: " << stats.GetNodeId()
<< " Found node id: "
<< (knownDevice ? ToString(knownDevice->GetNodeId())
: "null"));
<< "Unexpected device. DeviceId: " << uuid.Quote()
<< " Sender node id: " << stats.GetNodeId()
<< " Found node id: " << knownDevice->GetNodeId());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ class TDiskRegistryState
}

TVector<TString> CollectBrokenDevices(const NProto::TAgentStats& stats) const;
NProto::TError UpdateAgentCounters(const NProto::TAgentStats& source);
NProto::TError UpdateAgentCounters(const NProto::TAgentStats& stats);
void PublishCounters(TInstant now);

void DeleteDiskStateChanges(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1852,10 +1852,11 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)
});

auto config1 = AgentConfig(1000, {
Device("dev-1", "uuid-1", "rack-1"),
Device("dev-1", "uuid-1.1"),
});
auto config2 = AgentConfig(1001, {
Device("dev-2", "uuid-2", "rack-2"),
Device("dev-1", "uuid-2.1"),
Device("dev-3", "uuid-2.3"),
});

auto monitoring = CreateMonitoringServiceStub();
Expand All @@ -1865,9 +1866,16 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)

TDiskRegistryState state = TDiskRegistryStateBuilder()
.With(diskRegistryGroup)
.WithConfig({ config1 })
.WithConfig({ config1, config2 })
.Build();

config2 = AgentConfig(1001, {
Device("dev-1", "uuid-2.1"),
// add an unknown device to #1001
Device("dev-2", "uuid-2.2"),
Device("dev-3", "uuid-2.3"),
});

executor.WriteTx([&] (TDiskRegistryDatabase db) {
UNIT_ASSERT_SUCCESS(RegisterAgent(state, db, config1));
UNIT_ASSERT_SUCCESS(RegisterAgent(state, db, config2));
Expand All @@ -1877,9 +1885,9 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)

{
NProto::TAgentStats stats;
stats.SetNodeId(1000);
stats.SetNodeId(1001);
auto* d = stats.AddDeviceStats();
d->SetDeviceUUID("garbage");
d->SetDeviceUUID("uuid-1.1"); // uuid-1.1 belongs to agent-1000

auto error = state.UpdateAgentCounters(stats);
UNIT_ASSERT_VALUES_EQUAL(E_ARGUMENT, error.GetCode());
Expand All @@ -1888,14 +1896,78 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)
{
NProto::TAgentStats stats;
stats.SetNodeId(1001);
auto* d = stats.AddDeviceStats();
d->SetDeviceUUID("uuid-2");

{
auto* d = stats.AddDeviceStats();
d->SetDeviceUUID("uuid-2.1");
d->SetDeviceName("dev-1");
d->SetBytesRead(4_KB);
d->SetNumReadOps(1);
}

{
auto* d = stats.AddDeviceStats();
d->SetDeviceUUID("uuid-2.2");
d->SetDeviceName("dev-2");
d->SetBytesRead(8_KB);
d->SetNumReadOps(2);
}

{
auto* d = stats.AddDeviceStats();
d->SetDeviceUUID("uuid-2.3");
d->SetDeviceName("dev-3");
d->SetBytesRead(12_KB);
d->SetNumReadOps(3);
}

auto error = state.UpdateAgentCounters(stats);
UNIT_ASSERT_VALUES_EQUAL(S_FALSE, error.GetCode());
UNIT_ASSERT_VALUES_EQUAL(S_OK, error.GetCode());
}

state.PublishCounters(Now());

UNIT_ASSERT(diskRegistryGroup->FindSubgroup("agent", "agent-1000"));

auto counters = diskRegistryGroup->FindSubgroup("agent", "agent-1001");
UNIT_ASSERT(counters);

auto totalReadCount = counters->FindCounter("ReadCount");
UNIT_ASSERT(totalReadCount);
UNIT_ASSERT_VALUES_EQUAL(4, totalReadCount->Val());

auto totalReadBytes = counters->FindCounter("ReadBytes");
UNIT_ASSERT(totalReadBytes);
UNIT_ASSERT_VALUES_EQUAL(16_KB, totalReadBytes->Val());

{
auto device = counters->FindSubgroup("device", "agent-1001:dev-1");
UNIT_ASSERT(device);

auto readCount = device->FindCounter("ReadCount");
UNIT_ASSERT(readCount);
UNIT_ASSERT_VALUES_EQUAL(1, readCount->Val());

auto readBytes = device->FindCounter("ReadBytes");
UNIT_ASSERT(readBytes);
UNIT_ASSERT_VALUES_EQUAL(4_KB, readBytes->Val());
}

// no metrics for the unknown device
UNIT_ASSERT(!counters->FindSubgroup("device", "agent-1001:dev-2"));

{
auto device = counters->FindSubgroup("device", "agent-1001:dev-3");
UNIT_ASSERT(device);

auto readCount = device->FindCounter("ReadCount");
UNIT_ASSERT(readCount);
UNIT_ASSERT_VALUES_EQUAL(3, readCount->Val());

auto readBytes = device->FindCounter("ReadBytes");
UNIT_ASSERT(readBytes);
UNIT_ASSERT_VALUES_EQUAL(12_KB, readBytes->Val());
}
}

Y_UNIT_TEST(ShouldRemoveAgentWithSameId)
Expand Down Expand Up @@ -12160,9 +12232,8 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)
device.SetId(leakedSuspendedDevice);
db.UpdateSuspendedDevice(device);
db.AddAutomaticallyReplacedDevice(
TAutomaticallyReplacedDeviceInfo{
leakedAutomaticallyReplacedDevice,
Now()});
{.DeviceId = leakedAutomaticallyReplacedDevice,
.ReplacementTs = Now()});
});

// Register agent.
Expand Down Expand Up @@ -12248,4 +12319,5 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)
});
}
}

} // namespace NCloud::NBlockStore::NStorage

0 comments on commit 77951e7

Please sign in to comment.