From 4fb0cc7665a1f55aacf13cac7fd0041e258115c2 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Thu, 26 Sep 2024 10:58:47 -0700 Subject: [PATCH 01/12] check buildteam consistently failed --- fdbclient/ServerKnobs.cpp | 1 + fdbclient/include/fdbclient/ServerKnobs.h | 3 + fdbserver/DDTeamCollection.actor.cpp | 252 +++++++++++++++++- .../include/fdbserver/DDTeamCollection.h | 12 + 4 files changed, 262 insertions(+), 6 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 27293f65967..d80017c9062 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -365,6 +365,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_LARGE_TEAM_DELAY, 60.0 ); init( DD_FIX_WRONG_REPLICAS_DELAY, 60.0 ); init (DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM, false ); if (isSimulated) DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM = true; + init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 10 * 60; // TeamRemover init( TR_LOW_SPACE_PIVOT_DELAY_SEC, 0 ); if (isSimulated) TR_LOW_SPACE_PIVOT_DELAY_SEC = deterministicRandom()->randomInt(0, 3); diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 0ec1e3c98a1..b773c992d6e 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -346,6 +346,9 @@ class ServerKnobs : public KnobsImpl { double DD_WAIT_TSS_DATA_MOVE_DELAY; bool DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM; // Enable to validate server team count per server after build // team + double DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR; // Trigger SevError when the time span of successive buildTeam + // failures exceeds the specified value + // Set to 0 to disable it // TeamRemover to remove redundant teams double TR_LOW_SPACE_PIVOT_DELAY_SEC; // teamRedundant data moves can make the min SS available % smaller in diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 89854947e6d..a5b9caa60e8 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -74,6 +74,22 @@ unsigned EligibilityCounter::getCount(int combinedType) const { } // namespace data_distribution +int getNChooseKLowerBound(int n, int k) { + if (k > n) + return 0; + if (k * 2 > n) + k = n - k; + if (k == 0) + return 1; + + int result = n; + for (int i = 2; i <= k; ++i) { + result = result * (n - i + 1); + result = result / i; + } + return result; +} + class DDTeamCollectionImpl { ACTOR static Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { state double start = now(); @@ -848,6 +864,10 @@ class DDTeamCollectionImpl { self->evaluateTeamQuality(); + if (SERVER_KNOBS->DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR > 0) { + self->traceBuildTeamFailedForLongTime(); + } + // Building teams can cause servers to become undesired, which can make teams unhealthy. // Let all of these changes get worked out before responding to the get team request wait(delay(0, TaskPriority::DataDistributionLaunch)); @@ -4474,6 +4494,123 @@ void DDTeamCollection::evaluateTeamQuality() const { .detail("MachineMaxTeams", maxMachineTeams); } +// Decide if the current layout of servers and machines is good enough to meet the target when building serverTeams and +// machineTeams. Sometimes, buildTeams is consistently failed because the layout is bad For example, when each machine +// team is smaller than targetTeamNumPerServer, then buildTeam always fails. This can be caused by: (1) each machine has +// too few SSes; (2) machine count is too small Complexity: scan the server_info list +bool DDTeamCollection::isServerMachineDCLayoutGood() const { + // The decision is made based on the tracked healthy SSes + // Collect data points from server_info and filter out unhealthy SSes + std::tuple tmp = getHealthyStorageServerCountPerMachine(); + int minServerCountPerMachine = std::get<0>(tmp); + int maxServerCountPerMachine = std::get<1>(tmp); + int machineCount = std::get<2>(tmp); + if (machineCount == 0) { + // Layout is bad if the number of machine is 0 + TraceEvent(SevWarnAlways, "BuildTeamZeroMachineFromServerList", distributorId) + .suppressFor(60.0) + .detail("Primary", primary); + return false; + } + if (maxServerCountPerMachine - minServerCountPerMachine > 1) { + // The unbalance count makes the condition harder to meet given the same amount of SSes. + // So, we make an alert here + TraceEvent(SevWarn, "BuildTeamUnbalanceSSPerMachineFromServerList", distributorId) + .suppressFor(60.0) + .detail("MaxServerCountPerMachine", maxServerCountPerMachine) + .detail("MinServerCountPerMachine", minServerCountPerMachine) + .detail("Primary", primary); + } + // Collect data points from server_info and filter out unhealthy SSes + // If a machine has no healthy SS, we do not count this machine + tmp = getMachineCountPerZone(); + int minMachineCountPerZone = std::get<0>(tmp); + int maxMachineCountPerZone = std::get<1>(tmp); + int zoneCount = std::get<2>(tmp); + if (zoneCount == 0) { + // Layout is bad if the number of zone is 0 + TraceEvent(SevWarnAlways, "BuildTeamZeroZoneFromServerList", distributorId) + .suppressFor(60.0) + .detail("Primary", primary); + return false; + } + if (maxMachineCountPerZone - minMachineCountPerZone > 1) { + // The unbalance count makes the condition harder to meet given the same amount of machines. + // So, we make an alert here + TraceEvent(SevWarn, "BuildTeamUnbalanceMachinePerDCFromServerList", distributorId) + .suppressFor(60.0) + .detail("MaxMachineCountPerZone", maxMachineCountPerZone) + .detail("MinMachineCountPerZone", minMachineCountPerZone) + .detail("ZoneCount", zoneCount) + .detail("Primary", primary); + } + int possibleMachineTeamsGivenAMachine = + getNChooseKLowerBound(minMachineCountPerZone - 1, configuration.storageTeamSize - 1); + + // Make decision: + // Layout is bad if there exists a machine having too few SSes or the machine count and minServerCountPerMachine + // cannot satisfy the targetTeamNumPerServer when building serverTeams. + // If this is the case, it is possible that buildTeam consistently fails but not always fails. + // However, we conservatively mark the layout bad + int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2; + bool sufficientToCreateEnoughServerTeams = + (possibleMachineTeamsGivenAMachine * pow(minServerCountPerMachine, configuration.storageTeamSize - 1) >= + targetTeamNumPerServer); + + // Layout is bad if the number of machines is not efficient to satisfy the targetMachineTeamNumPerMachine when + // building machineTeams. If this is the case, it is possible that buildTeam consistently fails. + int targetMachineTeamNumPerMachine = + SERVER_KNOBS->TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS + ? (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2 + : SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER; + bool sufficientToCreateEnoughMachineTeams = possibleMachineTeamsGivenAMachine >= targetMachineTeamNumPerMachine; + + TraceEvent(SevInfo, "BuildTeamsDecideGoodLayout", distributorId) + .suppressFor(60.0) + .setMaxEventLength(-1) + .setMaxFieldLength(-1) + .detail("Primary", primary) + .detail("MachineCount", machineCount) + .detail("MachineInfoSize", machine_info.size()) + .detail("MinServerCountPerMachine", minServerCountPerMachine) + .detail("MaxServerCountPerMachine", maxServerCountPerMachine) + .detail("ConfigTeamSize", configuration.storageTeamSize) + .detail("TargetTeamNumPerServer", targetTeamNumPerServer) + .detail("TargetMachineTeamNumPerMachine", targetMachineTeamNumPerMachine) + .detail("PossibleMachineTeamsGivenAMachine", possibleMachineTeamsGivenAMachine) + .detail("MinMachineCountPerZone", minMachineCountPerZone) + .detail("MaxMachineCountPerZone", maxMachineCountPerZone) + .detail("ZoneCount", zoneCount) + .detail("SufficientToCreateEnoughServerTeams", sufficientToCreateEnoughServerTeams) + .detail("SufficientToCreateEnoughMachineTeams", sufficientToCreateEnoughMachineTeams); + return sufficientToCreateEnoughServerTeams && sufficientToCreateEnoughMachineTeams; +} + +// Trace error if buildTeams keep failing for sufficient long time when the layout is good. +// For the layout, see comments of isServerMachineDCLayoutGood() +void DDTeamCollection::traceBuildTeamFailedForLongTime() { + if (!isServerMachineDCLayoutGood()) { + buildTeamsFailedStartTime.reset(); + TraceEvent(SevWarn, "BuildTeamsFailedTimerResetDueToBadLayout", distributorId).detail("Primary", primary); + } else if (!lastBuildTeamsFailed) { + buildTeamsFailedStartTime.reset(); + TraceEvent(SevInfo, "BuildTeamsFailedTimerResetDueToSucceed", distributorId).detail("Primary", primary); + } else if (!buildTeamsFailedStartTime.present()) { + buildTeamsFailedStartTime = now(); + TraceEvent(SevWarn, "BuildTeamsFailedTimerStart", distributorId); + } else if (now() - buildTeamsFailedStartTime.get() > SERVER_KNOBS->DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR) { + TraceEvent(SevError, "BuildTeamsFailedForLongTime", distributorId) + .detail("Primary", primary) + .detail("KeepFailedForSecond", now() - buildTeamsFailedStartTime.get()); + buildTeamsFailedStartTime.reset(); + } else { + TraceEvent(SevInfo, "BuildTeamsKeepFailed", distributorId) + .suppressFor(60.0) + .detail("KeepFailedForSecond", now() - buildTeamsFailedStartTime.get()); + } + return; +} + int DDTeamCollection::overlappingMembers(const std::vector& team) const { if (team.empty()) { return 0; @@ -5017,6 +5154,10 @@ int DDTeamCollection::addBestMachineTeams(int machineTeamsToBuild) { // NOTE: selectReplicas() should always return success when storageTeamSize = 1 ASSERT_WE_THINK(configuration.storageTeamSize > 1 || (configuration.storageTeamSize == 1 && success)); if (!success) { + TraceEvent(SevDebug, "AddBestMachineTeamFailedSelectReplicas", distributorId) + .detail("Primary", primary) + .detail("Name", configuration.storagePolicy->name()) + .detail("Policy", configuration.storagePolicy->info()); continue; // Try up to maxAttempts, since next time we may choose a different forcedAttributes } ASSERT_GT(forcedAttributes.size(), 0); @@ -5046,6 +5187,11 @@ int DDTeamCollection::addBestMachineTeams(int machineTeamsToBuild) { int overlap = overlappingMachineMembers(machineIDs); if (overlap == machineIDs.size()) { maxAttempts += 1; + TraceEvent(SevInfo, "AddBestMachineTeamAlreadyExist", distributorId) + .suppressFor(1.0) + .detail("Primary", primary) + .detail("Name", configuration.storagePolicy->name()) + .detail("Policy", configuration.storagePolicy->info()); continue; } score += SERVER_KNOBS->DD_OVERLAP_PENALTY * overlap; @@ -5078,7 +5224,11 @@ int DDTeamCollection::addBestMachineTeams(int machineTeamsToBuild) { TraceEvent(SevWarn, "BuildTeamsLastBuildTeamsFailed", distributorId) .detail("Primary", primary) .detail("Reason", "Unable to make desired machineTeams") - .detail("Hint", "Check TraceAllInfo event"); + .detail("Hint", "Check TraceAllInfo event") + .detail("BestTeam", bestTeam.size()) + .detail("ConfigTeamSize", configuration.storageTeamSize) + .detail("Name", configuration.storagePolicy->name()) + .detail("Policy", configuration.storagePolicy->info()); lastBuildTeamsFailed = true; break; } @@ -5349,6 +5499,77 @@ bool DDTeamCollection::notEnoughTeamsForAServer() const { return false; } +std::tuple DDTeamCollection::getHealthyStorageServerCountPerMachine() const { + std::unordered_map serverCountOnMachines; + for (auto& [serverID, server] : server_info) { + if (server_status.get(serverID).isUnhealthy()) { + continue; + } + std::string machineID = server->machine->machineID.toString(); + if (serverCountOnMachines.find(machineID) == serverCountOnMachines.end()) { + serverCountOnMachines[machineID] = 0; + } + serverCountOnMachines[machineID] = serverCountOnMachines[machineID] + 1; + } + int maxCount = -1; + int minCount = std::numeric_limits::max(); + for (const auto& [id, count] : serverCountOnMachines) { + if (count > maxCount) { + maxCount = count; + } + if (count < minCount) { + minCount = count; + } + } + return std::make_tuple(minCount, maxCount, serverCountOnMachines.size()); +} + +std::tuple DDTeamCollection::getMachineCountPerZone() const { + std::unordered_map> machinesPerDC; + for (auto& [serverID, server] : server_info) { + if (server_status.get(serverID).isUnhealthy()) { + continue; + } + std::string zoneId; + if (server->getLastKnownInterface().locality.zoneId().present()) { + zoneId = server->getLastKnownInterface().locality.zoneId().get().toString(); + } else { + zoneId = server->machine->machineID.toString(); + } + std::string machineID = server->machine->machineID.toString(); + machinesPerDC[zoneId].insert(machineID); + } + int maxCount = -1; + int minCount = std::numeric_limits::max(); + for (const auto& [id, machines] : machinesPerDC) { + int machineCount = machines.size(); + if (machineCount > maxCount) { + maxCount = machineCount; + } + if (machineCount < minCount) { + minCount = machineCount; + } + } + return std::make_tuple(minCount, maxCount, machinesPerDC.size()); +} + +std::vector DDTeamCollection::getServersOnMachineTeamDescription( + const std::vector>& machines) const { + std::vector res; + for (auto& machine : machines) { + int healthySSCount = 0; + for (auto it : machine->serversOnMachine) { + if (!server_status.get(it->getId()).isUnhealthy()) { + healthySSCount++; + } + } + res.push_back("[Machine]:" + machine->machineID.toString() + + ",[all]:" + std::to_string(machine->serversOnMachine.size()) + + ",[Healthy]:" + std::to_string(healthySSCount)); + } + return res; +} + int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int maxTeams) { ASSERT_GE(teamsToBuild, 0); ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); @@ -5402,12 +5623,13 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max te.detail("MachineTeamsAdded", addedMachineTeams); } } - while (addedTeams < teamsToBuild || notEnoughTeamsForAServer()) { std::vector bestServerTeam; int bestScore = std::numeric_limits::max(); int maxAttempts = SERVER_KNOBS->BEST_OF_AMT; // BEST_OF_AMT = 4 bool earlyQuitBuild = false; + int duplicatedAttemptCount = 0; + int machineTeamNotFoundCount = 0; for (int i = 0; i < maxAttempts && i < 100; ++i) { // Step 1: Choose 1 least used server and then choose 1 least used machine team from the server Reference chosenServer = findOneLeastUsedServer(); @@ -5424,9 +5646,10 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max if (!chosenMachineTeam.isValid()) { // We may face the situation that temporarily we have no healthy machine. - TraceEvent(SevWarn, "MachineTeamNotFound") + TraceEvent(SevWarn, "MachineTeamNotFound", distributorId) .detail("Primary", primary) .detail("MachineTeams", machineTeams.size()); + machineTeamNotFoundCount++; continue; // try randomly to find another least used server } @@ -5460,6 +5683,17 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max int overlap = overlappingMembers(serverTeam); if (overlap == serverTeam.size()) { maxAttempts += 1; + duplicatedAttemptCount++; + TraceEvent(SevInfo, "BuildServerTeamsCandidateAlreadyExist", distributorId) + .suppressFor(1.0) + .setMaxEventLength(-1) + .setMaxFieldLength(-1) + .detail("Primary", primary) + .detail("MachineCount", chosenMachineTeam->getMachines().size()) + .detail("ServersOnMachineTeam", + describe(getServersOnMachineTeamDescription(chosenMachineTeam->getMachines()))) + .detail("ServerTeam", describe(serverTeam)) + .detail("StorageTeamSize", configuration.storageTeamSize); continue; } @@ -5470,7 +5704,8 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max for (auto& server : serverTeam) { score += server_info[server]->getTeams().size(); } - TraceEvent(SevDebug, "BuildServerTeams") + TraceEvent(SevDebug, "BuildServerTeams", distributorId) + .detail("Primary", primary) .detail("Score", score) .detail("BestScore", bestScore) .detail("TeamSize", serverTeam.size()) @@ -5488,10 +5723,15 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max // Not find any team and will unlikely find a team lastBuildTeamsFailed = true; TraceEvent(SevWarn, "BuildTeamsLastBuildTeamsFailed", distributorId) - .detail("Reason", "Unable to find any valid serverTeam") + .detail("Reason", "Unable to find more valid serverTeam") .detail("Primary", primary) .detail("BestServerTeam", describe(bestServerTeam)) - .detail("ConfigStorageTeamSize", configuration.storageTeamSize); + .detail("ConfigStorageTeamSize", configuration.storageTeamSize) + .detail("AddedTeams", addedTeams) + .detail("TeamsToBuild", teamsToBuild) + .detail("NotEnoughTeamsForAServer", notEnoughTeamsForAServer()) + .detail("DuplicatedAttemptCount", duplicatedAttemptCount) + .detail("MachineTeamNotFoundCount", machineTeamNotFoundCount); break; } diff --git a/fdbserver/include/fdbserver/DDTeamCollection.h b/fdbserver/include/fdbserver/DDTeamCollection.h index a90aae48f1f..87a6224f42a 100644 --- a/fdbserver/include/fdbserver/DDTeamCollection.h +++ b/fdbserver/include/fdbserver/DDTeamCollection.h @@ -215,6 +215,7 @@ class DDTeamCollection : public ReferenceCounted { bool doBuildTeams; bool lastBuildTeamsFailed; + Optional buildTeamsFailedStartTime; Future teamBuilder; AsyncTrigger restartTeamBuilder; AsyncVar waitUntilRecruited; // make teambuilder wait until one new SS is recruited @@ -348,6 +349,10 @@ class DDTeamCollection : public ReferenceCounted { void evaluateTeamQuality() const; + void traceBuildTeamFailedForLongTime(); + + bool isServerMachineDCLayoutGood() const; + int overlappingMembers(const std::vector& team) const; int overlappingMachineMembers(std::vector> const& team) const; @@ -423,6 +428,13 @@ class DDTeamCollection : public ReferenceCounted { // Return true if there exists a server that does not have enough teams. bool notEnoughTeamsForAServer() const; + std::vector getServersOnMachineTeamDescription( + const std::vector>& machines) const; + + std::tuple getHealthyStorageServerCountPerMachine() const; + + std::tuple getMachineCountPerZone() const; + // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. // The following are guarantees of the process: // - Each newly-built team will meet the replication policy From b62ed94181b9387a4447b157ac5778cfeb1c9b7b Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Thu, 26 Sep 2024 21:21:52 -0700 Subject: [PATCH 02/12] refactor and fix --- fdbserver/DDTeamCollection.actor.cpp | 193 +++++++++++------- .../include/fdbserver/DDTeamCollection.h | 10 +- 2 files changed, 129 insertions(+), 74 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index a5b9caa60e8..f57154b550e 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -74,7 +74,7 @@ unsigned EligibilityCounter::getCount(int combinedType) const { } // namespace data_distribution -int getNChooseKLowerBound(int n, int k) { +size_t getNChooseKLowerBound(size_t n, size_t k) { if (k > n) return 0; if (k * 2 > n) @@ -82,8 +82,8 @@ int getNChooseKLowerBound(int n, int k) { if (k == 0) return 1; - int result = n; - for (int i = 2; i <= k; ++i) { + size_t result = n; + for (size_t i = 2; i <= k; ++i) { result = result * (n - i + 1); result = result / i; } @@ -4494,78 +4494,109 @@ void DDTeamCollection::evaluateTeamQuality() const { .detail("MachineMaxTeams", maxMachineTeams); } -// Decide if the current layout of servers and machines is good enough to meet the target when building serverTeams and -// machineTeams. Sometimes, buildTeams is consistently failed because the layout is bad For example, when each machine -// team is smaller than targetTeamNumPerServer, then buildTeam always fails. This can be caused by: (1) each machine has -// too few SSes; (2) machine count is too small Complexity: scan the server_info list -bool DDTeamCollection::isServerMachineDCLayoutGood() const { +bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine) const { // The decision is made based on the tracked healthy SSes - // Collect data points from server_info and filter out unhealthy SSes - std::tuple tmp = getHealthyStorageServerCountPerMachine(); - int minServerCountPerMachine = std::get<0>(tmp); - int maxServerCountPerMachine = std::get<1>(tmp); - int machineCount = std::get<2>(tmp); - if (machineCount == 0) { - // Layout is bad if the number of machine is 0 - TraceEvent(SevWarnAlways, "BuildTeamZeroMachineFromServerList", distributorId) + // Collect data points from server_info and filter out unhealthy machines + std::vector tmp = getHealthyMachineCountPerZoneOrDataHall(); + size_t minMachineCountPerZone = tmp[0]; + size_t maxMachineCountPerZone = tmp[1]; + size_t zoneCount = tmp[2]; + size_t totalHealthyMachineCount = tmp[3]; + if (zoneCount == 0) { + // Layout is bad if the number of zone is 0 + TraceEvent(SevWarnAlways, "BuildTeamZeroZoneFromServerList", distributorId) .suppressFor(60.0) .detail("Primary", primary); return false; } - if (maxServerCountPerMachine - minServerCountPerMachine > 1) { - // The unbalance count makes the condition harder to meet given the same amount of SSes. + if (maxMachineCountPerZone - minMachineCountPerZone > 1) { + // The unbalance count makes the condition harder to meet given the same amount of machines. // So, we make an alert here - TraceEvent(SevWarn, "BuildTeamUnbalanceSSPerMachineFromServerList", distributorId) + TraceEvent(SevWarn, "BuildTeamUnbalanceMachinePerZoneFromServerList", distributorId) .suppressFor(60.0) - .detail("MaxServerCountPerMachine", maxServerCountPerMachine) - .detail("MinServerCountPerMachine", minServerCountPerMachine) + .detail("MaxMachineCountPerZone", maxMachineCountPerZone) + .detail("MinMachineCountPerZone", minMachineCountPerZone) + .detail("ZoneCount", zoneCount) .detail("Primary", primary); } + // Make decision: + // (1) Layout is bad if the number of machine/zone is not efficient to satisfy the targetMachineTeamNumPerMachine + // when building machineTeams. If this is the case, it is possible that buildTeam consistently fails. + maxMachineTeamCountGivenAMachine = getNChooseKLowerBound(zoneCount - 1, configuration.storageTeamSize - 1) * + pow(minMachineCountPerZone, configuration.storageTeamSize - 1); + int targetMachineTeamNumPerMachine = + SERVER_KNOBS->TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS + ? (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2 + : SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER; + bool sufficientToCreateEnoughMachineTeams = targetMachineTeamNumPerMachine <= maxMachineTeamCountGivenAMachine; + + // (2) Layout is bad if the number of all healthy machines is not efficient to satisfy the desiredTotalMachineTeams + int desiredTotalMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; + uint64_t maxTotalMachineTeamCount = + getNChooseKLowerBound(totalHealthyMachineCount, configuration.storageTeamSize); // wrong! + sufficientToCreateEnoughMachineTeams &= (desiredTotalMachineTeams <= maxTotalMachineTeamCount); + + TraceEvent(SevInfo, "BuildTeamsDecideMachineGoodLayout", distributorId) + .suppressFor(60.0) + .setMaxEventLength(-1) + .setMaxFieldLength(-1) + .detail("Primary", primary) + .detail("HealthyMachineCount", totalHealthyMachineCount) + .detail("MachineInfoSize", machine_info.size()) + .detail("ConfigTeamSize", configuration.storageTeamSize) + .detail("TargetMachineTeamNumPerMachine", targetMachineTeamNumPerMachine) + .detail("MaxMachineTeamsGivenAMachine", maxMachineTeamCountGivenAMachine) + .detail("MaxTotalMachineTeamCount", maxTotalMachineTeamCount) + .detail("MinMachineCountPerZone", minMachineCountPerZone) + .detail("MaxMachineCountPerZone", maxMachineCountPerZone) + .detail("ZoneCount", zoneCount) + .detail("TotalHealthyMachineCount", totalHealthyMachineCount) + .detail("DesiredTotalMachineTeams", desiredTotalMachineTeams) + .detail("SufficientToCreateEnoughMachineTeams", sufficientToCreateEnoughMachineTeams); + return sufficientToCreateEnoughMachineTeams; +} + +bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGivenAMachine) const { + // The decision is made based on the tracked healthy SSes // Collect data points from server_info and filter out unhealthy SSes - // If a machine has no healthy SS, we do not count this machine - tmp = getMachineCountPerZone(); - int minMachineCountPerZone = std::get<0>(tmp); - int maxMachineCountPerZone = std::get<1>(tmp); - int zoneCount = std::get<2>(tmp); - if (zoneCount == 0) { - // Layout is bad if the number of zone is 0 - TraceEvent(SevWarnAlways, "BuildTeamZeroZoneFromServerList", distributorId) + std::vector tmp = getHealthyStorageServerCountPerMachine(); + size_t minServerCountPerMachine = tmp[0]; + size_t maxServerCountPerMachine = tmp[1]; + size_t machineCount = tmp[2]; + size_t totalHealthyServerCount = tmp[3]; + if (machineCount == 0) { + // Layout is bad if the number of machine is 0 + TraceEvent(SevWarnAlways, "BuildTeamZeroMachineFromServerList", distributorId) .suppressFor(60.0) .detail("Primary", primary); return false; } - if (maxMachineCountPerZone - minMachineCountPerZone > 1) { - // The unbalance count makes the condition harder to meet given the same amount of machines. + if (maxServerCountPerMachine - minServerCountPerMachine > 1) { + // The unbalance count makes the condition harder to meet given the same amount of SSes. // So, we make an alert here - TraceEvent(SevWarn, "BuildTeamUnbalanceMachinePerDCFromServerList", distributorId) + TraceEvent(SevWarn, "BuildTeamUnbalanceSSPerMachineFromServerList", distributorId) .suppressFor(60.0) - .detail("MaxMachineCountPerZone", maxMachineCountPerZone) - .detail("MinMachineCountPerZone", minMachineCountPerZone) - .detail("ZoneCount", zoneCount) + .detail("MaxServerCountPerMachine", maxServerCountPerMachine) + .detail("MinServerCountPerMachine", minServerCountPerMachine) .detail("Primary", primary); } - int possibleMachineTeamsGivenAMachine = - getNChooseKLowerBound(minMachineCountPerZone - 1, configuration.storageTeamSize - 1); // Make decision: - // Layout is bad if there exists a machine having too few SSes or the machine count and minServerCountPerMachine + // (1) Layout is bad if there exists a machine having too few SSes or the machine count and minServerCountPerMachine // cannot satisfy the targetTeamNumPerServer when building serverTeams. // If this is the case, it is possible that buildTeam consistently fails but not always fails. // However, we conservatively mark the layout bad int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2; bool sufficientToCreateEnoughServerTeams = - (possibleMachineTeamsGivenAMachine * pow(minServerCountPerMachine, configuration.storageTeamSize - 1) >= + (maxMachineTeamCountGivenAMachine * pow(minServerCountPerMachine, configuration.storageTeamSize - 1) >= targetTeamNumPerServer); - // Layout is bad if the number of machines is not efficient to satisfy the targetMachineTeamNumPerMachine when - // building machineTeams. If this is the case, it is possible that buildTeam consistently fails. - int targetMachineTeamNumPerMachine = - SERVER_KNOBS->TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS - ? (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2 - : SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER; - bool sufficientToCreateEnoughMachineTeams = possibleMachineTeamsGivenAMachine >= targetMachineTeamNumPerMachine; + // (2) Layout is bad if the number of all healthy servers is not efficient to satisfy the desiredTotalServerTeams + int desiredTotalServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; + uint64_t maxTotalServerTeamCount = getNChooseKLowerBound(totalHealthyServerCount, configuration.storageTeamSize); + sufficientToCreateEnoughServerTeams &= (desiredTotalServerTeams <= maxTotalServerTeamCount); - TraceEvent(SevInfo, "BuildTeamsDecideGoodLayout", distributorId) + TraceEvent(SevInfo, "BuildTeamsDecideServerGoodLayout", distributorId) .suppressFor(60.0) .setMaxEventLength(-1) .setMaxFieldLength(-1) @@ -4576,20 +4607,29 @@ bool DDTeamCollection::isServerMachineDCLayoutGood() const { .detail("MaxServerCountPerMachine", maxServerCountPerMachine) .detail("ConfigTeamSize", configuration.storageTeamSize) .detail("TargetTeamNumPerServer", targetTeamNumPerServer) - .detail("TargetMachineTeamNumPerMachine", targetMachineTeamNumPerMachine) - .detail("PossibleMachineTeamsGivenAMachine", possibleMachineTeamsGivenAMachine) - .detail("MinMachineCountPerZone", minMachineCountPerZone) - .detail("MaxMachineCountPerZone", maxMachineCountPerZone) - .detail("ZoneCount", zoneCount) - .detail("SufficientToCreateEnoughServerTeams", sufficientToCreateEnoughServerTeams) - .detail("SufficientToCreateEnoughMachineTeams", sufficientToCreateEnoughMachineTeams); + .detail("MaxMachineTeamsGivenAMachine", maxMachineTeamCountGivenAMachine) + .detail("DesiredTotalServerTeams", desiredTotalServerTeams) + .detail("TotalHealthyServerCount", totalHealthyServerCount) + .detail("SufficientToCreateEnoughServerTeams", sufficientToCreateEnoughServerTeams); + return sufficientToCreateEnoughServerTeams; +} + +// Decide if the current layout of servers and machines is good enough to meet the target when building serverTeams and +// machineTeams. Sometimes, buildTeams is consistently failed because the layout is bad For example, when each machine +// team is smaller than targetTeamNumPerServer, then buildTeam always fails. This can be caused by: (1) each machine has +// too few SSes; (2) machine count is too small Complexity: scan the server_info list +bool DDTeamCollection::isServerMachineLayoutGood() const { + uint64_t maxMachineTeamCountGivenAMachine = 0; + bool sufficientToCreateEnoughMachineTeams = isMachineLayoutGood(maxMachineTeamCountGivenAMachine); + // maxMachineTeamCountGivenAMachine is updated by isMachineLayoutGood + bool sufficientToCreateEnoughServerTeams = isServerLayoutGood(maxMachineTeamCountGivenAMachine); return sufficientToCreateEnoughServerTeams && sufficientToCreateEnoughMachineTeams; } // Trace error if buildTeams keep failing for sufficient long time when the layout is good. -// For the layout, see comments of isServerMachineDCLayoutGood() +// For the layout, see comments of isServerMachineLayoutGood() void DDTeamCollection::traceBuildTeamFailedForLongTime() { - if (!isServerMachineDCLayoutGood()) { + if (!isServerMachineLayoutGood()) { buildTeamsFailedStartTime.reset(); TraceEvent(SevWarn, "BuildTeamsFailedTimerResetDueToBadLayout", distributorId).detail("Primary", primary); } else if (!lastBuildTeamsFailed) { @@ -5188,7 +5228,11 @@ int DDTeamCollection::addBestMachineTeams(int machineTeamsToBuild) { if (overlap == machineIDs.size()) { maxAttempts += 1; TraceEvent(SevInfo, "AddBestMachineTeamAlreadyExist", distributorId) - .suppressFor(1.0) + // .suppressFor(1.0) + .detail("Index", i) + .detail("AddedMachineTeams", addedMachineTeams) + .detail("MachineTeamsToBuild", machineTeamsToBuild) + .detail("NotEnoughMachineTeamsForAMachine", notEnoughMachineTeamsForAMachine()) .detail("Primary", primary) .detail("Name", configuration.storagePolicy->name()) .detail("Policy", configuration.storagePolicy->info()); @@ -5499,8 +5543,9 @@ bool DDTeamCollection::notEnoughTeamsForAServer() const { return false; } -std::tuple DDTeamCollection::getHealthyStorageServerCountPerMachine() const { +std::vector DDTeamCollection::getHealthyStorageServerCountPerMachine() const { std::unordered_map serverCountOnMachines; + size_t serverCount = 0; for (auto& [serverID, server] : server_info) { if (server_status.get(serverID).isUnhealthy()) { continue; @@ -5510,9 +5555,10 @@ std::tuple DDTeamCollection::getHealthyStorageServerCountPerMachi serverCountOnMachines[machineID] = 0; } serverCountOnMachines[machineID] = serverCountOnMachines[machineID] + 1; + serverCount++; } - int maxCount = -1; - int minCount = std::numeric_limits::max(); + size_t maxCount = 0; + size_t minCount = std::numeric_limits::max(); for (const auto& [id, count] : serverCountOnMachines) { if (count > maxCount) { maxCount = count; @@ -5521,28 +5567,33 @@ std::tuple DDTeamCollection::getHealthyStorageServerCountPerMachi minCount = count; } } - return std::make_tuple(minCount, maxCount, serverCountOnMachines.size()); + return { minCount, maxCount, serverCountOnMachines.size(), serverCount }; } -std::tuple DDTeamCollection::getMachineCountPerZone() const { - std::unordered_map> machinesPerDC; +std::vector DDTeamCollection::getHealthyMachineCountPerZoneOrDataHall() const { + std::unordered_map> machinesPerZone; + std::unordered_set machines; for (auto& [serverID, server] : server_info) { - if (server_status.get(serverID).isUnhealthy()) { + if (!isMachineHealthy(server->machine)) { continue; } std::string zoneId; - if (server->getLastKnownInterface().locality.zoneId().present()) { + if (configuration.storagePolicy->info().find("data_hall") != std::string::npos) { + ASSERT(server->getLastKnownInterface().locality.dataHallId().present()); + zoneId = server->getLastKnownInterface().locality.dataHallId().get().toString(); + } else if (server->getLastKnownInterface().locality.zoneId().present()) { zoneId = server->getLastKnownInterface().locality.zoneId().get().toString(); } else { zoneId = server->machine->machineID.toString(); } std::string machineID = server->machine->machineID.toString(); - machinesPerDC[zoneId].insert(machineID); + machinesPerZone[zoneId].insert(machineID); + machines.insert(machineID); } - int maxCount = -1; - int minCount = std::numeric_limits::max(); - for (const auto& [id, machines] : machinesPerDC) { - int machineCount = machines.size(); + size_t maxCount = 0; + size_t minCount = std::numeric_limits::max(); + for (const auto& [id, machines] : machinesPerZone) { + size_t machineCount = machines.size(); if (machineCount > maxCount) { maxCount = machineCount; } @@ -5550,7 +5601,7 @@ std::tuple DDTeamCollection::getMachineCountPerZone() const { minCount = machineCount; } } - return std::make_tuple(minCount, maxCount, machinesPerDC.size()); + return { minCount, maxCount, machinesPerZone.size(), machines.size() }; } std::vector DDTeamCollection::getServersOnMachineTeamDescription( diff --git a/fdbserver/include/fdbserver/DDTeamCollection.h b/fdbserver/include/fdbserver/DDTeamCollection.h index 87a6224f42a..b6837cec9c5 100644 --- a/fdbserver/include/fdbserver/DDTeamCollection.h +++ b/fdbserver/include/fdbserver/DDTeamCollection.h @@ -351,7 +351,7 @@ class DDTeamCollection : public ReferenceCounted { void traceBuildTeamFailedForLongTime(); - bool isServerMachineDCLayoutGood() const; + bool isServerMachineLayoutGood() const; int overlappingMembers(const std::vector& team) const; @@ -431,9 +431,13 @@ class DDTeamCollection : public ReferenceCounted { std::vector getServersOnMachineTeamDescription( const std::vector>& machines) const; - std::tuple getHealthyStorageServerCountPerMachine() const; + std::vector getHealthyStorageServerCountPerMachine() const; - std::tuple getMachineCountPerZone() const; + std::vector getHealthyMachineCountPerZoneOrDataHall() const; + + bool isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine) const; + + bool isServerLayoutGood(const uint64_t maxMachineTeamCountGivenAMachine) const; // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. // The following are guarantees of the process: From eb935c374edfe22d42c2c0a95457639067f320fe Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Thu, 26 Sep 2024 22:32:49 -0700 Subject: [PATCH 03/12] fix --- fdbserver/DDTeamCollection.actor.cpp | 27 +++++++++++-------- .../include/fdbserver/DDTeamCollection.h | 4 +-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index f57154b550e..7e1ef1362a6 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -4494,7 +4494,8 @@ void DDTeamCollection::evaluateTeamQuality() const { .detail("MachineMaxTeams", maxMachineTeams); } -bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine) const { +bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine, + uint64_t& maxMachineTeamCount) const { // The decision is made based on the tracked healthy SSes // Collect data points from server_info and filter out unhealthy machines std::vector tmp = getHealthyMachineCountPerZoneOrDataHall(); @@ -4532,9 +4533,9 @@ bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMa // (2) Layout is bad if the number of all healthy machines is not efficient to satisfy the desiredTotalMachineTeams int desiredTotalMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; - uint64_t maxTotalMachineTeamCount = - getNChooseKLowerBound(totalHealthyMachineCount, configuration.storageTeamSize); // wrong! - sufficientToCreateEnoughMachineTeams &= (desiredTotalMachineTeams <= maxTotalMachineTeamCount); + maxMachineTeamCount = getNChooseKLowerBound(zoneCount, configuration.storageTeamSize) * + pow(minMachineCountPerZone, configuration.storageTeamSize); + sufficientToCreateEnoughMachineTeams &= (desiredTotalMachineTeams <= maxMachineTeamCount); TraceEvent(SevInfo, "BuildTeamsDecideMachineGoodLayout", distributorId) .suppressFor(60.0) @@ -4546,7 +4547,7 @@ bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMa .detail("ConfigTeamSize", configuration.storageTeamSize) .detail("TargetMachineTeamNumPerMachine", targetMachineTeamNumPerMachine) .detail("MaxMachineTeamsGivenAMachine", maxMachineTeamCountGivenAMachine) - .detail("MaxTotalMachineTeamCount", maxTotalMachineTeamCount) + .detail("MaxMachineTeamCount", maxMachineTeamCount) .detail("MinMachineCountPerZone", minMachineCountPerZone) .detail("MaxMachineCountPerZone", maxMachineCountPerZone) .detail("ZoneCount", zoneCount) @@ -4556,7 +4557,8 @@ bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMa return sufficientToCreateEnoughMachineTeams; } -bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGivenAMachine) const { +bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGivenAMachine, + const uint64_t maxMachineTeamCount) const { // The decision is made based on the tracked healthy SSes // Collect data points from server_info and filter out unhealthy SSes std::vector tmp = getHealthyStorageServerCountPerMachine(); @@ -4593,8 +4595,7 @@ bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGive // (2) Layout is bad if the number of all healthy servers is not efficient to satisfy the desiredTotalServerTeams int desiredTotalServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; - uint64_t maxTotalServerTeamCount = getNChooseKLowerBound(totalHealthyServerCount, configuration.storageTeamSize); - sufficientToCreateEnoughServerTeams &= (desiredTotalServerTeams <= maxTotalServerTeamCount); + sufficientToCreateEnoughServerTeams &= (desiredTotalServerTeams <= maxMachineTeamCount); TraceEvent(SevInfo, "BuildTeamsDecideServerGoodLayout", distributorId) .suppressFor(60.0) @@ -4608,6 +4609,7 @@ bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGive .detail("ConfigTeamSize", configuration.storageTeamSize) .detail("TargetTeamNumPerServer", targetTeamNumPerServer) .detail("MaxMachineTeamsGivenAMachine", maxMachineTeamCountGivenAMachine) + .detail("MaxMachineTeamCount", maxMachineTeamCount) .detail("DesiredTotalServerTeams", desiredTotalServerTeams) .detail("TotalHealthyServerCount", totalHealthyServerCount) .detail("SufficientToCreateEnoughServerTeams", sufficientToCreateEnoughServerTeams); @@ -4620,9 +4622,12 @@ bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGive // too few SSes; (2) machine count is too small Complexity: scan the server_info list bool DDTeamCollection::isServerMachineLayoutGood() const { uint64_t maxMachineTeamCountGivenAMachine = 0; - bool sufficientToCreateEnoughMachineTeams = isMachineLayoutGood(maxMachineTeamCountGivenAMachine); - // maxMachineTeamCountGivenAMachine is updated by isMachineLayoutGood - bool sufficientToCreateEnoughServerTeams = isServerLayoutGood(maxMachineTeamCountGivenAMachine); + uint64_t maxMachineTeamCount = 0; + bool sufficientToCreateEnoughMachineTeams = + isMachineLayoutGood(maxMachineTeamCountGivenAMachine, maxMachineTeamCount); + // maxMachineTeamCountGivenAMachine and maxMachineTeamCount updated by isMachineLayoutGood + bool sufficientToCreateEnoughServerTeams = + isServerLayoutGood(maxMachineTeamCountGivenAMachine, maxMachineTeamCount); return sufficientToCreateEnoughServerTeams && sufficientToCreateEnoughMachineTeams; } diff --git a/fdbserver/include/fdbserver/DDTeamCollection.h b/fdbserver/include/fdbserver/DDTeamCollection.h index b6837cec9c5..a1a29288dd7 100644 --- a/fdbserver/include/fdbserver/DDTeamCollection.h +++ b/fdbserver/include/fdbserver/DDTeamCollection.h @@ -435,9 +435,9 @@ class DDTeamCollection : public ReferenceCounted { std::vector getHealthyMachineCountPerZoneOrDataHall() const; - bool isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine) const; + bool isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine, uint64_t& maxMachineTeamCount) const; - bool isServerLayoutGood(const uint64_t maxMachineTeamCountGivenAMachine) const; + bool isServerLayoutGood(const uint64_t maxMachineTeamCountGivenAMachine, const uint64_t maxMachineTeamCount) const; // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. // The following are guarantees of the process: From 4ad5545d3bd412cd48256e58ecc06f81956138fe Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Fri, 27 Sep 2024 09:40:19 -0700 Subject: [PATCH 04/12] add it to printSnapshotTeamsInfo --- fdbserver/DDTeamCollection.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 7e1ef1362a6..9dcd8e82a11 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -3477,7 +3477,8 @@ class DDTeamCollectionImpl { TraceEvent("DDPrintSnapshotTeamsInfo", self->getDistributorId()) .detail("SnapshotSpeed", now() - snapshotStart) - .detail("Primary", self->isPrimary()); + .detail("Primary", self->isPrimary()) + .detail("ServerMachineLayoutIsGood", self->isServerMachineLayoutGood()); // Print to TraceEvents TraceEvent("DDConfig", self->getDistributorId()) From 4b41c8abcc8340ad9f2b30c92f1b169ebac5b952 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 14:01:56 -0800 Subject: [PATCH 05/12] address comments --- fdbserver/DDTeamCollection.actor.cpp | 53 ++++++++----------- .../include/fdbserver/DDTeamCollection.h | 30 ++++++++++- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 9dcd8e82a11..95adfcc145e 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -82,12 +82,11 @@ size_t getNChooseKLowerBound(size_t n, size_t k) { if (k == 0) return 1; - size_t result = n; + double result = n; for (size_t i = 2; i <= k; ++i) { - result = result * (n - i + 1); - result = result / i; + result *= ((n - i + 1) * 1.0 / i); } - return result; + return static_cast(ceil(result)); } class DDTeamCollectionImpl { @@ -4499,11 +4498,11 @@ bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMa uint64_t& maxMachineTeamCount) const { // The decision is made based on the tracked healthy SSes // Collect data points from server_info and filter out unhealthy machines - std::vector tmp = getHealthyMachineCountPerZoneOrDataHall(); - size_t minMachineCountPerZone = tmp[0]; - size_t maxMachineCountPerZone = tmp[1]; - size_t zoneCount = tmp[2]; - size_t totalHealthyMachineCount = tmp[3]; + TCMachineZoneMapCounting res = getMachineZoneMapCounting(); + size_t minMachineCountPerZone = res.minMachineCountPerZone; + size_t maxMachineCountPerZone = res.maxMachineCountPerZone; + size_t zoneCount = res.zoneCount; + size_t totalHealthyMachineCount = res.totalHealthyMachineCount; if (zoneCount == 0) { // Layout is bad if the number of zone is 0 TraceEvent(SevWarnAlways, "BuildTeamZeroZoneFromServerList", distributorId) @@ -4562,11 +4561,11 @@ bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGive const uint64_t maxMachineTeamCount) const { // The decision is made based on the tracked healthy SSes // Collect data points from server_info and filter out unhealthy SSes - std::vector tmp = getHealthyStorageServerCountPerMachine(); - size_t minServerCountPerMachine = tmp[0]; - size_t maxServerCountPerMachine = tmp[1]; - size_t machineCount = tmp[2]; - size_t totalHealthyServerCount = tmp[3]; + TCStorageServerMachineMapCounting res = getStorageServerMachineMapCounting(); + size_t minServerCountPerMachine = res.minServerCountPerMachine; + size_t maxServerCountPerMachine = res.maxServerCountPerMachine; + size_t machineCount = res.machineCount; + size_t totalHealthyServerCount = res.totalHealthyServerCount; if (machineCount == 0) { // Layout is bad if the number of machine is 0 TraceEvent(SevWarnAlways, "BuildTeamZeroMachineFromServerList", distributorId) @@ -5549,8 +5548,8 @@ bool DDTeamCollection::notEnoughTeamsForAServer() const { return false; } -std::vector DDTeamCollection::getHealthyStorageServerCountPerMachine() const { - std::unordered_map serverCountOnMachines; +TCStorageServerMachineMapCounting DDTeamCollection::getStorageServerMachineMapCounting() const { + std::unordered_map serverCountOnMachines; size_t serverCount = 0; for (auto& [serverID, server] : server_info) { if (server_status.get(serverID).isUnhealthy()) { @@ -5566,17 +5565,13 @@ std::vector DDTeamCollection::getHealthyStorageServerCountPerMachine() c size_t maxCount = 0; size_t minCount = std::numeric_limits::max(); for (const auto& [id, count] : serverCountOnMachines) { - if (count > maxCount) { - maxCount = count; - } - if (count < minCount) { - minCount = count; - } + maxCount = std::max(count, maxCount); + minCount = std::min(count, minCount); } - return { minCount, maxCount, serverCountOnMachines.size(), serverCount }; + return TCStorageServerMachineMapCounting(minCount, maxCount, serverCountOnMachines.size(), serverCount); } -std::vector DDTeamCollection::getHealthyMachineCountPerZoneOrDataHall() const { +TCMachineZoneMapCounting DDTeamCollection::getMachineZoneMapCounting() const { std::unordered_map> machinesPerZone; std::unordered_set machines; for (auto& [serverID, server] : server_info) { @@ -5600,14 +5595,10 @@ std::vector DDTeamCollection::getHealthyMachineCountPerZoneOrDataHall() size_t minCount = std::numeric_limits::max(); for (const auto& [id, machines] : machinesPerZone) { size_t machineCount = machines.size(); - if (machineCount > maxCount) { - maxCount = machineCount; - } - if (machineCount < minCount) { - minCount = machineCount; - } + maxCount = std::max(machineCount, maxCount); + minCount = std::min(machineCount, maxCount); } - return { minCount, maxCount, machinesPerZone.size(), machines.size() }; + return TCMachineZoneMapCounting(minCount, maxCount, machinesPerZone.size(), machines.size()); } std::vector DDTeamCollection::getServersOnMachineTeamDescription( diff --git a/fdbserver/include/fdbserver/DDTeamCollection.h b/fdbserver/include/fdbserver/DDTeamCollection.h index a1a29288dd7..5a24cffcd18 100644 --- a/fdbserver/include/fdbserver/DDTeamCollection.h +++ b/fdbserver/include/fdbserver/DDTeamCollection.h @@ -202,6 +202,32 @@ struct DDTeamCollectionInitParams { PromiseStream triggerStorageQueueRebalance; }; +struct TCStorageServerMachineMapCounting { + size_t minServerCountPerMachine = 0; + size_t maxServerCountPerMachine = 0; + size_t machineCount = 0; + size_t totalHealthyServerCount = 0; + TCStorageServerMachineMapCounting(size_t minServerCountPerMachine, + size_t maxServerCountPerMachine, + size_t machineCount, + size_t totalHealthyServerCount) + : minServerCountPerMachine(minServerCountPerMachine), maxServerCountPerMachine(maxServerCountPerMachine), + machineCount(machineCount), totalHealthyServerCount(totalHealthyServerCount) {} +}; + +struct TCMachineZoneMapCounting { + size_t minMachineCountPerZone = 0; + size_t maxMachineCountPerZone = 0; + size_t zoneCount = 0; + size_t totalHealthyMachineCount = 0; + TCMachineZoneMapCounting(size_t minMachineCountPerZone, + size_t maxMachineCountPerZone, + size_t zoneCount, + size_t totalHealthyMachineCount) + : minMachineCountPerZone(minMachineCountPerZone), maxMachineCountPerZone(maxMachineCountPerZone), + zoneCount(zoneCount), totalHealthyMachineCount(totalHealthyMachineCount) {} +}; + class DDTeamCollection : public ReferenceCounted { friend class DDTeamCollectionImpl; friend class DDTeamCollectionUnitTest; @@ -431,9 +457,9 @@ class DDTeamCollection : public ReferenceCounted { std::vector getServersOnMachineTeamDescription( const std::vector>& machines) const; - std::vector getHealthyStorageServerCountPerMachine() const; + TCStorageServerMachineMapCounting getStorageServerMachineMapCounting() const; - std::vector getHealthyMachineCountPerZoneOrDataHall() const; + TCMachineZoneMapCounting getMachineZoneMapCounting() const; bool isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMachine, uint64_t& maxMachineTeamCount) const; From 37d21caf665897cca3b642dc5b22e01078479315 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 14:05:46 -0800 Subject: [PATCH 06/12] address comments --- fdbserver/DDTeamCollection.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 95adfcc145e..cebf1eaf784 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -77,10 +77,10 @@ unsigned EligibilityCounter::getCount(int combinedType) const { size_t getNChooseKLowerBound(size_t n, size_t k) { if (k > n) return 0; - if (k * 2 > n) - k = n - k; if (k == 0) return 1; + if (k * 2 > n) + k = n - k; double result = n; for (size_t i = 2; i <= k; ++i) { From ba5259fa2325c7da30c0274caff479fbadf6b82a Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 14:11:59 -0800 Subject: [PATCH 07/12] nits --- fdbserver/DDTeamCollection.actor.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index cebf1eaf784..c8aa4fcb0b1 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5232,8 +5232,7 @@ int DDTeamCollection::addBestMachineTeams(int machineTeamsToBuild) { int overlap = overlappingMachineMembers(machineIDs); if (overlap == machineIDs.size()) { maxAttempts += 1; - TraceEvent(SevInfo, "AddBestMachineTeamAlreadyExist", distributorId) - // .suppressFor(1.0) + TraceEvent(SevDebug, "AddBestMachineTeamAlreadyExist", distributorId) .detail("Index", i) .detail("AddedMachineTeams", addedMachineTeams) .detail("MachineTeamsToBuild", machineTeamsToBuild) @@ -5732,8 +5731,7 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max if (overlap == serverTeam.size()) { maxAttempts += 1; duplicatedAttemptCount++; - TraceEvent(SevInfo, "BuildServerTeamsCandidateAlreadyExist", distributorId) - .suppressFor(1.0) + TraceEvent(SevDebug, "BuildServerTeamsCandidateAlreadyExist", distributorId) .setMaxEventLength(-1) .setMaxFieldLength(-1) .detail("Primary", primary) From 59a6c5c42c75a42906840fc39951f3dc5d10178e Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 15:00:38 -0800 Subject: [PATCH 08/12] nits --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/DDTeamCollection.actor.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index d80017c9062..6fc8a7d0cbf 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -365,7 +365,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_LARGE_TEAM_DELAY, 60.0 ); init( DD_FIX_WRONG_REPLICAS_DELAY, 60.0 ); init (DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM, false ); if (isSimulated) DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM = true; - init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 10 * 60; + init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 15 * 60; // TeamRemover init( TR_LOW_SPACE_PIVOT_DELAY_SEC, 0 ); if (isSimulated) TR_LOW_SPACE_PIVOT_DELAY_SEC = deterministicRandom()->randomInt(0, 3); diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index c8aa4fcb0b1..5862f2b9fbd 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -86,7 +86,7 @@ size_t getNChooseKLowerBound(size_t n, size_t k) { for (size_t i = 2; i <= k; ++i) { result *= ((n - i + 1) * 1.0 / i); } - return static_cast(ceil(result)); + return static_cast(floor(result)); } class DDTeamCollectionImpl { @@ -4619,7 +4619,7 @@ bool DDTeamCollection::isServerLayoutGood(const uint64_t maxMachineTeamCountGive // Decide if the current layout of servers and machines is good enough to meet the target when building serverTeams and // machineTeams. Sometimes, buildTeams is consistently failed because the layout is bad For example, when each machine // team is smaller than targetTeamNumPerServer, then buildTeam always fails. This can be caused by: (1) each machine has -// too few SSes; (2) machine count is too small Complexity: scan the server_info list +// too few SSes; (2) machine count is too small; Complexity: scan the server_info list bool DDTeamCollection::isServerMachineLayoutGood() const { uint64_t maxMachineTeamCountGivenAMachine = 0; uint64_t maxMachineTeamCount = 0; From e8f7c611f44a764ffb7e18ba6462509d8731246c Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 17:17:16 -0800 Subject: [PATCH 09/12] fix getNChooseKLowerBound --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/DDTeamCollection.actor.cpp | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 6fc8a7d0cbf..d28392d2c59 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -365,7 +365,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_LARGE_TEAM_DELAY, 60.0 ); init( DD_FIX_WRONG_REPLICAS_DELAY, 60.0 ); init (DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM, false ); if (isSimulated) DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM = true; - init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 15 * 60; + init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 5 * 60; // TeamRemover init( TR_LOW_SPACE_PIVOT_DELAY_SEC, 0 ); if (isSimulated) TR_LOW_SPACE_PIVOT_DELAY_SEC = deterministicRandom()->randomInt(0, 3); diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 5862f2b9fbd..8d1b74a5e7a 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -74,21 +74,21 @@ unsigned EligibilityCounter::getCount(int combinedType) const { } // namespace data_distribution -size_t getNChooseKLowerBound(size_t n, size_t k) { - if (k > n) - return 0; - if (k == 0) +size_t getNChooseKLowerBound(int n, int k) { + ASSERT(k >= 0 && k <= n); + if (k == 0 || k == n) { return 1; - if (k * 2 > n) + } + // Take advantage of symmetry C(n, k) = C(n, n - k) + if (k > n - k) { k = n - k; - - double result = n; - for (size_t i = 2; i <= k; ++i) { - result *= ((n - i + 1) * 1.0 / i); + } + double result = 1; + for (int i = 1; i <= k; ++i) { + result *= ((n - k + i) * 1.0 / i); // avoid overflow } return static_cast(floor(result)); } - class DDTeamCollectionImpl { ACTOR static Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { state double start = now(); From 2b202bd4477ae462d1c4fe3e75f297d7b87ac729 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 17:17:30 -0800 Subject: [PATCH 10/12] nits --- fdbserver/DDTeamCollection.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 8d1b74a5e7a..5160b688f89 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -89,6 +89,7 @@ size_t getNChooseKLowerBound(int n, int k) { } return static_cast(floor(result)); } + class DDTeamCollectionImpl { ACTOR static Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { state double start = now(); From fe44a7dc50afb43714139d634d29173f8d071a83 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 17:33:48 -0800 Subject: [PATCH 11/12] fix corner case --- fdbserver/DDTeamCollection.actor.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 5160b688f89..c689a468961 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -75,7 +75,10 @@ unsigned EligibilityCounter::getCount(int combinedType) const { } // namespace data_distribution size_t getNChooseKLowerBound(int n, int k) { - ASSERT(k >= 0 && k <= n); + ASSERT(n >= 0 && k >= 0); + if (k > n) { + return 0; + } if (k == 0 || k == n) { return 1; } From 87043525e9325e03a9a6679c8e9820eb360ef5ec Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Mon, 4 Nov 2024 21:22:27 -0800 Subject: [PATCH 12/12] nits --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/DDTeamCollection.actor.cpp | 25 +++---------------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index d28392d2c59..d80017c9062 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -365,7 +365,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_LARGE_TEAM_DELAY, 60.0 ); init( DD_FIX_WRONG_REPLICAS_DELAY, 60.0 ); init (DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM, false ); if (isSimulated) DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM = true; - init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 5 * 60; + init( DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR, 0 ); if (isSimulated) DD_BUILD_TEAMS_FAILED_TIMESPAN_AS_ERROR = 10 * 60; // TeamRemover init( TR_LOW_SPACE_PIVOT_DELAY_SEC, 0 ); if (isSimulated) TR_LOW_SPACE_PIVOT_DELAY_SEC = deterministicRandom()->randomInt(0, 3); diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index c689a468961..dcc5b16f06a 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -74,25 +74,6 @@ unsigned EligibilityCounter::getCount(int combinedType) const { } // namespace data_distribution -size_t getNChooseKLowerBound(int n, int k) { - ASSERT(n >= 0 && k >= 0); - if (k > n) { - return 0; - } - if (k == 0 || k == n) { - return 1; - } - // Take advantage of symmetry C(n, k) = C(n, n - k) - if (k > n - k) { - k = n - k; - } - double result = 1; - for (int i = 1; i <= k; ++i) { - result *= ((n - k + i) * 1.0 / i); // avoid overflow - } - return static_cast(floor(result)); -} - class DDTeamCollectionImpl { ACTOR static Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { state double start = now(); @@ -4527,7 +4508,7 @@ bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMa // Make decision: // (1) Layout is bad if the number of machine/zone is not efficient to satisfy the targetMachineTeamNumPerMachine // when building machineTeams. If this is the case, it is possible that buildTeam consistently fails. - maxMachineTeamCountGivenAMachine = getNChooseKLowerBound(zoneCount - 1, configuration.storageTeamSize - 1) * + maxMachineTeamCountGivenAMachine = nChooseK(zoneCount - 1, configuration.storageTeamSize - 1) * pow(minMachineCountPerZone, configuration.storageTeamSize - 1); int targetMachineTeamNumPerMachine = SERVER_KNOBS->TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS @@ -4537,8 +4518,8 @@ bool DDTeamCollection::isMachineLayoutGood(uint64_t& maxMachineTeamCountGivenAMa // (2) Layout is bad if the number of all healthy machines is not efficient to satisfy the desiredTotalMachineTeams int desiredTotalMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; - maxMachineTeamCount = getNChooseKLowerBound(zoneCount, configuration.storageTeamSize) * - pow(minMachineCountPerZone, configuration.storageTeamSize); + maxMachineTeamCount = + nChooseK(zoneCount, configuration.storageTeamSize) * pow(minMachineCountPerZone, configuration.storageTeamSize); sufficientToCreateEnoughMachineTeams &= (desiredTotalMachineTeams <= maxMachineTeamCount); TraceEvent(SevInfo, "BuildTeamsDecideMachineGoodLayout", distributorId)