diff --git a/cloud/blockstore/config/storage.proto b/cloud/blockstore/config/storage.proto index 3c0841854ce..8769319fd1e 100644 --- a/cloud/blockstore/config/storage.proto +++ b/cloud/blockstore/config/storage.proto @@ -1074,4 +1074,7 @@ message TStorageServiceConfig // Enabling direct copying of data between disk agents. optional bool UseDirectCopyRange = 394; + + // Maximum number of pending allocation requests per one disk. + optional uint32 MaxNonReplicatedDiskAllocationRequests = 395; } diff --git a/cloud/blockstore/libs/storage/core/config.cpp b/cloud/blockstore/libs/storage/core/config.cpp index 7bd2646e119..8df28ee7638 100644 --- a/cloud/blockstore/libs/storage/core/config.cpp +++ b/cloud/blockstore/libs/storage/core/config.cpp @@ -456,6 +456,7 @@ TDuration MSeconds(ui32 value) xxx(NonReplicatedDontSuspendDevices, bool, false )\ xxx(AddClientRetryTimeoutIncrement, TDuration, MSeconds(100) )\ xxx(MaxNonReplicatedDiskDeallocationRequests, ui32, 16 )\ + xxx(MaxNonReplicatedDiskAllocationRequests, ui32, 16 )\ xxx(BalancerActionDelayInterval, TDuration, Seconds(3) )\ \ xxx(UseMirrorResync, bool, false )\ diff --git a/cloud/blockstore/libs/storage/core/config.h b/cloud/blockstore/libs/storage/core/config.h index 6fc570bf2ed..c3ac765b02e 100644 --- a/cloud/blockstore/libs/storage/core/config.h +++ b/cloud/blockstore/libs/storage/core/config.h @@ -516,6 +516,7 @@ class TStorageConfig TDuration GetDiskRegistryBackupPeriod() const; TString GetDiskRegistryBackupDirPath() const; + ui32 GetMaxNonReplicatedDiskAllocationRequests() const; ui32 GetMaxNonReplicatedDiskDeallocationRequests() const; TDuration GetDiskRegistryMetricsCachePeriod() const; diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h index aa372f426e0..6542e883990 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h @@ -83,6 +83,7 @@ class TDiskRegistryActor final // Pending requests TDeque PendingRequests; + THashMap> PendingDiskAllocationRequests; THashMap> PendingDiskDeallocationRequests; bool BrokenDisksDestructionInProgress = false; @@ -227,6 +228,20 @@ class TDiskRegistryActor final TDiskRegistryDatabase& db, TDiskRegistryStateSnapshot& args); + void AddPendingAllocation( + const NActors::TActorContext& ctx, + const TString& diskId, + TRequestInfoPtr requestInfoPtr); + + void ReplyToPendingAllocations( + const NActors::TActorContext& ctx, + const TString& diskId); + + void ReplyToPendingAllocations( + const NActors::TActorContext& ctx, + TVector& requestInfos, + NProto::TError error); + void AddPendingDeallocation( const NActors::TActorContext& ctx, const TString& diskId, diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_allocate.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_allocate.cpp index 06ad674e2a9..dccaaf9e3f7 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_allocate.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_allocate.cpp @@ -131,6 +131,7 @@ void TDiskRegistryActor::ExecuteAddDisk( &result); args.Devices = std::move(result.Devices); + args.DirtyDevices = std::move(result.DirtyDevices); args.DeviceMigrations = std::move(result.Migrations); args.Replicas = std::move(result.Replicas); args.DeviceReplacementUUIDs = std::move(result.DeviceReplacementIds); @@ -157,6 +158,9 @@ void TDiskRegistryActor::CompleteAddDisk( TStringBuilder devices; OutputDevices(args.Devices, devices); + TStringBuilder dirtyDevices; + OutputDevices(args.DirtyDevices, dirtyDevices); + TStringBuilder replicas; replicas << "["; if (!args.Replicas.empty()) { @@ -180,11 +184,12 @@ void TDiskRegistryActor::CompleteAddDisk( migrations << "]"; LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY, - "[%lu] AddDisk success. DiskId=%s Devices=%s Replicas=%s" - " Migrations=%s", + "[%lu] AddDisk success. DiskId=%s Devices=%s DirtyDevices=%s" + " Replicas=%s Migrations=%s", TabletID(), args.DiskId.Quote().c_str(), devices.c_str(), + dirtyDevices.c_str(), replicas.c_str(), migrations.c_str() ); @@ -240,12 +245,70 @@ void TDiskRegistryActor::CompleteAddDisk( response->Record.SetMuteIOErrors(args.MuteIOErrors); } - NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + if (HasError(args.Error) || args.DirtyDevices.empty()) { + NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + } else { + AddPendingAllocation(ctx, args.DiskId, args.RequestInfo); + SecureErase(ctx); + } DestroyBrokenDisks(ctx); NotifyUsers(ctx); } +void TDiskRegistryActor::AddPendingAllocation( + const NActors::TActorContext& ctx, + const TString& diskId, + TRequestInfoPtr requestInfo) +{ + auto& requestInfos = PendingDiskAllocationRequests[diskId]; + + if (requestInfos.size() > Config->GetMaxNonReplicatedDiskAllocationRequests()) { + LOG_WARN(ctx, TBlockStoreComponents::DISK_REGISTRY, + "Too many pending allocation requests (%lu) for disk %s. " + "Reject all requests.", + requestInfos.size(), + diskId.Quote().c_str()); + + ReplyToPendingAllocations(ctx, requestInfos, MakeError(E_REJECTED)); + } + + requestInfos.emplace_back(std::move(requestInfo)); +} + +void TDiskRegistryActor::ReplyToPendingAllocations( + const NActors::TActorContext& ctx, + TVector& requestInfos, + NProto::TError error) +{ + for (auto& requestInfo: requestInfos) { + NCloud::Reply( + ctx, + *requestInfo, + std::make_unique(error)); + } + requestInfos.clear(); +} + +void TDiskRegistryActor::ReplyToPendingAllocations( + const NActors::TActorContext& ctx, + const TString& diskId) +{ + auto it = PendingDiskAllocationRequests.find(diskId); + if (it == PendingDiskAllocationRequests.end()) { + return; + } + + LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY, + "Reply to pending allocation requests. DiskId=%s PendingRequests=%d", + diskId.Quote().c_str(), + static_cast(it->second.size())); + + ReplyToPendingAllocations(ctx, it->second, MakeError(S_OK)); + + PendingDiskAllocationRequests.erase(it); +} + //////////////////////////////////////////////////////////////////////////////// void TDiskRegistryActor::HandleDeallocateDisk( @@ -379,7 +442,7 @@ void TDiskRegistryActor::ReplyToPendingDeallocations( } LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY, - "Reply to pending deallocation requests. DiskId=%s PendingRquests=%d", + "Reply to pending deallocation requests. DiskId=%s PendingRequests=%d", diskId.Quote().c_str(), static_cast(it->second.size())); diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_secure_erase.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_secure_erase.cpp index e65ac54d5d7..34995de5c35 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_secure_erase.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_secure_erase.cpp @@ -279,8 +279,8 @@ void TDiskRegistryActor::ExecuteCleanupDevices( TTxDiskRegistry::TCleanupDevices& args) { TDiskRegistryDatabase db(tx.DB); - args.SyncDeallocatedDisks = - State->MarkDevicesAsClean(ctx.Now(), db, args.Devices); + std::tie(args.SyncAllocatedDisks, args.SyncDeallocatedDisks) = + std::move(State->MarkDevicesAsClean(ctx.Now(), db, args.Devices)); } void TDiskRegistryActor::CompleteCleanupDevices( @@ -293,6 +293,10 @@ void TDiskRegistryActor::CompleteCleanupDevices( for (const auto& diskId: args.SyncDeallocatedDisks) { ReplyToPendingDeallocations(ctx, diskId); } + + for (const auto& diskId: args.SyncAllocatedDisks) { + ReplyToPendingAllocations(ctx, diskId); + } } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp index 0ee39952938..be6d7794adc 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp @@ -749,7 +749,7 @@ void TDiskRegistryState::ProcessDirtyDevices(TVector dirtyDevices) { for (auto&& [uuid, diskId]: dirtyDevices) { if (!diskId.empty()) { - auto error = PendingCleanup.Insert(diskId, std::move(uuid)); + auto error = PendingCleanup.Insert(diskId, std::move(uuid), /*allocation=*/false); if (HasError(error)) { ReportDiskRegistryInsertToPendingCleanupFailed( TStringBuilder() @@ -1299,16 +1299,23 @@ NProto::TError TDiskRegistryState::ReplaceDeviceWithoutDiskStateUpdate( } if (!manual && !deviceReplacementId.empty()) { - auto cleaningDiskId = + auto [allocating, deallocating] = PendingCleanup.FindDiskId(deviceReplacementId); - if (!cleaningDiskId.empty() && cleaningDiskId != diskId) { + if (!allocating || *allocating != diskId) { + allocating = std::nullopt; + } + if (!deallocating || *deallocating != diskId) { + deallocating = std::nullopt; + } + auto owningDisk = allocating ? allocating : deallocating; + if (owningDisk) { return MakeError( E_ARGUMENT, TStringBuilder() << "can't allocate specific device " << deviceReplacementId.Quote() << " for disk " << diskId << " since it is in pending cleanup for disk " - << cleaningDiskId); + << *owningDisk); } } @@ -1414,7 +1421,7 @@ NProto::TError TDiskRegistryState::ReplaceDeviceWithoutDiskStateUpdate( UpdatePlacementGroup(db, diskId, disk, "ReplaceDevice"); UpdateAndReallocateDisk(db, diskId, disk); - error = PendingCleanup.Insert(diskId, deviceId); + error = PendingCleanup.Insert(diskId, deviceId, /*allocation=*/false); if (HasError(error)) { ReportDiskRegistryInsertToPendingCleanupFailed( TStringBuilder() << "An error occurred while replacing device: " @@ -2756,6 +2763,37 @@ NProto::TError TDiskRegistryState::AllocateSimpleDisk( params.BlockSize << " bytes"); } + // check that we can secure erase each dirty allocated device + TVector dirtyDevices; + for (const auto& device: allocatedDevices) { + const auto& uuid = device.GetDeviceUUID(); + if (!IsDirtyDevice(uuid)) { + continue; + } + // if we can't secure erase one of allocated disk's dirty devices, we can't allocate the disk + if (!CanSecureErase(uuid)) { + onError(); + + return MakeError(E_BS_DISK_ALLOCATION_FAILED, TStringBuilder() << + "can't secure erase device " << uuid); + } + dirtyDevices.push_back(uuid); + result->DirtyDevices.push_back(device); + } + + if (dirtyDevices) { + NProto::TError cleanupError = PendingCleanup.Insert(params.DiskId, std::move(dirtyDevices), /*allocation=*/true); + // if we can't secure erase one of allocated disk's dirty devices, we can't allocate the disk + if (HasError(cleanupError)) { + onError(); + + ReportDiskRegistryInsertToPendingCleanupFailed( + TStringBuilder() << "An error occurred while allocating disk: " + << FormatError(cleanupError)); + return cleanupError; + } + } + for (const auto& device: allocatedDevices) { disk.Devices.push_back(device.GetDeviceUUID()); } @@ -2851,7 +2889,7 @@ NProto::TError TDiskRegistryState::DeallocateDisk( JoinSeq(", ", devicesAllowedToBeCleaned).c_str()); auto error = - PendingCleanup.Insert(diskId, std::move(devicesAllowedToBeCleaned)); + PendingCleanup.Insert(diskId, std::move(devicesAllowedToBeCleaned), /*allocation=*/false); if (HasError(error)) { ReportDiskRegistryInsertToPendingCleanupFailed( TStringBuilder() << "An error occurred while deallocating disk: " @@ -2998,6 +3036,11 @@ auto TDiskRegistryState::DeallocateSimpleDisk( db.UpdateDirtyDevice(uuid, diskId); } + auto agents = FindDiskDevicesAgents(disk); + for (const auto* agent: agents) { + DeviceList.UpdateDevices(*agent, DevicePoolConfigs); + } + DeleteAllDeviceMigrations(diskId); DeleteDisk(db, diskId); @@ -3855,16 +3898,20 @@ bool TDiskRegistryState::MarkDeviceAsDirty( return true; } -TDiskRegistryState::TDiskId TDiskRegistryState::MarkDeviceAsClean( +TDiskRegistryState::TOpt2Disk TDiskRegistryState::MarkDeviceAsClean( TInstant now, TDiskRegistryDatabase& db, const TDeviceId& uuid) { - auto ret = MarkDevicesAsClean(now, db, TVector{uuid}); - return ret.empty() ? "" : ret[0]; + auto [alloc, dealloc] = MarkDevicesAsClean(now, db, TVector{uuid}); + return { + alloc.empty() ? std::nullopt : std::make_optional(std::move(alloc[0])), + dealloc.empty() ? std::nullopt + : std::make_optional(std::move(dealloc[0]))}; } -TVector TDiskRegistryState::MarkDevicesAsClean( +std::pair +TDiskRegistryState::MarkDevicesAsClean( TInstant now, TDiskRegistryDatabase& db, const TVector& uuids) @@ -3878,14 +3925,19 @@ TVector TDiskRegistryState::MarkDevicesAsClean( } } - TVector ret; + TAllocatedDisksList allocatedDisks; + TDellocatedDisksList dellocatedDisks; for (const auto& uuid: TryUpdateDevices(now, db, uuids)) { - if (auto diskId = PendingCleanup.EraseDevice(uuid); !diskId.empty()) { - ret.push_back(std::move(diskId)); + auto [allocatedDisk, deallocatedDisk] = PendingCleanup.EraseDevice(uuid); + if (allocatedDisk) { + allocatedDisks.push_back(std::move(*allocatedDisk)); + } + if (deallocatedDisk) { + dellocatedDisks.push_back(std::move(*deallocatedDisk)); } } - return ret; + return {std::move(allocatedDisks), std::move(dellocatedDisks)}; } bool TDiskRegistryState::TryUpdateDevice( @@ -4811,7 +4863,7 @@ void TDiskRegistryState::RemoveFinishedMigrations( DeviceList.ReleaseDevice(m.DeviceId); db.UpdateDirtyDevice(m.DeviceId, diskId); - auto error = PendingCleanup.Insert(diskId, m.DeviceId); + auto error = PendingCleanup.Insert(diskId, m.DeviceId, /*allocation=*/false); if (HasError(error)) { ReportDiskRegistryInsertToPendingCleanupFailed( TStringBuilder() @@ -5072,8 +5124,9 @@ bool TDiskRegistryState::HasDependentSsdDisks( continue; } + auto [allocating, deallocating] = PendingCleanup.FindDiskId(d.GetDeviceUUID()); if (d.GetPoolKind() == NProto::DEVICE_POOL_KIND_LOCAL && - PendingCleanup.FindDiskId(d.GetDeviceUUID())) + (allocating || deallocating)) { return true; } @@ -5598,6 +5651,24 @@ auto TDiskRegistryState::FindDeviceLocation(const TDeviceId& deviceId) const return const_cast(this)->FindDeviceLocation(deviceId); } +auto TDiskRegistryState::FindDiskDevicesAgents(const TDiskState& disk) const + -> std::set +{ + std::set diskAgents; + for (const auto& uuid: disk.Devices) { + // TODO: handle when not found + diskAgents.insert(DeviceList.FindAgentId(uuid)); + } + + std::set agents; + for (const auto& agentId: diskAgents) { + // TODO: handle when not found + agents.insert(AgentList.FindAgent(agentId)); + } + + return std::move(agents); +} + auto TDiskRegistryState::FindDeviceLocation(const TDeviceId& deviceId) -> std::pair { @@ -6354,7 +6425,9 @@ NProto::TDiskRegistryStateBackup TDiskRegistryState::BackupState() const transform(GetDirtyDevices(), backup.MutableDirtyDevices(), [this] (auto& x) { NProto::TDiskRegistryStateBackup::TDirtyDevice dd; dd.SetId(x.GetDeviceUUID()); - dd.SetDiskId(PendingCleanup.FindDiskId(x.GetDeviceUUID())); + auto [allocating, deallocating] = PendingCleanup.FindDiskId(x.GetDeviceUUID()); // TODO: need to backup + Y_UNUSED(allocating); + dd.SetDiskId(*deallocating); return dd; }); diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h index 3d806eb2189..e8a322d8f3f 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h @@ -281,6 +281,15 @@ class TDiskRegistryState using TCheckpoints = THashMap; using TPlacementGroups = THashMap; + using TAllocatingDiskId = TDiskRegistryState::TDiskId; + using TDeallocatingDiskId = TDiskRegistryState::TDiskId; + using TAllocatedDisksList = TVector; + using TDellocatedDisksList = TVector; + + template + using TOpt2 = std::pair, std::optional>; + using TOpt2Disk = TOpt2; + private: TLog Log; @@ -392,6 +401,7 @@ class TDiskRegistryState struct TAllocateDiskResult { TVector Devices; + TVector DirtyDevices; TVector Migrations; TVector> Replicas; TVector DeviceReplacementIds; @@ -503,16 +513,17 @@ class TDiskRegistryState /// Mark selected device as clean and remove it /// from lists of suspended/dirty/pending cleanup devices - /// @return disk id where selected device was allocated - TDiskId MarkDeviceAsClean( + /// @return allocated/deallocated disk id of where selected device was allocated/deallocated + TOpt2Disk MarkDeviceAsClean( TInstant now, TDiskRegistryDatabase& db, const TDeviceId& uuid); /// Mark selected devices as clean and remove them /// from lists of suspended/dirty/pending cleanup devices - /// @return vector of disk ids where selected devices were allocated - TVector MarkDevicesAsClean( + /// @return vector of allocated/deallocated disk ids where selected devices were allocated/deallocated + std::pair + MarkDevicesAsClean( TInstant now, TDiskRegistryDatabase& db, const TVector& uuids); @@ -933,6 +944,9 @@ class TDiskRegistryState auto FindDeviceLocation(const TDeviceId& uuid) const -> std::pair; + auto FindDiskDevicesAgents(const TDiskState& disk) const + -> std::set; + TVector FindDevices( const TString& agentId, const TString& path) const; diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h index 43619484ef4..f37cb008cb3 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h @@ -127,6 +127,7 @@ struct TTxDiskRegistry NProto::TError Error; TVector Devices; + TVector DirtyDevices; TVector DeviceMigrations; TVector> Replicas; TVector DeviceReplacementUUIDs; @@ -305,6 +306,7 @@ struct TTxDiskRegistry const TRequestInfoPtr RequestInfo; const TVector Devices; + TVector SyncAllocatedDisks; TVector SyncDeallocatedDisks; explicit TCleanupDevices( @@ -316,6 +318,7 @@ struct TTxDiskRegistry void Clear() { + SyncAllocatedDisks.clear(); SyncDeallocatedDisks.clear(); } }; diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp index 4cbe6b32257..02e1948b947 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp @@ -130,6 +130,7 @@ void TDeviceList::UpdateDevices( auto& nodeDevices = NodeDevices[agent.GetNodeId()]; nodeDevices.FreeDevices.clear(); + nodeDevices.FreeDevicesIncludingDirty.clear(); nodeDevices.Rack.clear(); nodeDevices.TotalSize = 0; @@ -187,6 +188,15 @@ void TDeviceList::UpdateDevices( nodeDevices.FreeDevices.push_back(device); } + const bool isFreeIncludingDirty = + DevicesAllocationAllowed(device.GetPoolKind(), agent.GetState()) && + device.GetState() == NProto::DEVICE_STATE_ONLINE && + !AllocatedDevices.contains(uuid) && + !SuspendedDevices.contains(uuid); + if (isFreeIncludingDirty) { + nodeDevices.FreeDevicesIncludingDirty.push_back(device); + } + auto& poolNames = PoolKind2PoolNames[device.GetPoolKind()]; auto it = Find(poolNames, device.GetPoolName()); if (it == poolNames.end()) { @@ -197,6 +207,7 @@ void TDeviceList::UpdateDevices( } SortBy(nodeDevices.FreeDevices, TBySortQueryKey()); + SortBy(nodeDevices.FreeDevicesIncludingDirty, TBySortQueryKey()); } void TDeviceList::RemoveDevices(const NProto::TAgentConfig& agent) @@ -396,19 +407,20 @@ bool TDeviceList::ValidateAllocationQuery( return false; } - const TNodeDevices& nodeDevices = nodeItr->second; + TNodeDevices& nodeDevices = nodeItr->second; if (query.ForbiddenRacks.contains(nodeDevices.Rack)) { return false; } + const auto& availableDevices = GetAvailableDevices(&nodeDevices, query); const auto freeDeviceItr = FindIf( - nodeDevices.FreeDevices, + availableDevices, [&targetDeviceId] (const NProto::TDeviceConfig& device) { return device.GetDeviceUUID() == targetDeviceId; }); - if (freeDeviceItr == nodeDevices.FreeDevices.end()) { + if (freeDeviceItr == availableDevices.end()) { return false; } @@ -468,7 +480,7 @@ auto TDeviceList::SelectRacks( const auto* nodeDevices = NodeDevices.FindPtr(node.NodeId); Y_ABORT_UNLESS(nodeDevices); - auto r = FindDeviceRange(query, poolName, nodeDevices->FreeDevices); + auto r = FindDeviceRange(query, poolName, GetAvailableDevices(nodeDevices, query)); node.OccupiedSpace = nodeDevices->TotalSize; rackTotalSpace += nodeDevices->TotalSize; @@ -526,6 +538,24 @@ auto TDeviceList::SelectRacks( return result; } +TVector& TDeviceList::GetAvailableDevices( + TNodeDevices* nodeDevices, + const TAllocationQuery& query) const { + const bool allowDirtyDevices = + query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL && + true; // TODO: query.allowDirtyLocalDevices; + return allowDirtyDevices ? nodeDevices->FreeDevicesIncludingDirty : nodeDevices->FreeDevices; +} + +const TVector& TDeviceList::GetAvailableDevices( + const TNodeDevices* nodeDevices, + const TAllocationQuery& query) const { + const bool allowDirtyDevices = + query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL && + true; // TODO: query.allowDirtyLocalDevices; + return allowDirtyDevices ? nodeDevices->FreeDevicesIncludingDirty : nodeDevices->FreeDevices; +} + TVector TDeviceList::CollectDevices( const TAllocationQuery& query, const TString& poolName) @@ -539,13 +569,12 @@ TVector TDeviceList::CollectDevices( for (const auto& rack: SelectRacks(query, poolName)) { for (const auto& node: rack.Nodes) { - const auto* nodeDevices = NodeDevices.FindPtr(node.NodeId); + auto* nodeDevices = NodeDevices.FindPtr(node.NodeId); Y_ABORT_UNLESS(nodeDevices); // finding free devices belonging to this node that match our // query - auto [begin, end] = - FindDeviceRange(query, poolName, nodeDevices->FreeDevices); + auto [begin, end] = FindDeviceRange(query, poolName, GetAvailableDevices(nodeDevices, query)); using TDeviceIter = decltype(begin); struct TDeviceInfo @@ -611,7 +640,6 @@ TVector TDeviceList::CollectDevices( if (query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL) { // here we go again - ranges.clear(); totalSize = query.GetTotalByteCount(); } @@ -670,9 +698,9 @@ TVector TDeviceList::AllocateDevices( }); auto& nodeDevices = NodeDevices[nodeId]; - + auto& availableDevices = GetAvailableDevices(&nodeDevices, query); for (const auto& arange: aranges) { - nodeDevices.FreeDevices.erase(arange.first, arange.second); + availableDevices.erase(arange.first, arange.second); } } @@ -716,17 +744,22 @@ void TDeviceList::MarkDeviceAsDirty(const TDeviceId& id) void TDeviceList::RemoveDeviceFromFreeList(const TDeviceId& id) { auto nodeId = FindNodeId(id); - + const auto& predicate = [&](const auto& x) + { + return x.GetDeviceUUID() == id; + }; if (nodeId) { auto& devices = NodeDevices[nodeId].FreeDevices; + auto& devicesIncludingDirty = + NodeDevices[nodeId].FreeDevicesIncludingDirty; - auto it = FindIf(devices, [&] (const auto& x) { - return x.GetDeviceUUID() == id; - }); - - if (it != devices.end()) { + if (auto* it = FindIf(devices, predicate); it != devices.end()) { devices.erase(it); } + + if (auto* it = FindIf(devicesIncludingDirty, predicate); it != devicesIncludingDirty.end()) { + devicesIncludingDirty.erase(it); + } } } diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h index a3c4bbdb444..a94356685d1 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h @@ -47,6 +47,8 @@ class TDeviceList // sorted by {PoolKind, BlockSize} TVector FreeDevices; + // sorted by {PoolKind, BlockSize} + TVector FreeDevicesIncludingDirty; ui64 TotalSize = 0; }; @@ -182,6 +184,13 @@ class TDeviceList const TDeviceId& id, const NProto::TDeviceConfig& device); void RemoveFromAllDevices(const TDeviceId& id); + // TODO: this maybe class method of nodeDevices + TVector& GetAvailableDevices( + TNodeDevices* nodeDevices, + const TAllocationQuery& query) const; + const TVector& GetAvailableDevices( + const TNodeDevices* nodeDevices, + const TAllocationQuery& query) const; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.cpp b/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.cpp index d96bdf1211f..46aadd38ba3 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.cpp @@ -10,9 +10,10 @@ namespace NCloud::NBlockStore::NStorage { NProto::TError TPendingCleanup::Insert( const TString& diskId, - TVector uuids) + TVector uuids, + bool allocation) { - auto error = ValidateInsertion(diskId, uuids); + auto error = ValidateInsertion(diskId, uuids, allocation); if (HasError(error)) { return error; } @@ -21,21 +22,22 @@ NProto::TError TPendingCleanup::Insert( for (auto& uuid: uuids) { Y_DEBUG_ABORT_UNLESS(!uuid.empty()); - auto [_, success] = DeviceToDisk.emplace(std::move(uuid), diskId); - Y_DEBUG_ABORT_UNLESS(success); + auto& [allocatingDiskId, deallocatingDiskId] = DeviceToDisk[uuid]; + (allocation ? allocatingDiskId : deallocatingDiskId) = diskId; } return {}; } -NProto::TError TPendingCleanup::Insert(const TString& diskId, TString uuid) +NProto::TError TPendingCleanup::Insert(const TString& diskId, TString uuid, bool allocation) { - return Insert(diskId, TVector{std::move(uuid)}); + return Insert(diskId, TVector{std::move(uuid)}, allocation); } [[nodiscard]] NProto::TError TPendingCleanup::ValidateInsertion( const TString& diskId, - const TVector& uuids) const + const TVector& uuids, + bool allocation) const { if (diskId.empty() || uuids.empty()) { return MakeError( @@ -55,7 +57,11 @@ NProto::TError TPendingCleanup::Insert(const TString& diskId, TString uuid) << JoinStrings(uuids, ", ") << "]"); } - const auto* foundDiskId = DeviceToDisk.FindPtr(uuid); + const auto* foundDisks = DeviceToDisk.FindPtr(uuid); + if (!foundDisks) { + continue; + } + const auto& foundDiskId = allocation ? foundDisks->first : foundDisks->second; if (foundDiskId) { return MakeError( E_ARGUMENT, @@ -70,27 +76,39 @@ NProto::TError TPendingCleanup::Insert(const TString& diskId, TString uuid) return {}; } -TString TPendingCleanup::EraseDevice(const TString& uuid) +TPendingCleanup::TOpt2Disk TPendingCleanup::EraseDevice(const TString& uuid) { auto it = DeviceToDisk.find(uuid); if (it == DeviceToDisk.end()) { return {}; } - auto diskId = std::move(it->second); - DeviceToDisk.erase(it); + auto& [allocatingDisk, deallocatingDisk] = it->second; + TOpt2Disk ret; - Y_DEBUG_ABORT_UNLESS(DiskToDeviceCount.contains(diskId)); - if (--DiskToDeviceCount[diskId] > 0) { - return {}; + Y_DEBUG_ABORT_UNLESS(allocatingDisk || deallocatingDisk); + Y_DEBUG_ABORT_UNLESS(!allocatingDisk || DiskToDeviceCount.contains(*allocatingDisk)); + Y_DEBUG_ABORT_UNLESS(!deallocatingDisk || DiskToDeviceCount.contains(*deallocatingDisk)); + if (allocatingDisk && --DiskToDeviceCount[*allocatingDisk] <= 0) { + DiskToDeviceCount.erase(*allocatingDisk); + ret.first = std::move(allocatingDisk); + allocatingDisk.reset(); + } + + if (deallocatingDisk && --DiskToDeviceCount[*deallocatingDisk] <= 0) { + DiskToDeviceCount.erase(*deallocatingDisk); + ret.second = std::move(deallocatingDisk); + deallocatingDisk.reset(); } - DiskToDeviceCount.erase(diskId); + if (!allocatingDisk && !deallocatingDisk) { + DeviceToDisk.erase(it); + } - return diskId; + return ret; } -TString TPendingCleanup::FindDiskId(const TString& uuid) const +TPendingCleanup::TOpt2Disk TPendingCleanup::FindDiskId(const TString& uuid) const { auto it = DeviceToDisk.find(uuid); if (it == DeviceToDisk.end()) { @@ -106,9 +124,31 @@ bool TPendingCleanup::EraseDisk(const TString& diskId) return false; } - EraseNodesIf(DeviceToDisk, [&] (auto& x) { - return x.second == diskId; - }); + bool isAllocatingDisk = false; + bool isDeallocatingDisk = false; + auto it = std::find_if( + DeviceToDisk.begin(), + DeviceToDisk.end(), + [&](const auto& elem) + { + auto& [allocating, deallocating] = elem.second; + isAllocatingDisk = allocating && *allocating == diskId; + isDeallocatingDisk = deallocating && *deallocating == diskId; + return isAllocatingDisk || isDeallocatingDisk; + }); + + Y_ABORT_IF(isAllocatingDisk && isDeallocatingDisk); + + auto& [allocating, deallocating] = it->second; + if (isAllocatingDisk) { + allocating.reset(); + } else { + deallocating.reset(); + } + + if (!allocating && !deallocating) { + DeviceToDisk.erase(it); + } return true; } diff --git a/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.h b/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.h index 841184ce555..8be7fe31c82 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.h +++ b/cloud/blockstore/libs/storage/disk_registry/model/pending_cleanup.h @@ -15,27 +15,42 @@ namespace NCloud::NBlockStore::NStorage { class TPendingCleanup { +public: + using TAllocatingDiskId = TString; + using TDeallocatingDiskId = TString; + + template + using TOpt2 = std::pair, std::optional>; + using TOpt2Disk = TOpt2; + private: THashMap DiskToDeviceCount; - THashMap DeviceToDisk; + THashMap DeviceToDisk; public: [[nodiscard]] NProto::TError Insert( const TString& diskId, - TVector uuids); - [[nodiscard]] NProto::TError Insert(const TString& diskId, TString uuid); - - TString EraseDevice(const TString& uuid); + TVector uuids, + bool allocation = false); + [[nodiscard]] NProto::TError + Insert(const TString& diskId, TString uuid, bool allocation = false); + + /// Removes the device from deallocating disk (for pending deallocation + /// disks) + /// @return diskId of allocated/deallocated disk if the device with the + /// given UUID was the last to complete its allocation/deallocation + TOpt2Disk EraseDevice(const TString& uuid); bool EraseDisk(const TString& diskId); - [[nodiscard]] TString FindDiskId(const TString& uuid) const; + [[nodiscard]] TOpt2Disk FindDiskId(const TString& uuid) const; [[nodiscard]] bool IsEmpty() const; [[nodiscard]] bool Contains(const TString& diskId) const; private: [[nodiscard]] NProto::TError ValidateInsertion( const TString& diskId, - const TVector& uuids) const; + const TVector& uuids, + bool allocation) const; }; } // namespace NCloud::NBlockStore::NStorage