diff --git a/cloud/filestore/libs/diagnostics/critical_events.h b/cloud/filestore/libs/diagnostics/critical_events.h index 5b9b794cd3b..32042b326b2 100644 --- a/cloud/filestore/libs/diagnostics/critical_events.h +++ b/cloud/filestore/libs/diagnostics/critical_events.h @@ -20,6 +20,7 @@ namespace NCloud::NFileStore{ xxx(NodeNotFoundInFollower) \ xxx(NotEnoughResultsInGetNodeAttrBatchResponses) \ xxx(AsyncDestroyHandleFailed) \ + xxx(DuplicateRequestId) \ // FILESTORE_CRITICAL_EVENTS #define FILESTORE_IMPOSSIBLE_EVENTS(xxx) \ diff --git a/cloud/filestore/libs/storage/tablet/session.h b/cloud/filestore/libs/storage/tablet/session.h index 38a6e4a5241..cf6b233491f 100644 --- a/cloud/filestore/libs/storage/tablet/session.h +++ b/cloud/filestore/libs/storage/tablet/session.h @@ -58,11 +58,12 @@ using TSessionLockMultiMap = THashMultiMap; struct TDupCacheEntry: NProto::TDupCacheEntry { - bool Commited = false; + bool Committed = false; + bool Dropped = false; - TDupCacheEntry(const NProto::TDupCacheEntry& proto, bool commited) + TDupCacheEntry(const NProto::TDupCacheEntry& proto, bool committed) : NProto::TDupCacheEntry(proto) - , Commited(commited) + , Committed(committed) {} }; @@ -112,7 +113,7 @@ struct TSession return SubSessions.IsValid(); } - bool HasSeqNo(ui64 seqNo) + bool HasSeqNo(ui64 seqNo) const { return SubSessions.HasSeqNo(seqNo); } @@ -154,7 +155,7 @@ struct TSession ui64 GenerateDupCacheEntryId() { - return LastDupCacheEntryId++; + return ++LastDupCacheEntryId; } void LoadDupCacheEntry(NProto::TDupCacheEntry entry) @@ -177,6 +178,21 @@ struct TSession return nullptr; } + void DropDupEntry(ui64 requestId) + { + if (!requestId) { + return; + } + + auto it = DupCache.find(requestId); + if (it == DupCache.end()) { + return; + } + + it->second->Dropped = true; + DupCache.erase(it); + } + TDupCacheEntry* AccessDupEntry(ui64 requestId) { if (!requestId) { @@ -191,12 +207,12 @@ struct TSession return nullptr; } - void AddDupCacheEntry(NProto::TDupCacheEntry proto, bool commited) + void AddDupCacheEntry(NProto::TDupCacheEntry proto, bool committed) { Y_ABORT_UNLESS(proto.GetRequestId()); Y_ABORT_UNLESS(proto.GetEntryId()); - DupCacheEntries.emplace_back(std::move(proto), commited); + DupCacheEntries.emplace_back(std::move(proto), committed); auto& entry = DupCacheEntries.back(); auto [_, inserted] = DupCache.emplace(entry.GetRequestId(), &entry); @@ -206,7 +222,7 @@ struct TSession void CommitDupCacheEntry(ui64 requestId) { if (auto it = DupCache.find(requestId); it != DupCache.end()) { - it->second->Commited = true; + it->second->Committed = true; } } @@ -217,7 +233,8 @@ struct TSession } auto entry = DupCacheEntries.front(); - DupCache.erase(entry.GetRequestId()); + const auto erased = DupCache.erase(entry.GetRequestId()); + Y_DEBUG_ABORT_UNLESS(erased || entry.Dropped); DupCacheEntries.pop_front(); return entry.GetEntryId(); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp index 8128e20f49b..395890d4e40 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp @@ -58,10 +58,23 @@ void TIndexTabletActor::HandleCreateHandle( } auto* msg = ev->Get(); - if (const auto* e = session->LookupDupEntry(GetRequestId(msg->Record))) { + const auto requestId = GetRequestId(msg->Record); + if (const auto* e = session->LookupDupEntry(requestId)) { auto response = std::make_unique(); GetDupCacheEntry(e, response->Record); - return NCloud::Reply(ctx, *ev, std::move(response)); + // sometimes bugged clients send us duplicate request ids + // see #2033 + if (msg->Record.GetName().Empty() && msg->Record.GetNodeId() + != response->Record.GetNodeAttr().GetId()) + { + ReportDuplicateRequestId(TStringBuilder() << "CreateHandle response" + << " for different NodeId found for RequestId=" << requestId + << ": " << response->Record.GetNodeAttr().GetId() + << " != " << msg->Record.GetNodeId()); + session->DropDupEntry(requestId); + } else { + return NCloud::Reply(ctx, *ev, std::move(response)); + } } if (msg->Record.GetFollowerFileSystemId()) { diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_createnode.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_createnode.cpp index 6ba2cd0729e..ab5a2586521 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_createnode.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_createnode.cpp @@ -295,6 +295,7 @@ void TIndexTabletActor::HandleCreateNode( GetDupCacheEntry(e, response->Record); if (response->Record.GetNode().GetId() == 0) { // it's an external node which is not yet created in follower + // this check is needed for the case of leader reboot *response->Record.MutableError() = MakeError( E_REJECTED, "node not yet created in follower"); diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_sessions.cpp b/cloud/filestore/libs/storage/tablet/tablet_state_sessions.cpp index 742f62b8911..a6240496978 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_sessions.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_state_sessions.cpp @@ -844,14 +844,15 @@ void TIndexTabletState::GetDupCacheEntry( \ const TDupCacheEntry* entry, \ NProto::T##name##Response& response) \ { \ - if (entry->Commited && entry->Has##name()) { \ + if (entry->Committed && entry->Has##name()) { \ response = entry->Get##name(); \ - } else if (!entry->Commited) { \ + } else if (!entry->Committed) { \ *response.MutableError() = ErrorDuplicate(); \ } else if (!entry->Has##name()) { \ *response.MutableError() = MakeError( \ E_ARGUMENT, TStringBuilder() << "invalid request dup cache type: " \ << entry->ShortUtf8DebugString().Quote()); \ + ReportDuplicateRequestId(response.GetError().GetMessage()); \ } \ } \ // FILESTORE_IMPLEMENT_DUPCACHE diff --git a/cloud/filestore/libs/storage/tablet/tablet_ut_nodes.cpp b/cloud/filestore/libs/storage/tablet/tablet_ut_nodes.cpp index f7f9bdd37b6..c1b09dec138 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_ut_nodes.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_ut_nodes.cpp @@ -1081,14 +1081,14 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Nodes) auto request = tablet.CreateCreateNodeRequest(TCreateNodeArgs::File(RootNodeId, "xxx")); request->Record.MutableHeaders()->SetRequestId(reqId); - return std::move(request); + return request; }; auto createOther = [&] (ui64 reqId) { auto request = tablet.CreateUnlinkNodeRequest(RootNodeId, "xxx", false); request->Record.MutableHeaders()->SetRequestId(reqId); - return std::move(request); + return request; }; ui64 nodeId = InvalidNodeId; @@ -1104,11 +1104,14 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Nodes) tablet.SendRequest(createOther(100500)); { auto response = tablet.RecvUnlinkNodeResponse(); - UNIT_ASSERT(HasError(response->Record.GetError())); + UNIT_ASSERT_VALUES_EQUAL_C( + E_ARGUMENT, + response->Record.GetError().GetCode(), + response->Record.GetError().GetMessage()); } } - Y_UNIT_TEST(ShouldWaitForDupRequestToBeCommited) + Y_UNIT_TEST(ShouldWaitForDupRequestToBeCommitted) { TTestEnv env; env.CreateSubDomain("nfs"); @@ -1601,6 +1604,68 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Nodes) UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeletionMarkersCount()); } } + + Y_UNIT_TEST(ShouldIdentifyRequestIdCollision) + { + TTestEnv env; + env.CreateSubDomain("nfs"); + + ui32 nodeIdx = env.CreateNode("nfs"); + ui64 tabletId = env.BootIndexTablet(nodeIdx); + + TIndexTabletClient tablet(env.GetRuntime(), nodeIdx, tabletId); + tablet.InitSession("client", "session"); + + auto createRequest = [&] (ui64 reqId, ui64 nodeId) { + auto request = tablet.CreateCreateHandleRequest( + nodeId, TCreateHandleArgs::RDWR); + request->Record.MutableHeaders()->SetRequestId(reqId); + + return request; + }; + + const TString name1 = "file1"; + const TString name2 = "file2"; + + const auto nodeId1 = tablet.CreateNode(TCreateNodeArgs::File( + RootNodeId, + name1))->Record.GetNode().GetId(); + const auto nodeId2 = tablet.CreateNode(TCreateNodeArgs::File( + RootNodeId, + name2))->Record.GetNode().GetId(); + + UNIT_ASSERT_VALUES_UNEQUAL(InvalidNodeId, nodeId1); + UNIT_ASSERT_VALUES_UNEQUAL(InvalidNodeId, nodeId2); + UNIT_ASSERT_VALUES_UNEQUAL(nodeId1, nodeId2); + + const ui64 requestId = 100500; + + tablet.SendRequest(createRequest(requestId, nodeId1)); + { + auto response = tablet.RecvCreateHandleResponse(); + UNIT_ASSERT_VALUES_EQUAL_C( + S_OK, + response->Record.GetError().GetCode(), + response->Record.GetError().GetMessage()); + + UNIT_ASSERT_VALUES_EQUAL( + nodeId1, + response->Record.GetNodeAttr().GetId()); + } + + tablet.SendRequest(createRequest(requestId, nodeId2)); + { + auto response = tablet.RecvCreateHandleResponse(); + UNIT_ASSERT_VALUES_EQUAL_C( + S_OK, + response->Record.GetError().GetCode(), + response->Record.GetError().GetMessage()); + + UNIT_ASSERT_VALUES_EQUAL( + nodeId2, + response->Record.GetNodeAttr().GetId()); + } + } } } // namespace NCloud::NFileStore::NStorage