diff --git a/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.cpp b/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.cpp index e29e228..07764b7 100644 --- a/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.cpp +++ b/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.cpp @@ -528,7 +528,7 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListImmediateAppendComm } void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - int this_action_new_node_id = invalidDagID; + uint32_t this_action_new_node_id = invalidDagID; // Check if user is using invalid events, hint if it doesn't exist in eventToDagID if (eventToDagID.find(hSignalEvent) == eventToDagID.end()) { @@ -548,19 +548,15 @@ void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::strin // This event already exists in the DAG. Get the DAG node ID. // For example when there is indeed a deadlock it would have already been created. this_action_new_node_id = it->second; - - // std::cout << "\tFound event in eventToDagID: hSignalEvent = " << hSignalEvent << ", this_action_new_node_id = " << this_action_new_node_id << std::endl; } } if (this_action_new_node_id == invalidDagID) { // Create node in DAG - this_action_new_node_id = addNodeInDag(); // nextDagID++; + this_action_new_node_id = addNodeInDag(); // Now we know where the hSignalEvent points from/out in the DAG. Update the eventToDagID map. eventToDagID[hSignalEvent] = this_action_new_node_id; - - // std::cout << "\tUpdated eventToDagID: hSignalEvent = " << hSignalEvent << ", this_action_new_node_id = " << this_action_new_node_id << std::endl; } // Add this action to the actionToDagID map @@ -582,35 +578,40 @@ void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::strin for (uint32_t i = 0; i < numWaitEvents; i++) { auto it = eventToDagID.find(phWaitEvents[i]); - int dagID = it->second; + uint32_t dagID = it->second; if (dagID == invalidDagID) { // Create a new node in the DAG for this wait event. That action will be created some time in the future. - dagID = addNodeInDag(); // nextDagID++; + dagID = addNodeInDag(); it->second = dagID; } auto getActionDetails = [&](int dagID) -> std::string { - return (dagIDToAction.find(dagID) != dagIDToAction.end()) ? dagIDToAction[dagID].first : "PLACEHOLDER"; + auto actionIt = dagIDToAction.find(dagID); + return (actionIt != dagIDToAction.end()) ? actionIt->second.first : "PLACEHOLDER"; }; - std::string fromAction = getActionDetails(dagID); - std::string toAction = getActionDetails(this_action_new_node_id); - if (!addEdgeInDag(dagID, this_action_new_node_id)) { + std::string fromAction = getActionDetails(dagID); + std::string toAction = getActionDetails(this_action_new_node_id); - auto path = dag.PathDagIDs(this_action_new_node_id, dagID, 5); - - std::string spacePrefix = ""; std::cerr << "Warning: There may be a potential event deadlock!\n"; - std::cerr << "Adding the following dependency would create a cycle in the DAG:\n\tFrom:" << fromAction << "\n\tTo:" << toAction << "\n"; + std::cerr << "Adding the following dependency would create a cycle in the DAG:\n\tFrom: " << fromAction << "\n\tTo: " << toAction << "\n"; std::cerr << "There is already a path:\n"; + + constexpr uint32_t maxPathLength = 15; + auto path = dag.PathDagIDs(this_action_new_node_id, dagID, maxPathLength); auto dagIDsInPath = path.first; std::cerr << getActionDetails(dagIDsInPath[0]) << "\n"; - for (uint32_t i = 1; i < dagIDsInPath.size(); i++) { + std::string spacePrefix = ""; + for (uint32_t j = 1; j < dagIDsInPath.size(); j++) { std::cerr << spacePrefix << "|\n" - << spacePrefix << "-> " << getActionDetails(dagIDsInPath[i]) << "\n"; + << spacePrefix << "-> " << getActionDetails(dagIDsInPath[j]) << "\n"; spacePrefix += " "; } + if (path.second) { + std::cerr << spacePrefix << "|\n" + << spacePrefix << "-> ...\n"; + } } } } diff --git a/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.h b/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.h index 8d94bac..664cf6f 100644 --- a/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.h +++ b/source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.h @@ -14,11 +14,12 @@ #include "ze_api.h" #include "ze_validation_layer.h" +#include #include namespace validation_layer { -constexpr int invalidDagID = -1; +constexpr uint32_t invalidDagID = std::numeric_limits::max(); using actionAndSignalEvent = std::pair; class __zedlllocal eventsDeadlockChecker : public validationChecker { @@ -48,25 +49,25 @@ class __zedlllocal eventsDeadlockChecker : public validationChecker { ze_result_t zeCommandListAppendSignalEventPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent) override; ze_result_t zeCommandListAppendWaitOnEventsPrologue(ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t *phEvents) override; ze_result_t zeEventHostSignalPrologue(ze_event_handle_t hEvent) override; - ze_result_t zeCommandListAppendEventResetPrologue( ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent ) override; - ze_result_t zeEventHostResetPrologue( ze_event_handle_t hEvent ) override; - ze_result_t zeCommandListAppendQueryKernelTimestampsPrologue( ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t* phEvents, void* dstptr, const size_t* pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListAppendLaunchKernelPrologue( ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t* pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListAppendLaunchCooperativeKernelPrologue( ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t* pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListAppendLaunchKernelIndirectPrologue( ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t* pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListAppendLaunchMultipleKernelsIndirectPrologue( ze_command_list_handle_t hCommandList, uint32_t numKernels, ze_kernel_handle_t* phKernels, const uint32_t* pCountBuffer, const ze_group_count_t* pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListUpdateMutableCommandSignalEventExpPrologue( ze_command_list_handle_t hCommandList, uint64_t commandId, ze_event_handle_t hSignalEvent ) override; - ze_result_t zeCommandListUpdateMutableCommandWaitEventsExpPrologue( ze_command_list_handle_t hCommandList, uint64_t commandId, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListAppendImageCopyToMemoryExtPrologue( ze_command_list_handle_t hCommandList, void* dstptr, ze_image_handle_t hSrcImage, const ze_image_region_t* pSrcRegion, uint32_t destRowPitch, uint32_t destSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListAppendImageCopyFromMemoryExtPrologue( ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, const void* srcptr, const ze_image_region_t* pDstRegion, uint32_t srcRowPitch, uint32_t srcSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; - ze_result_t zeCommandListImmediateAppendCommandListsExpPrologue( ze_command_list_handle_t hCommandListImmediate, uint32_t numCommandLists, ze_command_list_handle_t* phCommandLists, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override; + ze_result_t zeCommandListAppendEventResetPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent) override; + ze_result_t zeEventHostResetPrologue(ze_event_handle_t hEvent) override; + ze_result_t zeCommandListAppendQueryKernelTimestampsPrologue(ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t *phEvents, void *dstptr, const size_t *pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListAppendLaunchKernelPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListAppendLaunchCooperativeKernelPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListAppendLaunchKernelIndirectPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListAppendLaunchMultipleKernelsIndirectPrologue(ze_command_list_handle_t hCommandList, uint32_t numKernels, ze_kernel_handle_t *phKernels, const uint32_t *pCountBuffer, const ze_group_count_t *pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListUpdateMutableCommandSignalEventExpPrologue(ze_command_list_handle_t hCommandList, uint64_t commandId, ze_event_handle_t hSignalEvent) override; + ze_result_t zeCommandListUpdateMutableCommandWaitEventsExpPrologue(ze_command_list_handle_t hCommandList, uint64_t commandId, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListAppendImageCopyToMemoryExtPrologue(ze_command_list_handle_t hCommandList, void *dstptr, ze_image_handle_t hSrcImage, const ze_image_region_t *pSrcRegion, uint32_t destRowPitch, uint32_t destSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListAppendImageCopyFromMemoryExtPrologue(ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, const void *srcptr, const ze_image_region_t *pDstRegion, uint32_t srcRowPitch, uint32_t srcSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t zeCommandListImmediateAppendCommandListsExpPrologue(ze_command_list_handle_t hCommandListImmediate, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; private: // Add node in the DAG and get its ID. int addNodeInDag() { return dag.NewNode(); } // Add edge in the DAG. - bool addEdgeInDag(int x, int y) { return dag.InsertEdge(x, y); } + bool addEdgeInDag(uint32_t x, uint32_t y) { return dag.InsertEdge(x, y); } // Inserts new actions and events in the DAG based on the ze. void checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); @@ -75,10 +76,10 @@ class __zedlllocal eventsDeadlockChecker : public validationChecker { xla::GraphCycles dag; // events point from/out to a DAG node. This map stores the DAG ID for each event (if there is one). - std::unordered_map eventToDagID; + std::unordered_map eventToDagID; // This map acts as a bi-directional map to eventToDagID. It maps DAG ID to a pair containing action description and signal event. - std::unordered_map dagIDToAction; + std::unordered_map dagIDToAction; }; class ZESeventsDeadlockChecker : public ZESValidationEntryPoints {}; class ZETeventsDeadlockChecker : public ZETValidationEntryPoints {}; diff --git a/third_party/xla/graphcycles.cc b/third_party/xla/graphcycles.cc index 8e010c9..231eab6 100755 --- a/third_party/xla/graphcycles.cc +++ b/third_party/xla/graphcycles.cc @@ -389,8 +389,11 @@ std::pair, bool> GraphCycles::PathDagIDs(int x, int y, cons int np = FindPath(x, y, kPathSize, path.data()); bool overflow = np > max_path_len; - for (int i = 0; i < np; i++) { - path.push_back(path[i]); + for (int i = 0; i < np; i++) { + if (i >= kPathSize) { + break; + } + path.push_back(path[i]); } return std::pair, bool>(path, overflow);