Skip to content

Commit

Permalink
feature: events deadlock detection in validation layer
Browse files Browse the repository at this point in the history
Related-To: NEO-12810

Signed-off-by: Chandio, Bibrak Qamar <[email protected]>
  • Loading branch information
bibrak committed Nov 5, 2024
1 parent 7433d7c commit 5d24b5b
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListImmediateAppendComm
}

void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
int this_action_new_node_id = invalidDagID;
uint32_t this_action_new_node_id = invalidDagID;

// Check if user is using invalid events, hint if it doesn't exist in eventToDagID
if (eventToDagID.find(hSignalEvent) == eventToDagID.end()) {
Expand All @@ -548,19 +548,15 @@ void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::strin
// This event already exists in the DAG. Get the DAG node ID.
// For example when there is indeed a deadlock it would have already been created.
this_action_new_node_id = it->second;

// std::cout << "\tFound event in eventToDagID: hSignalEvent = " << hSignalEvent << ", this_action_new_node_id = " << this_action_new_node_id << std::endl;
}
}

if (this_action_new_node_id == invalidDagID) {
// Create node in DAG
this_action_new_node_id = addNodeInDag(); // nextDagID++;
this_action_new_node_id = addNodeInDag();

// Now we know where the hSignalEvent points from/out in the DAG. Update the eventToDagID map.
eventToDagID[hSignalEvent] = this_action_new_node_id;

// std::cout << "\tUpdated eventToDagID: hSignalEvent = " << hSignalEvent << ", this_action_new_node_id = " << this_action_new_node_id << std::endl;
}

// Add this action to the actionToDagID map
Expand All @@ -582,35 +578,40 @@ void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::strin
for (uint32_t i = 0; i < numWaitEvents; i++) {
auto it = eventToDagID.find(phWaitEvents[i]);

int dagID = it->second;
uint32_t dagID = it->second;
if (dagID == invalidDagID) {
// Create a new node in the DAG for this wait event. That action will be created some time in the future.
dagID = addNodeInDag(); // nextDagID++;
dagID = addNodeInDag();
it->second = dagID;
}

auto getActionDetails = [&](int dagID) -> std::string {
return (dagIDToAction.find(dagID) != dagIDToAction.end()) ? dagIDToAction[dagID].first : "PLACEHOLDER";
auto actionIt = dagIDToAction.find(dagID);
return (actionIt != dagIDToAction.end()) ? actionIt->second.first : "PLACEHOLDER";
};

std::string fromAction = getActionDetails(dagID);
std::string toAction = getActionDetails(this_action_new_node_id);

if (!addEdgeInDag(dagID, this_action_new_node_id)) {
std::string fromAction = getActionDetails(dagID);
std::string toAction = getActionDetails(this_action_new_node_id);

auto path = dag.PathDagIDs(this_action_new_node_id, dagID, 5);

std::string spacePrefix = "";
std::cerr << "Warning: There may be a potential event deadlock!\n";
std::cerr << "Adding the following dependency would create a cycle in the DAG:\n\tFrom:" << fromAction << "\n\tTo:" << toAction << "\n";
std::cerr << "Adding the following dependency would create a cycle in the DAG:\n\tFrom: " << fromAction << "\n\tTo: " << toAction << "\n";
std::cerr << "There is already a path:\n";

constexpr uint32_t maxPathLength = 15;
auto path = dag.PathDagIDs(this_action_new_node_id, dagID, maxPathLength);
auto dagIDsInPath = path.first;
std::cerr << getActionDetails(dagIDsInPath[0]) << "\n";
for (uint32_t i = 1; i < dagIDsInPath.size(); i++) {
std::string spacePrefix = "";
for (uint32_t j = 1; j < dagIDsInPath.size(); j++) {
std::cerr << spacePrefix << "|\n"
<< spacePrefix << "-> " << getActionDetails(dagIDsInPath[i]) << "\n";
<< spacePrefix << "-> " << getActionDetails(dagIDsInPath[j]) << "\n";
spacePrefix += " ";
}
if (path.second) {
std::cerr << spacePrefix << "|\n"
<< spacePrefix << "-> ...\n";
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
#include "ze_api.h"
#include "ze_validation_layer.h"

#include <limits>
#include <string>

namespace validation_layer {

constexpr int invalidDagID = -1;
constexpr uint32_t invalidDagID = std::numeric_limits<uint32_t>::max();
using actionAndSignalEvent = std::pair<std::string, ze_event_handle_t>;

class __zedlllocal eventsDeadlockChecker : public validationChecker {
Expand Down Expand Up @@ -48,25 +49,25 @@ class __zedlllocal eventsDeadlockChecker : public validationChecker {
ze_result_t zeCommandListAppendSignalEventPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent) override;
ze_result_t zeCommandListAppendWaitOnEventsPrologue(ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t *phEvents) override;
ze_result_t zeEventHostSignalPrologue(ze_event_handle_t hEvent) override;
ze_result_t zeCommandListAppendEventResetPrologue( ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent ) override;
ze_result_t zeEventHostResetPrologue( ze_event_handle_t hEvent ) override;
ze_result_t zeCommandListAppendQueryKernelTimestampsPrologue( ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t* phEvents, void* dstptr, const size_t* pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendLaunchKernelPrologue( ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t* pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendLaunchCooperativeKernelPrologue( ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t* pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendLaunchKernelIndirectPrologue( ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t* pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendLaunchMultipleKernelsIndirectPrologue( ze_command_list_handle_t hCommandList, uint32_t numKernels, ze_kernel_handle_t* phKernels, const uint32_t* pCountBuffer, const ze_group_count_t* pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListUpdateMutableCommandSignalEventExpPrologue( ze_command_list_handle_t hCommandList, uint64_t commandId, ze_event_handle_t hSignalEvent ) override;
ze_result_t zeCommandListUpdateMutableCommandWaitEventsExpPrologue( ze_command_list_handle_t hCommandList, uint64_t commandId, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendImageCopyToMemoryExtPrologue( ze_command_list_handle_t hCommandList, void* dstptr, ze_image_handle_t hSrcImage, const ze_image_region_t* pSrcRegion, uint32_t destRowPitch, uint32_t destSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendImageCopyFromMemoryExtPrologue( ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, const void* srcptr, const ze_image_region_t* pDstRegion, uint32_t srcRowPitch, uint32_t srcSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListImmediateAppendCommandListsExpPrologue( ze_command_list_handle_t hCommandListImmediate, uint32_t numCommandLists, ze_command_list_handle_t* phCommandLists, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t* phWaitEvents ) override;
ze_result_t zeCommandListAppendEventResetPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent) override;
ze_result_t zeEventHostResetPrologue(ze_event_handle_t hEvent) override;
ze_result_t zeCommandListAppendQueryKernelTimestampsPrologue(ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t *phEvents, void *dstptr, const size_t *pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListAppendLaunchKernelPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListAppendLaunchCooperativeKernelPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListAppendLaunchKernelIndirectPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListAppendLaunchMultipleKernelsIndirectPrologue(ze_command_list_handle_t hCommandList, uint32_t numKernels, ze_kernel_handle_t *phKernels, const uint32_t *pCountBuffer, const ze_group_count_t *pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListUpdateMutableCommandSignalEventExpPrologue(ze_command_list_handle_t hCommandList, uint64_t commandId, ze_event_handle_t hSignalEvent) override;
ze_result_t zeCommandListUpdateMutableCommandWaitEventsExpPrologue(ze_command_list_handle_t hCommandList, uint64_t commandId, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListAppendImageCopyToMemoryExtPrologue(ze_command_list_handle_t hCommandList, void *dstptr, ze_image_handle_t hSrcImage, const ze_image_region_t *pSrcRegion, uint32_t destRowPitch, uint32_t destSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListAppendImageCopyFromMemoryExtPrologue(ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, const void *srcptr, const ze_image_region_t *pDstRegion, uint32_t srcRowPitch, uint32_t srcSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
ze_result_t zeCommandListImmediateAppendCommandListsExpPrologue(ze_command_list_handle_t hCommandListImmediate, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;

private:
// Add node in the DAG and get its ID.
int addNodeInDag() { return dag.NewNode(); }

// Add edge in the DAG.
bool addEdgeInDag(int x, int y) { return dag.InsertEdge(x, y); }
bool addEdgeInDag(uint32_t x, uint32_t y) { return dag.InsertEdge(x, y); }

// Inserts new actions and events in the DAG based on the ze<API CALLS>.
void checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
Expand All @@ -75,10 +76,10 @@ class __zedlllocal eventsDeadlockChecker : public validationChecker {
xla::GraphCycles dag;

// Each event points out from a DAG node. This map stores the DAG node ID for each event (if there is one).
std::unordered_map<ze_event_handle_t, int> eventToDagID;
std::unordered_map<ze_event_handle_t, uint32_t> eventToDagID;

// Reverse lookup for eventToDagID: maps a DAG ID to a pair containing the action description and the signal event.
std::unordered_map<int, actionAndSignalEvent> dagIDToAction;
std::unordered_map<uint32_t, actionAndSignalEvent> dagIDToAction;
};
class ZESeventsDeadlockChecker : public ZESValidationEntryPoints {};
class ZETeventsDeadlockChecker : public ZETValidationEntryPoints {};
Expand Down
7 changes: 5 additions & 2 deletions third_party/xla/graphcycles.cc
Original file line number Diff line number Diff line change
Expand Up @@ -389,8 +389,11 @@ std::pair<std::vector<int32_t>, bool> GraphCycles::PathDagIDs(int x, int y, cons
int np = FindPath(x, y, kPathSize, path.data());
bool overflow = np > max_path_len;

for (int i = 0; i < np; i++) {
path.push_back(path[i]);
for (int i = 0; i < np; i++) {
if (i >= kPathSize) {
break;
}
path.push_back(path[i]);
}

return std::pair<std::vector<int32_t>, bool>(path, overflow);
Expand Down

0 comments on commit 5d24b5b

Please sign in to comment.