Skip to content

Commit

Permalink
feature: events deadlock detection in validation layer
Browse files Browse the repository at this point in the history
Related-To: NEO-12810

Signed-off-by: Chandio, Bibrak Qamar <[email protected]>
  • Loading branch information
bibrak committed Nov 5, 2024
1 parent a66309f commit ae901c7
Show file tree
Hide file tree
Showing 4 changed files with 464 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ eventsDeadlockChecker::~eventsDeadlockChecker() {
}
}

ze_result_t eventsDeadlockChecker::ZEeventsDeadlockChecker::zeEventCreateEpilogue(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent) {
ze_result_t eventsDeadlockChecker::ZEeventsDeadlockChecker::zeEventCreateEpilogue(
ze_event_pool_handle_t hEventPool, ///< [in] handle of the event pool
const ze_event_desc_t *desc, ///< [in] pointer to event descriptor
ze_event_handle_t *phEvent ///< [out] pointer to handle of event object created
) {

eventToDagID[*phEvent] = invalidDagID;

Expand All @@ -46,7 +50,6 @@ ze_result_t
eventsDeadlockChecker::ZEeventsDeadlockChecker::zeEventDestroyEpilogue(
ze_event_handle_t hEvent ///< [in][release] handle of event object to destroy
) {

if (eventToDagID.find(hEvent) != eventToDagID.end()) {
// Deleted event from eventToDagID but not from the dagIDToAction map, as it may be needed for printing the description of the action when printing a path in the DAG.

Expand Down Expand Up @@ -266,8 +269,8 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListAppendSignalEventPr
ze_command_list_handle_t hCommandList, ///< [in] handle of the command list
ze_event_handle_t hEvent ///< [in] handle of the event
) {
// TODO: Implememt this
// checkForDeadlock("zeCommandListAppendSignalEvent", hEvent, 0, nullptr);

checkForDeadlock("zeCommandListAppendSignalEvent", hEvent, 0, nullptr);

return ZE_RESULT_SUCCESS;
}
Expand All @@ -288,19 +291,41 @@ ze_result_t
eventsDeadlockChecker::ZEeventsDeadlockChecker::zeEventHostSignalPrologue(
    ze_event_handle_t hEvent ///< [in] handle of the event
) {
    // Prologue hook for zeEventHostSignal: hands the call to the deadlock
    // checker with hEvent as the signal event and an empty wait list.
    constexpr uint32_t noWaitEvents = 0;
    checkForDeadlock("zeEventHostSignal", hEvent, noWaitEvents, nullptr);

    // The checker only reports; this hook always returns success.
    return ZE_RESULT_SUCCESS;
}

/// Resets the bookkeeping for an event so that it no longer refers to a DAG node.
///
/// If hEvent is not present in eventToDagID a warning is written to std::cerr and
/// nothing else happens. Otherwise, when the event currently maps to a valid DAG ID,
/// the event slot of the matching dagIDToAction entry (if one exists) is set to
/// invalidEventAddress and the event's DAG ID is set back to invalidDagID.
void eventsDeadlockChecker::ZEeventsDeadlockChecker::resetEventInEventToDagID(
    const std::string &zeCallDisc, ///< [in] description of the ze call requesting the reset
    const ze_event_handle_t hEvent ///< [in] handle of the event
) {
    const auto eventEntry = eventToDagID.find(hEvent);

    // An event missing from eventToDagID is only hinted at as possibly invalid.
    if (eventEntry == eventToDagID.end()) {
        std::cerr << "Warning: hSignalEvent {" << hEvent << "} might be an invalid event in call to " << zeCallDisc << std::endl;
        return;
    }

    auto &dagID = eventEntry->second;
    if (dagID == invalidDagID) {
        // The event is not attached to any DAG node, so there is nothing to reset.
        return;
    }

    // Detach the event from the action recorded for its DAG node, when one is recorded.
    const auto actionEntry = dagIDToAction.find(dagID);
    if (actionEntry != dagIDToAction.end()) {
        actionEntry->second.second = invalidEventAddress;
    }

    // Finally mark the event itself as having no DAG node.
    dagID = invalidDagID;
}

/// Prologue hook for zeCommandListAppendEventReset.
///
/// Forwards the event to resetEventInEventToDagID so its DAG bookkeeping is reset,
/// then returns ZE_RESULT_SUCCESS unconditionally.
ze_result_t
eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListAppendEventResetPrologue(
    ze_command_list_handle_t hCommandList, ///< [in] handle of the command list
    ze_event_handle_t hEvent               ///< [in] handle of the event
) {
    // The command list handle is not consulted by this hook.
    static_cast<void>(hCommandList);

    resetEventInEventToDagID("zeCommandListAppendEventReset", hEvent);
    return ZE_RESULT_SUCCESS;
}
Expand All @@ -309,8 +334,8 @@ ze_result_t
eventsDeadlockChecker::ZEeventsDeadlockChecker::zeEventHostResetPrologue(
    ze_event_handle_t hEvent ///< [in] handle of the event
) {
    // Prologue hook for zeEventHostReset: resets the DAG bookkeeping kept for
    // hEvent via resetEventInEventToDagID.
    resetEventInEventToDagID("zeEventHostReset", hEvent);

    // This hook always returns success.
    return ZE_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -412,7 +437,7 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListUpdateMutableComman
uint64_t commandId, ///< [in] command identifier
ze_event_handle_t hSignalEvent ///< [in][optional] handle of the event to signal on completion
) {
// TODO: Implememt this

checkForDeadlock("zeCommandListUpdateMutableCommandSignalEventExp", hSignalEvent, 0, nullptr);

return ZE_RESULT_SUCCESS;
Expand Down Expand Up @@ -492,30 +517,51 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListImmediateAppendComm
return ZE_RESULT_SUCCESS;
}

void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
/// Passive check that warns when a signal event is already bound to an earlier action.
///
/// Looks up hSignalEvent in eventToDagID. If the event is registered and currently
/// maps to a valid DAG ID, a warning is written to std::cerr naming the action stored
/// for that DAG ID in dagIDToAction ("UNKNOWN ACTION" when no entry exists).
/// Events that are not registered, or that map to invalidDagID, produce no output.
/// No state is modified.
void eventsDeadlockChecker::ZEeventsDeadlockChecker::validateSignalEventOwnership(const std::string &zeCallDisc,      ///< [in] description of the ze call using the event
                                                                                  const ze_event_handle_t hSignalEvent ///< [in] handle of the event about to be signalled
) {
    const auto eventEntry = eventToDagID.find(hSignalEvent);

    // The end() test must come before any access to ->second: dereferencing the
    // end iterator of the map is undefined behaviour.
    if (eventEntry == eventToDagID.end() || eventEntry->second == invalidDagID) {
        return;
    }

    // Look the owning action up once and reuse the iterator for both the test and the read.
    const auto ownerEntry = dagIDToAction.find(eventEntry->second);
    const std::string previousActionOwner = (ownerEntry != dagIDToAction.end()) ? ownerEntry->second.first : "UNKNOWN ACTION";

    std::cerr << "Warning: " << zeCallDisc << " is using the same ze_event_handle_t for signal {" << hSignalEvent << "} which has been previously used by: " << previousActionOwner << std::endl;
}

void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(
const std::string &zeCallDisc, /// action discription
const ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to forming the outgoing edge in the DAG
const uint32_t numWaitEvents, ///< [in][optional] number of events that point to this action.
const ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events that point to this action.
) {
uint32_t this_action_new_node_id = invalidDagID;

// Check if user is using invalid events, hint if it doesn't exist in eventToDagID.
if (eventToDagID.find(hSignalEvent) == eventToDagID.end()) {
std::cerr << "Warning: hSignalEvent {" << hSignalEvent << "} might be an invalid event." << std::endl;
return;
}
for (uint32_t i = 0; i < numWaitEvents; i++) {
if (eventToDagID.find(phWaitEvents[i]) == eventToDagID.end()) {
std::cerr << "Warning: phWaitEvents {" << hSignalEvent << "} might be an invalid event." << std::endl;
if (hSignalEvent != nullptr) {

auto it = eventToDagID.find(hSignalEvent);
// Check if user is using invalid events, hint if it doesn't exist in eventToDagID.
if (it == eventToDagID.end()) {
std::cerr << "Warning: hSignalEvent {" << hSignalEvent << "} might be an invalid event in call to " << zeCallDisc << std::endl;
return;
}
}

if (hSignalEvent != nullptr) {
auto it = eventToDagID.find(hSignalEvent);
if (it != eventToDagID.end() && it->second != invalidDagID) {
// A passive check to see if the user is using the same event for multiple actions.
// It only print warnings and does not stop the event deadlock checker.
validateSignalEventOwnership(zeCallDisc, hSignalEvent);

if (it->second != invalidDagID) {
// This event already exists in the DAG. Get the DAG node ID.
// For example when there is indeed a deadlock it would have already been created.
this_action_new_node_id = it->second;
}
}

for (uint32_t i = 0; i < numWaitEvents; i++) {
if (eventToDagID.find(phWaitEvents[i]) == eventToDagID.end()) {
std::cerr << "Warning: phWaitEvents {" << hSignalEvent << "} might be an invalid event in call to " << zeCallDisc << std::endl;
return;
}
}

if (this_action_new_node_id == invalidDagID) {
// Create node in DAG
this_action_new_node_id = addNodeInDag();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
#include "ze_api.h"
#include "ze_validation_layer.h"

#include <limits>
#include <limits>
#include <string>

namespace validation_layer {

// Sentinel DAG ID meaning "no DAG node": the largest value a uint32_t can hold.
constexpr uint32_t invalidDagID = std::numeric_limits<uint32_t>::max();
// Sentinel event handle meaning "no event".
// NOTE(review): std::numeric_limits is specialized only for arithmetic types. If
// ze_event_handle_t is a pointer type, the primary template is used and max()
// returns a value-initialized handle (i.e. nullptr), not the highest address —
// confirm that this is the intended sentinel value.
constexpr ze_event_handle_t invalidEventAddress = std::numeric_limits<ze_event_handle_t>::max();
// Pair of a string and an event handle; presumably the action description and the
// event that action signals — verify against the dagIDToAction usage.
using actionAndSignalEvent = std::pair<std::string, ze_event_handle_t>;

class __zedlllocal eventsDeadlockChecker : public validationChecker {
Expand Down Expand Up @@ -65,8 +66,15 @@ class __zedlllocal eventsDeadlockChecker : public validationChecker {
// Adds an edge for the node pair (x, y) to the DAG and reports what
// dag.InsertEdge returned.
bool addEdgeInDag(uint32_t x, uint32_t y) {
    const bool inserted = dag.InsertEdge(x, y);
    return inserted;
}

// In case the user uses a single hSignalEvent twice or more, which is an ill usage.
void validateSignalEventOwnership(const std::string &zeCallDisc, const ze_event_handle_t hSignalEvent);

// Inserts new actions and events in the DAG based on the ze<API CALLS>.
void checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
void checkForDeadlock(const std::string &zeCallDisc, const ze_event_handle_t hSignalEvent, const uint32_t numWaitEvents, const ze_event_handle_t *phWaitEvents);

// Reset the event to have an invalid DAG ID such that it can be reused.
// Useful for zeCalls such as zeCommandListAppendEventReset and zeEventHostReset.
void resetEventInEventToDagID(const std::string &zeCallDisc, ze_event_handle_t hEvent);

// The DAG structure.
xla::GraphCycles dag;
Expand Down
6 changes: 5 additions & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,8 @@ set_property(TEST tests_both_gpu PROPERTY ENVIRONMENT "ZE_ENABLE_NULL_DRIVER=1")
add_test(NAME tests_both_npu COMMAND tests --gtest_filter=*GivenLevelZeroLoaderPresentWhenCallingzeInitThenZeInitDriversThenBothCallsSucceedWithNPUTypes*)
set_property(TEST tests_both_npu PROPERTY ENVIRONMENT "ZE_ENABLE_NULL_DRIVER=1")
add_test(NAME tests_event_deadlock COMMAND tests --gtest_filter=*GivenLevelZeroLoaderPresentWhenCallingzeCommandListAppendMemoryCopyWithCircularDependencyOnEventsThenValidationLayerPrintsWarningOfDeadlock*)
set_property(TEST tests_event_deadlock PROPERTY ENVIRONMENT "ZE_ENABLE_NULL_DRIVER=1;ZE_ENABLE_VALIDATION_LAYER=1;ZEL_ENABLE_EVENTSDEADLOCK_CHECKER=1")
set_property(TEST tests_event_deadlock PROPERTY ENVIRONMENT "ZE_ENABLE_NULL_DRIVER=1;ZE_ENABLE_VALIDATION_LAYER=1;ZEL_ENABLE_EVENTSDEADLOCK_CHECKER=1")
add_test(NAME tests_event_deadlock_reset COMMAND tests --gtest_filter=*GivenLevelZeroLoaderPresentWhenCallingzeCommandListAppendMemoryCopyWithCircularDependencyOnEventsAndExplicitCallzeEventHostSignalThenValidationLayerPrintsWarningOfIllegalUsage*)
set_property(TEST tests_event_deadlock_reset PROPERTY ENVIRONMENT "ZE_ENABLE_NULL_DRIVER=1;ZE_ENABLE_VALIDATION_LAYER=1;ZEL_ENABLE_EVENTSDEADLOCK_CHECKER=1")
add_test(NAME tests_event_reset_reuse COMMAND tests --gtest_filter=*GivenLevelZeroLoaderPresentWhenCallingzeEventHostResetWithAlreadySignaledEventThenUsingEventAgainThenValidationLayerDoesNotPrintsWarningOfIllegalUsage*)
set_property(TEST tests_event_reset_reuse PROPERTY ENVIRONMENT "ZE_ENABLE_NULL_DRIVER=1;ZE_ENABLE_VALIDATION_LAYER=1;ZEL_ENABLE_EVENTSDEADLOCK_CHECKER=1")
Loading

0 comments on commit ae901c7

Please sign in to comment.