diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 57e64e08d1006..ac8dc4d2c9ec7 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -611,7 +611,13 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple, {"libsycl-asan-cpu", "internal"}, {"libsycl-asan-dg2", "internal"}, {"libsycl-asan-pvc", "internal"}}; - const SYCLDeviceLibsList SYCLDeviceMsanLibs = {{"libsycl-msan", "internal"}}; + const SYCLDeviceLibsList SYCLDeviceMsanLibs = { + {"libsycl-msan", "internal"}, + {"libsycl-msan-cpu", "internal"}, + // Currently, we only provide aot msan libdevice for PVC and CPU. + // For DG2, we just use libsycl-msan as placeholder. + {"libsycl-msan", "internal"}, + {"libsycl-msan-pvc", "internal"}}; #endif const SYCLDeviceLibsList SYCLNativeCpuDeviceLibs = { @@ -769,7 +775,7 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple, if (SanitizeVal == "address") addSingleLibrary(SYCLDeviceAsanLibs[sanitizer_lib_idx]); else if (SanitizeVal == "memory") - addLibraries(SYCLDeviceMsanLibs); + addSingleLibrary(SYCLDeviceMsanLibs[sanitizer_lib_idx]); #endif if (isNativeCPU) diff --git a/clang/test/Driver/sycl-device-lib-old-model.cpp b/clang/test/Driver/sycl-device-lib-old-model.cpp index b4e3a3f9cf164..fd65dde48338a 100644 --- a/clang/test/Driver/sycl-device-lib-old-model.cpp +++ b/clang/test/Driver/sycl-device-lib-old-model.cpp @@ -355,3 +355,26 @@ // SYCL_DEVICE_MSAN_MACRO-SAME: "USE_SYCL_DEVICE_MSAN" // SYCL_DEVICE_MSAN_MACRO: llvm-link{{.*}} "-only-needed" // SYCL_DEVICE_MSAN_MACRO-SAME: "{{.*}}libsycl-msan.bc" + +/// ########################################################################### +/// test behavior of linking libsycl-msan-pvc for PVC target AOT compilation when msan flag is applied. +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc --no-offload-new-driver %s --sysroot=%S/Inputs/SYCL \ +// RUN: -Xarch_device -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_PVC +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" --no-offload-new-driver %s \ +// RUN: --sysroot=%S/Inputs/SYCL -Xarch_device -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_PVC +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" --no-offload-new-driver %s \ +// RUN: --sysroot=%S/Inputs/SYCL -Xarch_device -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_PVC +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" --no-offload-new-driver %s \ +// RUN: --sysroot=%S/Inputs/SYCL -Xarch_device -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_PVC +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xs "-device 12.60.7" --no-offload-new-driver %s --sysroot=%S/Inputs/SYCL \ +// RUN: -Xarch_device -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_PVC +// SYCL_DEVICE_LIB_MSAN_PVC: llvm-link{{.*}} "{{.*}}libsycl-crt.bc" +// SYCL_DEVICE_LIB_MSAN_PVC-SAME: "{{.*}}libsycl-msan-pvc.bc" + + +/// ########################################################################### +/// test behavior of linking libsycl-msan-cpu for CPU target AOT compilation when msan flag is applied. 
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 --no-offload-new-driver %s --sysroot=%S/Inputs/SYCL \ +// RUN: -Xarch_device -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_CPU +// SYCL_DEVICE_LIB_MSAN_CPU: llvm-link{{.*}} "{{.*}}libsycl-crt.bc" +// SYCL_DEVICE_LIB_MSAN_CPU-SAME: "{{.*}}libsycl-msan-cpu.bc" diff --git a/clang/test/Driver/sycl-device-lib.cpp b/clang/test/Driver/sycl-device-lib.cpp index 9e07edf2287fa..a80cdfbd847d7 100644 --- a/clang/test/Driver/sycl-device-lib.cpp +++ b/clang/test/Driver/sycl-device-lib.cpp @@ -352,3 +352,16 @@ // SYCL_DEVICE_MSAN_MACRO: "-cc1" // SYCL_DEVICE_MSAN_MACRO-SAME: "USE_SYCL_DEVICE_MSAN" // SYCL_DEVICE_MSAN_MACRO: libsycl-msan.new.o + +/// test behavior of msan libdevice linking when -fsanitize=memory is available for AOT targets +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc --offload-new-driver %s --sysroot=%S/Inputs/SYCL \ +// RUN: -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_PVC +// SYCL_DEVICE_LIB_MSAN_PVC: clang-linker-wrapper{{.*}} "-sycl-device-libraries +// SYCL_DEVICE_LIB_MSAN_PVC-SAME: {{.*}}libsycl-msan-pvc.new.o + +/// test behavior of msan libdevice linking when -fsanitize=memory is available for AOT targets +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 --offload-new-driver %s --sysroot=%S/Inputs/SYCL \ +// RUN: -fsanitize=memory -### 2>&1 | FileCheck %s -check-prefix=SYCL_DEVICE_LIB_MSAN_CPU +// SYCL_DEVICE_LIB_MSAN_CPU: clang-linker-wrapper{{.*}} "-sycl-device-libraries +// SYCL_DEVICE_LIB_MSAN_CPU-SAME: {{.*}}libsycl-msan-cpu.new.o + diff --git a/libclc/libspirv/include/libspirv/spirv.h b/libclc/libspirv/include/libspirv/spirv.h index 657ae6a220cf8..53c5353e58739 100644 --- a/libclc/libspirv/include/libspirv/spirv.h +++ b/libclc/libspirv/include/libspirv/spirv.h @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include diff --git a/libclc/libspirv/include/libspirv/workitem/get_local_linear_id.h b/libclc/libspirv/include/libspirv/workitem/get_local_linear_id.h new file mode 100644 index 0000000000000..dff7923b9d31c --- /dev/null +++ b/libclc/libspirv/include/libspirv/workitem/get_local_linear_id.h @@ -0,0 +1,9 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_DECL _CLC_OVERLOAD size_t __spirv_LocalInvocationIndex(); diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl index 245a93985071a..3aa99d8a9180c 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl @@ -365,17 +365,6 @@ __CLC_GROUP_COLLECTIVE__DF16(_Z20__spirv_GroupFMulKHRjjDF16_, #undef __CLC_ADD #undef __CLC_MUL -long __clc__get_linear_local_id() { - size_t id_x = __spirv_LocalInvocationId_x(); - size_t id_y = __spirv_LocalInvocationId_y(); - size_t id_z = __spirv_LocalInvocationId_z(); - size_t size_x = __spirv_WorkgroupSize_x(); - size_t size_y = __spirv_WorkgroupSize_y(); - size_t size_z = __spirv_WorkgroupSize_z(); - uint sg_size = __spirv_SubgroupMaxSize(); - return (id_z * size_y * size_x + id_y * size_x + id_x); -} - long __clc__2d_to_linear_local_id(ulong2 id) { size_t size_x = __spirv_WorkgroupSize_x(); size_t size_y = __spirv_WorkgroupSize_y(); @@ -396,7 +385,7 @@ long __clc__3d_to_linear_local_id(ulong3 id) { return _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \ x, local_id); \ } \ - bool source = (__clc__get_linear_local_id() == local_id); \ + bool source = (__spirv_LocalInvocationIndex() == local_id); \ __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)(); \ if (source) { \ *scratch = x; \ diff --git a/libclc/libspirv/lib/generic/SOURCES b/libclc/libspirv/lib/generic/SOURCES index 7f894dbed825a..d4ee07d5c15db 100644 --- a/libclc/libspirv/lib/generic/SOURCES +++ b/libclc/libspirv/lib/generic/SOURCES @@ -205,5 +205,6 @@ shared/vload.cl shared/vstore.cl workitem/get_global_id.cl workitem/get_global_size.cl +workitem/get_local_linear_id.cl workitem/get_num_sub_groups.cl workitem/get_sub_group_size.cl diff --git a/libclc/libspirv/lib/generic/workitem/get_local_linear_id.cl b/libclc/libspirv/lib/generic/workitem/get_local_linear_id.cl new file mode 100644 index 0000000000000..6b61944476842 --- /dev/null +++ b/libclc/libspirv/lib/generic/workitem/get_local_linear_id.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +_CLC_DEF _CLC_OVERLOAD size_t __spirv_LocalInvocationIndex() { + return __spirv_LocalInvocationId_z() * __spirv_WorkgroupSize_y() * + __spirv_WorkgroupSize_x() + + __spirv_LocalInvocationId_y() * __spirv_WorkgroupSize_x() + + __spirv_LocalInvocationId_x(); +} diff --git a/libclc/libspirv/lib/ptx-nvidiacl/group/collectives.cl b/libclc/libspirv/lib/ptx-nvidiacl/group/collectives.cl index 933777243632b..13b4c1f96354a 100644 --- a/libclc/libspirv/lib/ptx-nvidiacl/group/collectives.cl +++ b/libclc/libspirv/lib/ptx-nvidiacl/group/collectives.cl @@ -624,17 +624,6 @@ __CLC_GROUP_COLLECTIVE__DF16(_Z20__spirv_GroupFMulKHRjjDF16_, #undef __CLC_ADD #undef __CLC_MUL -long __clc__get_linear_local_id() { - size_t id_x = __spirv_LocalInvocationId_x(); - size_t id_y = __spirv_LocalInvocationId_y(); - size_t id_z = __spirv_LocalInvocationId_z(); - size_t size_x = __spirv_WorkgroupSize_x(); - size_t size_y = __spirv_WorkgroupSize_y(); - size_t size_z = __spirv_WorkgroupSize_z(); - uint sg_size = __spirv_SubgroupMaxSize(); - return (id_z * size_y * size_x + id_y * size_x + id_x); -} - long __clc__2d_to_linear_local_id(ulong2 id) { size_t size_x = __spirv_WorkgroupSize_x(); size_t size_y = __spirv_WorkgroupSize_y(); @@ -654,7 +643,7 @@ long __clc__3d_to_linear_local_id(ulong3 id) { if (scope == Subgroup) { \ return __clc__SubgroupShuffle(x, local_id); \ } \ - bool source = (__clc__get_linear_local_id() == local_id); \ + bool source = (__spirv_LocalInvocationIndex() == local_id); \ __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)(); \ if (source) { \ *scratch = x; \ diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index 64979f2b65e4a..daa9726a81dd3 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -248,46 +248,46 @@ if (NOT MSVC AND UR_SANITIZER_INCLUDE_DIR) -I${UR_SANITIZER_INCLUDE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}) - set(asan_pvc_compile_opts_obj -fsycl -c + set(sanitizer_pvc_compile_opts_obj -fsycl -c ${sanitizer_generic_compile_opts} ${sycl_pvc_target_opt} -D__LIBDEVICE_PVC__) - set(asan_cpu_compile_opts_obj -fsycl -c + set(sanitizer_cpu_compile_opts_obj -fsycl -c ${sanitizer_generic_compile_opts} ${sycl_cpu_target_opt} -D__LIBDEVICE_CPU__) - set(asan_dg2_compile_opts_obj -fsycl -c + set(sanitizer_dg2_compile_opts_obj -fsycl -c ${sanitizer_generic_compile_opts} ${sycl_dg2_target_opt} -D__LIBDEVICE_DG2__) - set(asan_pvc_compile_opts_bc ${bc_device_compile_opts} + set(sanitizer_pvc_compile_opts_bc ${bc_device_compile_opts} ${sanitizer_generic_compile_opts} -D__LIBDEVICE_PVC__) - set(asan_cpu_compile_opts_bc ${bc_device_compile_opts} + set(sanitizer_cpu_compile_opts_bc ${bc_device_compile_opts} ${sanitizer_generic_compile_opts} -D__LIBDEVICE_CPU__) - set(asan_dg2_compile_opts_bc ${bc_device_compile_opts} + set(sanitizer_dg2_compile_opts_bc ${bc_device_compile_opts} ${sanitizer_generic_compile_opts} -D__LIBDEVICE_DG2__) - set(asan_pvc_compile_opts_obj-new-offload -fsycl -c --offload-new-driver + set(sanitizer_pvc_compile_opts_obj-new-offload -fsycl -c --offload-new-driver -foffload-lto=thin ${sanitizer_generic_compile_opts} ${sycl_pvc_target_opt} -D__LIBDEVICE_PVC__) - set(asan_cpu_compile_opts_obj-new-offload -fsycl -c --offload-new-driver + set(sanitizer_cpu_compile_opts_obj-new-offload -fsycl -c --offload-new-driver 
-foffload-lto=thin ${sanitizer_generic_compile_opts} ${sycl_cpu_target_opt} -D__LIBDEVICE_CPU__) - set(asan_dg2_compile_opts_obj-new-offload -fsycl -c --offload-new-driver + set(sanitizer_dg2_compile_opts_obj-new-offload -fsycl -c --offload-new-driver -foffload-lto=thin ${sanitizer_generic_compile_opts} ${sycl_dg2_target_opt} @@ -373,16 +373,16 @@ else() -I${CMAKE_CURRENT_SOURCE_DIR}) # asan aot - set(asan_filetypes obj obj-new-offload bc) + set(sanitizer_filetypes obj obj-new-offload bc) set(asan_devicetypes pvc cpu dg2) - foreach(asan_ft IN LISTS asan_filetypes) + foreach(asan_ft IN LISTS sanitizer_filetypes) foreach(asan_device IN LISTS asan_devicetypes) compile_lib_ext(libsycl-asan-${asan_device} SRC sanitizer/asan_rtl.cpp FILETYPE ${asan_ft} DEPENDENCIES ${asan_obj_deps} - OPTS ${asan_${asan_device}_compile_opts_${asan_ft}}) + OPTS ${sanitizer_${asan_device}_compile_opts_${asan_ft}}) endforeach() endforeach() @@ -393,6 +393,19 @@ else() EXTRA_OPTS -fno-sycl-instrument-device-code -I${UR_SANITIZER_INCLUDE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}) + + set(msan_devicetypes pvc cpu) + + foreach(msan_ft IN LISTS sanitizer_filetypes) + foreach(msan_device IN LISTS msan_devicetypes) + compile_lib_ext(libsycl-msan-${msan_device} + SRC sanitizer/msan_rtl.cpp + FILETYPE ${msan_ft} + DEPENDENCIES ${msan_obj_deps} + OPTS ${sanitizer_${msan_device}_compile_opts_${msan_ft}}) + endforeach() + endforeach() + endif() endif() diff --git a/libdevice/sanitizer/msan_rtl.cpp b/libdevice/sanitizer/msan_rtl.cpp index 8f3babd709038..4645cab519a8c 100644 --- a/libdevice/sanitizer/msan_rtl.cpp +++ b/libdevice/sanitizer/msan_rtl.cpp @@ -201,6 +201,11 @@ DEVICE_EXTERN_C_NOINLINE uptr __msan_get_shadow(uptr addr, uint32_t as) { MSAN_DEBUG(__spirv_ocl_printf(__msan_print_launchinfo, (void *)launch_info, launch_info->GlobalShadowOffset)); +#if defined(__LIBDEVICE_PVC__) + shadow_ptr = __msan_get_shadow_pvc(addr, as); +#elif defined(__LIBDEVICE_CPU__) + shadow_ptr = __msan_get_shadow_cpu(addr); +#else if (LIKELY(launch_info->DeviceTy == DeviceType::CPU)) { shadow_ptr = __msan_get_shadow_cpu(addr); } else if (launch_info->DeviceTy == DeviceType::GPU_PVC) { @@ -209,6 +214,7 @@ DEVICE_EXTERN_C_NOINLINE uptr __msan_get_shadow(uptr addr, uint32_t as) { MSAN_DEBUG(__spirv_ocl_printf(__msan_print_unsupport_device_type, launch_info->DeviceTy)); } +#endif MSAN_DEBUG(__spirv_ocl_printf(__msan_print_shadow, (void *)addr, as, (void *)shadow_ptr, *(u8 *)shadow_ptr)); diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..202bfdcdc19e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 58a9a6ed5f458..83a8ce76d2c81 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 08d36b76a5b1c4f080e3301507a39525ab5ab365 -# Merge: 4c504dbc e6b61c67 +# commit f07688dbc20c73d7e480cb62d7dc0ce7dc822bd3 +# Merge: 7d864b6c 3dbf8b24 # Author: 
Kenneth Benzie (Benie)
-# Date: Tue Feb 4 13:14:19 2025 +0000
-# Merge pull request #2614 from kurapov-peter/spills
-# Add UR_KERNEL_INFO_SPILL_MEM_SIZE kernel info prop
-set(UNIFIED_RUNTIME_TAG 08d36b76a5b1c4f080e3301507a39525ab5ab365)
+# Date: Tue Feb 4 15:45:49 2025 +0000
+# Merge pull request #2618 from winstonzhang-intel/max_eu_count_calculation
+# [L0] MAX_COMPUTE_UNITS using ze_eu_count_ext_t
+set(UNIFIED_RUNTIME_TAG "ewan/native_command")
diff --git a/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc
index 143d8f3fd4521..7f64a5b4e01f7 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc
@@ -173,12 +173,82 @@ dependencies are satisfied.
 The SYCL command described above completes once all of the native
 asynchronous tasks it contains have completed.
 
-The call to `interopCallable` must not submit any synchronous tasks to the
+The call to `interopCallable` should not submit any synchronous tasks to the
 native backend object, and it must not block waiting for any tasks to
 complete. The call also must not add tasks to backend objects that underly
 any other queue, aside from the queue that is associated with this handler.
 If it does any of these things, the behavior is undefined.
 
+=== sycl_ext_oneapi_graph Interaction
+
+`ext_codeplay_enqueue_native_command` can be used in the
+link:../experimental/sycl_ext_oneapi_graph.asciidoc[sycl_ext_oneapi_graph]
+extension as a graph node. The `interopCallable` object will be invoked
+during `command_graph::finalize()` when the backend object for the graph
+is available to give to the user as a handle. The user may then
+add nodes to this backend graph object using native APIs. Note that this
+involves a synchronous API call to a native backend object, which is an
+exception to earlier advice about submitting synchronous tasks to native
+backend objects inside `interopCallable`.
+
+The runtime will schedule the dependencies of the user-added nodes such
+that they respect the graph node edges.
+
+=== Additions to the interop_handle class
+
+TODO: Document backend return types, move this info to the main graphs spec,
+      define interaction with host-task.
+
+* CUgraph
+* hipGraph_t
+* ze_command_list_handle_t
+* cl_command_buffer_khr
+
+
+
+```c++
+using graph = ext::oneapi::experimental::command_graph<
+    ext::oneapi::experimental::graph_state::executable>;
+
+class interop_handle {
+  bool ext_oneapi_has_graph() const;
+
+  template <backend Backend>
+  backend_return_t<Backend, graph> ext_oneapi_get_native_graph() const;
+
+};
+```
+
+Table {counter: tableNumber}. Additional member functions of the `sycl::interop_handle` class.
+[cols="2a,a"]
+|===
+|Member function|Description
+
+|
+[source,c++]
+----
+bool interop_handle::ext_oneapi_has_graph() const;
+----
+
+| Query if the `interop_handle` object has a native graph object available.
+
+|
+[source,c++]
+----
+template <backend Backend>
+backend_return_t<Backend, graph> interop_handle::ext_oneapi_get_native_graph() const;
+----
+
+| Return the native graph object associated with the `interop_handle`.
+
+Exceptions:
+
+* Throws with error code `invalid` if there is no native graph object
+  associated with the interop handle.
+ +|=== + + == Example This example demonstrates how to use this extension to enqueue asynchronous @@ -206,12 +276,3 @@ q.submit([&](sycl::handler &cgh) { }); q.wait(); ``` - -== Issues - -=== sycl_ext_oneapi_graph - -`ext_codeplay_enqueue_native_command` -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. - diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 649c00fb474b5..2171eda82d96e 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -2077,13 +2077,9 @@ extensions. ==== sycl_ext_codeplay_enqueue_native_command -`ext_codeplay_enqueue_native_command`, defined in +`ext_codeplay_enqueue_native_command` commands, defined in link:../experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc[sycl_ext_codeplay_enqueue_native_command] -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. - -Removing this restriction is something we may look at for future revisions of -`sycl_ext_oneapi_graph`. +can be used in graph nodes. See the section on `sycl_ext_oneapi_graph`. ==== sycl_ext_intel_queue_index diff --git a/sycl/include/sycl/detail/backend_traits_cuda.hpp b/sycl/include/sycl/detail/backend_traits_cuda.hpp index 89bef47d01a4b..0f9f987f62ce4 100644 --- a/sycl/include/sycl/detail/backend_traits_cuda.hpp +++ b/sycl/include/sycl/detail/backend_traits_cuda.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include typedef int CUdevice; @@ -24,6 +25,7 @@ typedef struct CUctx_st *CUcontext; typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; typedef struct CUmod_st *CUmodule; +typedef struct CUgraph_st *CUgraph; // As defined in the CUDA 10.1 header file. 
This requires CUDA version > 3.2 #if defined(_WIN64) || defined(__LP64__) @@ -102,6 +104,16 @@ template <> struct BackendReturn { using type = CUstream; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable>; +template <> struct BackendInput { + using type = CUgraph; +}; + +template <> struct BackendReturn { + using type = CUgraph; +}; + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/detail/backend_traits_hip.hpp b/sycl/include/sycl/detail/backend_traits_hip.hpp index 9f7cbb2bfdd91..521ba328b7a9a 100644 --- a/sycl/include/sycl/detail/backend_traits_hip.hpp +++ b/sycl/include/sycl/detail/backend_traits_hip.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include typedef int HIPdevice; @@ -25,6 +26,8 @@ typedef struct ihipStream_t *HIPstream; typedef struct ihipEvent_t *HIPevent; typedef struct ihipModule_t *HIPmodule; typedef void *HIPdeviceptr; +typedef struct ihipGraph *HIPGraph; +typedef struct hipGraphNode *HIPGraphNode; namespace sycl { inline namespace _V1 { @@ -96,6 +99,16 @@ template <> struct BackendReturn { using type = HIPstream; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable>; +template <> struct BackendInput { + using type = HIPGraph; +}; + +template <> struct BackendReturn { + using type = HIPGraph; +}; + template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = false; static constexpr bool MakeDevice = true; diff --git a/sycl/include/sycl/detail/backend_traits_level_zero.hpp b/sycl/include/sycl/detail/backend_traits_level_zero.hpp index f2220c863c123..e1947ffafe326 100644 --- a/sycl/include/sycl/detail/backend_traits_level_zero.hpp +++ b/sycl/include/sycl/detail/backend_traits_level_zero.hpp @@ -20,13 +20,14 @@ #include // for device #include // for event #include // for ownership -#include // for image -#include // for kernel -#include // for kernel_b... -#include // for bundle_s... -#include // for platform -#include // for property... -#include // for range +#include +#include // for image +#include // for kernel +#include // for kernel_b... +#include // for bundle_s... +#include // for platform +#include // for property... 
+#include // for range #include // for variant #include // for vector @@ -207,6 +208,16 @@ template <> struct BackendReturn { using type = ze_kernel_handle_t; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable>; +template <> struct BackendInput { + using type = ze_command_list_handle_t; +}; + +template <> struct BackendReturn { + using type = ze_command_list_handle_t; +}; + template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = true; static constexpr bool MakeDevice = true; diff --git a/sycl/include/sycl/detail/backend_traits_opencl.hpp b/sycl/include/sycl/detail/backend_traits_opencl.hpp index b203013c0e903..01f06b38867a7 100644 --- a/sycl/include/sycl/detail/backend_traits_opencl.hpp +++ b/sycl/include/sycl/detail/backend_traits_opencl.hpp @@ -21,9 +21,10 @@ #include // for assertion and ur handles #include // for device #include // for event -#include // for kernel -#include // for bundle_state -#include // for platform +#include +#include // for kernel +#include // for bundle_state +#include // for platform #include // for vector @@ -132,6 +133,16 @@ template <> struct BackendReturn { using type = cl_kernel; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable>; +template <> struct BackendInput { + using type = cl_command_buffer_khr; +}; + +template <> struct BackendReturn { + using type = cl_command_buffer_khr; +}; + template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = true; static constexpr bool MakeDevice = true; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index e2e87c30ea945..6401bdbbab463 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -114,7 +114,8 @@ enum class node_type { prefetch = 6, memadvise = 7, ext_oneapi_barrier = 8, - host_task = 9 + host_task = 9, + native_command = 10 }; /// Class representing a node in the graph, returned by command_graph::add(). diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index a76d6002d9d87..705c9c520dec4 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1934,9 +1934,6 @@ class __SYCL_EXPORT handler { void(interop_handle)>::value> ext_codeplay_enqueue_native_command([[maybe_unused]] FuncT &&Func) { #ifndef __SYCL_DEVICE_ONLY__ - throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_codeplay_enqueue_native_command>(); ext_codeplay_enqueue_native_command_impl(Func); #endif } diff --git a/sycl/include/sycl/interop_handle.hpp b/sycl/include/sycl/interop_handle.hpp index 2e7408cf5c0f9..ea2f2bb9bcf09 100644 --- a/sycl/include/sycl/interop_handle.hpp +++ b/sycl/include/sycl/interop_handle.hpp @@ -16,6 +16,7 @@ #include // for getSyclObjImpl #include #include // for accessor_property_list +#include #include // for image #include // for ur_mem_handle_t, ur... @@ -49,6 +50,9 @@ class interop_handle { /// interop_handle. __SYCL_EXPORT backend get_backend() const noexcept; + /// Returns true if command-group is being added to a graph as a node + __SYCL_EXPORT bool ext_oneapi_has_graph() const noexcept; + /// Receives a SYCL accessor that has been defined as a requirement for the /// command group, and returns the underlying OpenCL memory object that is /// used by the SYCL runtime. 
If the accessor passed as parameter is not part @@ -134,6 +138,26 @@ class interop_handle { #endif } + using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable>; + template + backend_return_t ext_oneapi_get_native_graph() const { +#ifndef __SYCL_DEVICE_ONLY__ + // TODO: replace the exception thrown below with the SYCL 2020 exception + // with the error code 'errc::backend_mismatch' when those new exceptions + // are ready to be used. + if (Backend != get_backend()) + throw exception(make_error_code(errc::invalid), + "Incorrect backend argument was passed"); + + // C-style cast required to allow various native types + return (backend_return_t)getNativeGraph(); +#else + // we believe this won't be ever called on device side + return 0; +#endif + } + /// Returns the SYCL application interoperability native backend object /// associated with the device associated with the SYCL queue that the host /// task was submitted to. The native backend object returned must be in @@ -186,8 +210,9 @@ class interop_handle { interop_handle(std::vector MemObjs, const std::shared_ptr &Queue, const std::shared_ptr &Device, - const std::shared_ptr &Context) - : MQueue(Queue), MDevice(Device), MContext(Context), + const std::shared_ptr &Context, + const ur_exp_command_buffer_handle_t &Graph) + : MQueue(Queue), MDevice(Device), MContext(Context), MGraph(Graph), MMemObjs(std::move(MemObjs)) {} template @@ -211,10 +236,12 @@ class interop_handle { getNativeQueue(int32_t &NativeHandleDesc) const; __SYCL_EXPORT ur_native_handle_t getNativeDevice() const; __SYCL_EXPORT ur_native_handle_t getNativeContext() const; + __SYCL_EXPORT ur_native_handle_t getNativeGraph() const; std::shared_ptr MQueue; std::shared_ptr MDevice; std::shared_ptr MContext; + ur_exp_command_buffer_handle_t MGraph; std::vector MMemObjs; }; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5f9f79d878d03..c940b35c81448 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -824,6 +824,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode( std::shared_ptr Node) { // Queue which will be used for allocation operations for accessors. 
+ // Will also be used in native commands to return to the user in + // `interop_handler::get_native_queue()` calls auto AllocaQueue = std::make_shared( DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{}, sycl::property_list{}); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index e609123b4f285..f5d960b2d15e6 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -71,6 +71,8 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { return node_type::host_task; case sycl::detail::CGType::ExecCommandBuffer: return node_type::subgraph; + case sycl::detail::CGType::EnqueueNativeCommand: + return node_type::native_command; default: assert(false && "Invalid Graph Node Type"); return node_type::empty; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 005008a74ebd0..d92077084fc2b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -454,7 +454,7 @@ class DispatchHostTask { "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), - HostTask.MQueue->getContextImplPtr()}; + HostTask.MQueue->getContextImplPtr(), nullptr}; // TODO: should all the backends that support this entry point use this // for host task? auto &Queue = HostTask.MQueue; @@ -2879,6 +2879,19 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, return Error; } +namespace { + +struct CommandBufferNativeCommandData { + sycl::interop_handle ih; + std::function func; +}; + +void CommandBufferInteropFreeFunc(void *InteropData) { + auto *Data = reinterpret_cast(InteropData); + return Data->func(Data->ih); +} +} // namespace + ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies @@ -3045,6 +3058,55 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } + case CGType::EnqueueNativeCommand: { + // Queue is created by graph_impl before creating command to submit to + // scheduler. + const AdapterPtr &Adapter = MQueue->getAdapter(); + const auto Backend = MQueue->get_device().get_backend(); + CGHostTask *HostTask = (CGHostTask *)MCommandGroup.get(); + + // TODO - Doc this + ur_exp_command_buffer_handle_t ChildCommandBuffer = nullptr; + if (Backend == sycl::backend::ext_oneapi_cuda || + Backend == sycl::backend::ext_oneapi_hip) { + + ur_exp_command_buffer_desc_t Desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC /*stype*/, + nullptr /*pnext*/, false /* updatable */, false /* in-order */, + false /* profilable*/ + }; + auto ContextImpl = sycl::detail::getSyclObjImpl(MQueue->get_context()); + auto DeviceImpl = sycl::detail::getSyclObjImpl(MQueue->get_device()); + Adapter->call( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc, + &ChildCommandBuffer); + } + + std::vector ReqToMem; // TODO work with buffers + interop_handle IH{ReqToMem, HostTask->MQueue, + HostTask->MQueue->getDeviceImplPtr(), + HostTask->MQueue->getContextImplPtr(), + ChildCommandBuffer ? ChildCommandBuffer : MCommandBuffer}; + CommandBufferNativeCommandData CustomOpData{ + IH, HostTask->MHostTask->MInteropTask}; + + Adapter->call( + MCommandBuffer, CommandBufferInteropFreeFunc, &CustomOpData, + ChildCommandBuffer, MSyncPointDeps.size(), + MSyncPointDeps.empty() ? 
nullptr : MSyncPointDeps.data(), + &OutSyncPoint); + + if (ChildCommandBuffer) { + ur_result_t Res = Adapter->call_nocheck< + sycl::detail::UrApiKind::urCommandBufferReleaseExp>( + ChildCommandBuffer); + (void)Res; + assert(Res == UR_RESULT_SUCCESS); + } + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } default: throw exception(make_error_code(errc::runtime), @@ -3416,7 +3478,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { EnqueueNativeCommandData CustomOpData{ interop_handle{ReqToMem, HostTask->MQueue, HostTask->MQueue->getDeviceImplPtr(), - HostTask->MQueue->getContextImplPtr()}, + HostTask->MQueue->getContextImplPtr(), nullptr}, HostTask->MHostTask->MInteropTask}; ur_bool_t NativeCommandSupport = false; diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index aabf22702ef5f..91f1e3f3f032f 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -23,6 +23,10 @@ backend interop_handle::get_backend() const noexcept { return detail::getImplBackend(MQueue); } +bool interop_handle::ext_oneapi_has_graph() const noexcept { + return MGraph != nullptr; +} + ur_native_handle_t interop_handle::getNativeMem(detail::Requirement *Req) const { auto Iter = std::find_if(std::begin(MMemObjs), std::end(MMemObjs), @@ -53,5 +57,17 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const { return MQueue->getNative(NativeHandleDesc); } +ur_native_handle_t interop_handle::getNativeGraph() const { + if (!MGraph) { + throw exception(make_error_code(errc::invalid), + "Command-Group is not being added as a graph node"); + } + + auto Adapter = MQueue->getAdapter(); + ur_native_handle_t Handle; + Adapter->call(MGraph, + &Handle); + return Handle; +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/NativeCommand/lit.local.cfg b/sycl/test-e2e/Graph/NativeCommand/lit.local.cfg new file mode 100644 index 0000000000000..f01e2216db41b --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/lit.local.cfg @@ -0,0 +1 @@ +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp new file mode 100644 index 0000000000000..6493b9ff49cf9 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda, cuda_dev_kit + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + CUgraph NativeGraph = + IH.ext_oneapi_get_native_graph(); + + CUDA_MEMCPY3D Params; + std::memset(&Params, 0, sizeof(CUDA_MEMCPY3D)); + Params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + Params.srcDevice = (CUdeviceptr)PtrX; + Params.srcHost = nullptr; + Params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + Params.dstDevice = (CUdeviceptr)PtrY, Params.dstHost = nullptr; + Params.WidthInBytes = Size * sizeof(int); + 
Params.Height = 1; + Params.Depth = 1; + + CUgraphNode Node; + CUcontext Context = IH.get_native_context(); + auto Res = cuGraphAddMemcpyNode(&Node, NativeGraph, nullptr, 0, &Params, + Context); + assert(Res == CUDA_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp new file mode 100644 index 0000000000000..df41d31539015 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp @@ -0,0 +1,114 @@ +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda, cuda_dev_kit + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + buffer BufX{Size}; + BufX.set_write_back(false); + buffer BufY{Size}; + BufY.set_write_back(false); + + { + + exp_ext::command_graph Graph{ + Queue, {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccX[i] = i; + AccY[i] = 0; + } + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with cuGraphCreate + CUgraph NativeGraph = + IH.ext_oneapi_get_native_graph(); + + auto PtrX = IH.get_native_mem(AccX); + auto PtrY = IH.get_native_mem(AccY); + + // Start stream capture + // After CUDA 12.3 we can use cuStreamBeginCaptureToGraph to capture + // the stream directly in the native graph, rather than needing to + // instantiate the stream capture as a new graph. +#if CUDA_VERSION >= 12030 + auto Res = cuStreamBeginCaptureToGraph(NativeStream, NativeGraph, + nullptr, nullptr, 0, + CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); +#else + // Start stream capture + auto Res = + cuStreamBeginCapture(NativeStream, CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); +#endif + + // Add memcopy node + Res = cuMemcpyAsync(PtrY, PtrX, Size * sizeof(int), NativeStream); + assert(Res == CUDA_SUCCESS); + +#if CUDA_VERSION >= 12030 + Res = cuStreamEndCapture(NativeStream, &NativeGraph); + assert(Res == CUDA_SUCCESS); +#else + // cuStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + CUgraph RecordedGraph; + Res = cuStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == CUDA_SUCCESS); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. 
+ CUgraphNode Node; + cuGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, RecordedGraph); + assert(Res == CUDA_SUCCESS); +#endif + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccY = BufY.get_access(); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { AccY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + } + + auto HostAcc = BufY.get_host_access(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostAcc[i], + std::string("HostAcc at index ") + std::to_string(i))); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp new file mode 100644 index 0000000000000..91c95b95be2ae --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp @@ -0,0 +1,106 @@ +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda, cuda_dev_kit + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with cuGraphCreate + CUgraph NativeGraph = + IH.ext_oneapi_get_native_graph(); + + // Start stream capture + // After CUDA 12.3 we can use cuStreamBeginCaptureToGraph to capture + // the stream directly in the native graph, rather than needing to + // instantiate the stream capture as a new graph. +#if CUDA_VERSION >= 12030 + auto Res = cuStreamBeginCaptureToGraph(NativeStream, NativeGraph, nullptr, + nullptr, 0, + CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); +#else + auto Res = + cuStreamBeginCapture(NativeStream, CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); +#endif + + // Add memcopy node + Res = cuMemcpyAsync((CUdeviceptr)PtrY, (CUdeviceptr)PtrX, + Size * sizeof(int), NativeStream); + assert(Res == CUDA_SUCCESS); + +#if CUDA_VERSION >= 12030 + Res = cuStreamEndCapture(NativeStream, &NativeGraph); + assert(Res == CUDA_SUCCESS); +#else + // cuStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + CUgraph RecordedGraph; + Res = cuStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == CUDA_SUCCESS); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. 
+      CUgraphNode Node;
+      cuGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, RecordedGraph);
+      assert(Res == CUDA_SUCCESS);
+#endif
+    });
+  });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.depends_on(EventB);
+    CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  std::vector HostData(Size);
+
+  Queue.copy(PtrY, HostData.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    const int Ref = i * 2;
+    assert(check_value(Ref, HostData[i],
+                       std::string("HostData at index ") + std::to_string(i)));
+  }
+
+  free(PtrX, Queue);
+  free(PtrY, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp
new file mode 100644
index 0000000000000..15b5cfb053279
--- /dev/null
+++ b/sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp
@@ -0,0 +1,75 @@
+// FIXME: the rocm include path and link path are highly platform dependent,
+// we should set this with some variable instead.
+// RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64
+// RUN: %{run} %t.out
+// REQUIRES: target-amd
+
+#include "../graph_common.hpp"
+#include
+#include
+
+#define __HIP_PLATFORM_AMD__
+#include
+
+int main() {
+  queue Queue;
+
+  int *PtrX = malloc_device(Size, Queue);
+  int *PtrY = malloc_device(Size, Queue);
+
+  exp_ext::command_graph Graph{Queue};
+
+  Graph.begin_recording(Queue);
+
+  auto EventA = Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < Size; i++) {
+        PtrX[i] = i;
+        PtrY[i] = 0;
+      }
+    });
+  });
+
+  auto EventB = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(EventA);
+
+    CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) {
+      if (!IH.ext_oneapi_has_graph()) {
+        assert(false && "Native Handle should have a graph");
+      }
+      // Graph already created with hipGraphCreate
+      HIPGraph NativeGraph =
+          IH.ext_oneapi_get_native_graph();
+
+      HIPGraphNode Node;
+      auto Res = hipGraphAddMemcpyNode1D(&Node, NativeGraph, nullptr, 0, PtrY,
+          PtrX, Size * sizeof(int), hipMemcpyDefault);
+
+      assert(Res == hipSuccess);
+    });
+  });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.depends_on(EventB);
+    CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  std::vector HostData(Size);
+
+  Queue.copy(PtrY, HostData.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    const int Ref = i * 2;
+    assert(check_value(Ref, HostData[i],
+                       std::string("HostData at index ") + std::to_string(i)));
+  }
+
+  free(PtrX, Queue);
+  free(PtrY, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp
new file mode 100644
index 0000000000000..e65850e9b8e10
--- /dev/null
+++ b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp
@@ -0,0 +1,119 @@
+// FIXME: the rocm include path and link path are highly platform dependent,
+// we should set this with some variable instead.
+// RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: target-amd + +#include "../graph_common.hpp" +#include +#include + +#define __HIP_PLATFORM_AMD__ +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + buffer BufX{Size}; + BufX.set_write_back(false); + buffer BufY{Size}; + BufY.set_write_back(false); + + { + + exp_ext::command_graph Graph{ + Queue, {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccX[i] = i; + AccY[i] = 0; + } + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with hipGraphCreate + HIPGraph NativeGraph = + IH.ext_oneapi_get_native_graph(); + + auto PtrX = IH.get_native_mem(AccX); + auto PtrY = IH.get_native_mem(AccY); + + // Start stream capture + // After HIP 6.2 we can use hipStreamBeginCaptureToGraph to capture + // the stream directly in the native graph, rather than needing to + // instantiate the stream capture as a new graph. +#if HIP_VERSION + auto Res = + hipStreamBeginCapture(NativeStream, NativeGraph, nullptr, nullptr, + 0, hipStreamCaptureModeGlobal); + assert(Res == hipSuccess); +#else + auto Res = + hipStreamBeginCapture(NativeStream, hipStreamCaptureModeGlobal); + assert(Res == hipSuccess); +#endif + + // Add memcopy node + Res = hipMemcpyWithStream(PtrY, PtrX, sizeof(int) * Size, + hipMemcpyDefault, NativeStream); + assert(Res == hipSuccess); + +#if HIP_VERSION + Res = hipStreamEndCapture(NativeStream, &NativeGraph); + assert(Res == hipSuccess); +#else + // hipStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + HIPGraph RecordedGraph; + Res = hipStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == hipSuccess); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + HIPGraphNode Node; + hipGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, + RecordedGraph); + assert(Res == hipSuccess); +#endif + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccY = BufY.get_access(); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { AccY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + } + + auto HostAcc = BufY.get_host_access(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostAcc[i], + std::string("HostAcc at index ") + std::to_string(i))); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp new file mode 100644 index 0000000000000..6e85deae957d0 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp @@ -0,0 +1,109 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. 
+// RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: target-amd + +#include "../graph_common.hpp" +#include +#include + +#define __HIP_PLATFORM_AMD__ +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with hipGraphCreate + HIPGraph NativeGraph = + IH.ext_oneapi_get_native_graph(); + + // Start stream capture + // After HIP 6.2 we can use hipStreamBeginCaptureToGraph to capture + // the stream directly in the native graph, rather than needing to + // instantiate the stream capture as a new graph. +#if HIP_VERSION + auto Res = hipStreamBeginCapture(NativeStream, NativeGraph, nullptr, + nullptr, 0, hipStreamCaptureModeGlobal); + assert(Res == hipSuccess); +#else + auto Res = + hipStreamBeginCapture(NativeStream, hipStreamCaptureModeGlobal); + assert(Res == hipSuccess); +#endif + + // Add memcopy node + Res = hipMemcpyWithStream(PtrY, PtrX, sizeof(int) * Size, + hipMemcpyDefault, NativeStream); + assert(Res == hipSuccess); + +#if HIP_VERSION + Res = hipStreamEndCapture(NativeStream, &NativeGraph); + assert(Res == hipSuccess); +#else + // hipStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + HIPGraph RecordedGraph; + Res = hipStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == hipSuccess); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. 
+ HIPGraphNode Node; + hipGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, RecordedGraph); + assert(Res == hipSuccess); +#endif + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp new file mode 100644 index 0000000000000..a78b3cf5ebcd5 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp @@ -0,0 +1,68 @@ +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out +// REQUIRES: level_zero, level_zero_dev_kit + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + ze_command_list_handle_t NativeGraph = + IH.ext_oneapi_get_native_graph(); + + auto Res = zeCommandListAppendMemoryCopy( + NativeGraph, PtrY, PtrX, Size * sizeof(int), nullptr, 0, nullptr); + assert(Res == ZE_RESULT_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp b/sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp new file mode 100644 index 0000000000000..343a0a5cce47f --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp @@ -0,0 +1,85 @@ +// RUN: %{build} -Wno-error=deprecated-declarations -o %t.out %threads_lib %opencl_lib +// RUN: %{run} %t.out +// REQUIRES: opencl + +#include "../graph_common.hpp" +#include +#include + +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + buffer BufX{Size}; + BufX.set_write_back(false); + buffer BufY{Size}; + BufY.set_write_back(false); + + { + + exp_ext::command_graph Graph{ + Queue, {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccX[i] = i; + 
AccY[i] = 0; + } + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_oneapi_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + auto Platform = + get_native(Queue.get_context().get_platform()); + clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = + reinterpret_cast( + clGetExtensionFunctionAddressForPlatform( + Platform, "clCommandCopyBufferKHR")); + assert(clCommandCopyBufferKHR != nullptr); + + cl_command_buffer_khr NativeGraph = + IH.ext_oneapi_get_native_graph(); + auto SrcBuffer = IH.get_native_mem(AccX); + auto DstBuffer = IH.get_native_mem(AccY); + + auto Res = clCommandCopyBufferKHR( + NativeGraph, nullptr, nullptr, SrcBuffer[0], DstBuffer[0], 0, 0, + Size * sizeof(int), 0, nullptr, nullptr, nullptr); + assert(Res == CL_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccY = BufY.get_access(); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { AccY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + } + + auto HostAcc = BufY.get_host_access(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostAcc[i], + std::string("HostAcc at index ") + std::to_string(i))); + } + + return 0; +} diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 452738b8fca86..ddb9236e5047e 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -380,19 +380,6 @@ TEST_F(CommandGraphTest, BindlessExceptionCheck) { sycl::free(ImgMemUSM, Ctxt); } -// ext_codeplay_enqueue_native_command isn't supported with SYCL graphs -TEST_F(CommandGraphTest, EnqueueCustomCommandCheck) { - std::error_code ExceptionCode = make_error_code(sycl::errc::success); - try { - Graph.add([&](sycl::handler &CGH) { - CGH.ext_codeplay_enqueue_native_command([=](sycl::interop_handle IH) {}); - }); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); -} - // sycl_ext_oneapi_work_group_scratch_memory isn't supported with SYCL graphs TEST_F(CommandGraphTest, WorkGroupScratchMemoryCheck) { ASSERT_THROW(
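
For reference, the application-level pattern introduced by the spec, header, and test changes above condenses to the sketch below. It is illustrative only and not part of the patch: it assumes the Level Zero backend, where the native graph object defined in `backend_traits_level_zero.hpp` is a `ze_command_list_handle_t`, and the helper name `append_native_copy` is invented for the example.

```c++
#include <level_zero/ze_api.h>
#include <sycl/sycl.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

// Builds, finalizes, and runs a SYCL graph whose native-command node appends
// a Level Zero memory copy of Count ints from Src to Dst.
void append_native_copy(sycl::queue &Queue, int *Dst, const int *Src,
                        size_t Count) {
  exp_ext::command_graph Graph{Queue};
  Graph.begin_recording(Queue);

  Queue.submit([&](sycl::handler &CGH) {
    CGH.ext_codeplay_enqueue_native_command([=](sycl::interop_handle IH) {
      // During command_graph::finalize() a native graph object is available
      // instead of a native queue to submit work to.
      if (!IH.ext_oneapi_has_graph())
        return;
      ze_command_list_handle_t NativeGraph =
          IH.ext_oneapi_get_native_graph<sycl::backend::ext_oneapi_level_zero>();
      // Work appended here becomes a node of the SYCL graph being finalized.
      zeCommandListAppendMemoryCopy(NativeGraph, Dst, Src, Count * sizeof(int),
                                    nullptr, 0, nullptr);
    });
  });

  Graph.end_recording();
  auto Exec = Graph.finalize();
  Queue.ext_oneapi_graph(Exec).wait();
}
```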